KVM: nVMX: reset cache/shadows when switching loaded VMCS
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 533a327..36d6025 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -121,7 +121,6 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
 
 #define MSR_BITMAP_MODE_X2APIC         1
 #define MSR_BITMAP_MODE_X2APIC_APICV   2
-#define MSR_BITMAP_MODE_LM             4
 
 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 
@@ -132,7 +131,7 @@ static bool __read_mostly enable_preemption_timer = 1;
 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 #endif
 
-#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
 #define KVM_VM_CR0_ALWAYS_ON                           \
        (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |      \
@@ -397,6 +396,7 @@ struct loaded_vmcs {
        int cpu;
        bool launched;
        bool nmi_known_unmasked;
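+       /* Tracks whether the VMX preemption timer is armed in this VMCS. */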
+       bool hv_timer_armed;
        /* Support for vnmi-less CPUs */
        int soft_vnmi_blocked;
        ktime_t entry_time;
@@ -856,6 +856,7 @@ struct nested_vmx {
 
        /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
        u64 vmcs01_debugctl;
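+       /* to migrate it to L2 if VM_ENTRY_LOAD_BNDCFGS is off */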
+       u64 vmcs01_guest_bndcfgs;
 
        u16 vpid02;
        u16 last_vpid;
@@ -1019,6 +1020,8 @@ struct vcpu_vmx {
        int ple_window;
        bool ple_window_dirty;
 
+       bool req_immediate_exit;
+
        /* Support for PML */
 #define PML_ENTITY_NUM         512
        struct page *pml_pg;
@@ -1610,11 +1613,6 @@ static inline bool is_page_fault(u32 intr_info)
        return is_exception_n(intr_info, PF_VECTOR);
 }
 
-static inline bool is_no_device(u32 intr_info)
-{
-       return is_exception_n(intr_info, NM_VECTOR);
-}
-
 static inline bool is_invalid_opcode(u32 intr_info)
 {
        return is_exception_n(intr_info, UD_VECTOR);
@@ -1625,12 +1623,6 @@ static inline bool is_gp_fault(u32 intr_info)
        return is_exception_n(intr_info, GP_VECTOR);
 }
 
-static inline bool is_external_interrupt(u32 intr_info)
-{
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
-               == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
 static inline bool is_machine_check(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -2864,6 +2856,8 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
        u16 fs_sel, gs_sel;
        int i;
 
+       vmx->req_immediate_exit = false;
+
        if (vmx->loaded_cpu_state)
                return;
 
@@ -2894,8 +2888,7 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
                vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
        }
 
-       if (is_long_mode(&vmx->vcpu))
-               wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+       wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 #else
        savesegment(fs, fs_sel);
        savesegment(gs, gs_sel);
@@ -2946,8 +2939,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
        vmx->loaded_cpu_state = NULL;
 
 #ifdef CONFIG_X86_64
-       if (is_long_mode(&vmx->vcpu))
-               rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+       rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 #endif
        if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
                kvm_load_ldt(host_state->ldt_sel);
@@ -2975,24 +2967,19 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
 {
-       if (is_long_mode(&vmx->vcpu)) {
-               preempt_disable();
-               if (vmx->loaded_cpu_state)
-                       rdmsrl(MSR_KERNEL_GS_BASE,
-                              vmx->msr_guest_kernel_gs_base);
-               preempt_enable();
-       }
+       preempt_disable();
+       if (vmx->loaded_cpu_state)
+               rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+       preempt_enable();
        return vmx->msr_guest_kernel_gs_base;
 }
 
 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
 {
-       if (is_long_mode(&vmx->vcpu)) {
-               preempt_disable();
-               if (vmx->loaded_cpu_state)
-                       wrmsrl(MSR_KERNEL_GS_BASE, data);
-               preempt_enable();
-       }
+       preempt_disable();
+       if (vmx->loaded_cpu_state)
+               wrmsrl(MSR_KERNEL_GS_BASE, data);
+       preempt_enable();
        vmx->msr_guest_kernel_gs_base = data;
 }
 #endif
@@ -3292,10 +3279,13 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
                }
        } else {
                if (vmcs12->exception_bitmap & (1u << nr)) {
-                       if (nr == DB_VECTOR)
+                       if (nr == DB_VECTOR) {
                                *exit_qual = vcpu->arch.dr6;
-                       else
+                               *exit_qual &= ~(DR6_FIXED_1 | DR6_BT);
+                               *exit_qual ^= DR6_RTM;
+                       } else {
                                *exit_qual = 0;
+                       }
                        return 1;
                }
        }
@@ -3528,9 +3518,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
 
-       if (kvm_mpx_supported())
-               msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
-
        /* We support free control of debug control saving. */
        msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
 
@@ -3547,8 +3534,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
                VM_ENTRY_LOAD_IA32_PAT;
        msrs->entry_ctls_high |=
                (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
-       if (kvm_mpx_supported())
-               msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 
        /* We support free control of debug control loading. */
        msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
@@ -3596,12 +3581,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
                msrs->secondary_ctls_high);
        msrs->secondary_ctls_low = 0;
        msrs->secondary_ctls_high &=
-               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                SECONDARY_EXEC_DESC |
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
                SECONDARY_EXEC_APIC_REGISTER_VIRT |
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                SECONDARY_EXEC_WBINVD_EXITING;
+
        /*
         * We can emulate "VMCS shadowing," even if the hardware
         * doesn't support it.
@@ -3658,6 +3643,10 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
                msrs->secondary_ctls_high |=
                        SECONDARY_EXEC_UNRESTRICTED_GUEST;
 
+       if (flexpriority_enabled)
+               msrs->secondary_ctls_high |=
+                       SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
        /* miscellaneous data */
        rdmsr(MSR_IA32_VMX_MISC,
                msrs->misc_low,
@@ -5068,19 +5057,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
        if (!msr)
                return;
 
-       /*
-        * MSR_KERNEL_GS_BASE is not intercepted when the guest is in
-        * 64-bit mode as a 64-bit kernel may frequently access the
-        * MSR.  This means we need to manually save/restore the MSR
-        * when switching between guest and host state, but only if
-        * the guest is in 64-bit mode.  Sync our cached value if the
-        * guest is transitioning to 32-bit mode and the CPU contains
-        * guest state, i.e. the cache is stale.
-        */
-#ifdef CONFIG_X86_64
-       if (!(efer & EFER_LMA))
-               (void)vmx_read_guest_kernel_gs_base(vmx);
-#endif
        vcpu->arch.efer = efer;
        if (efer & EFER_LMA) {
                vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
@@ -5276,7 +5252,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long hw_cr0;
 
-       hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
+       hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
        if (enable_unrestricted_guest)
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
        else {
@@ -5393,9 +5369,10 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                 * To use VMXON (and later other VMX instructions), a guest
                 * must first be able to turn on cr4.VMXE (see handle_vmon()).
                 * So basically the check on whether to allow nested VMX
-                * is here.
+                * is here.  We operate under the default treatment of SMM,
+                * so VMX cannot be enabled under SMM.
                 */
-               if (!nested_vmx_allowed(vcpu))
+               if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
                        return 1;
        }
 
@@ -6072,9 +6049,6 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
                        mode |= MSR_BITMAP_MODE_X2APIC_APICV;
        }
 
-       if (is_long_mode(vcpu))
-               mode |= MSR_BITMAP_MODE_LM;
-
        return mode;
 }
 
@@ -6115,9 +6089,6 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
        if (!changed)
                return;
 
-       vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
-                                 !(mode & MSR_BITMAP_MODE_LM));
-
        if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
                vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
 
@@ -6183,6 +6154,32 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
        nested_mark_vmcs12_pages_dirty(vcpu);
 }
 
+static u8 vmx_get_rvi(void)
+{
+       return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
+}
+
+static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       void *vapic_page;
+       u32 vppr;
+       int rvi;
+
+       if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
+               !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+               WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
+               return false;
+
+       rvi = vmx_get_rvi();
+
+       vapic_page = kmap(vmx->nested.virtual_apic_page);
+       vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
+       kunmap(vmx->nested.virtual_apic_page);
+
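+       /*
+        * A pending virtual interrupt is deliverable only if the priority
+        * class (bits 7:4) of RVI exceeds that of the virtual PPR.
+        */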
+       return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
                                                     bool nested)
 {
@@ -7966,6 +7963,9 @@ static __init int hardware_setup(void)
                kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
        }
 
+       if (!cpu_has_vmx_preemption_timer())
+               kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
+
        if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
                u64 vmx_msr;
 
@@ -8995,6 +8995,13 @@ static int handle_invept(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
+static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
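+       /* Fall back to vmx->vpid if no dedicated vpid02 was allocated for L2. */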
+       return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
+}
+
 static int handle_invvpid(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -9006,6 +9013,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                u64 vpid;
                u64 gla;
        } operand;
+       u16 vpid02;
 
        if (!(vmx->nested.msrs.secondary_ctls_high &
              SECONDARY_EXEC_ENABLE_VPID) ||
@@ -9045,6 +9053,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                return kvm_skip_emulated_instruction(vcpu);
        }
 
+       vpid02 = nested_get_vpid02(vcpu);
        switch (type) {
        case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
                if (!operand.vpid ||
@@ -9053,12 +9062,11 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
                        return kvm_skip_emulated_instruction(vcpu);
                }
-               if (cpu_has_vmx_invvpid_individual_addr() &&
-                   vmx->nested.vpid02) {
+               if (cpu_has_vmx_invvpid_individual_addr()) {
                        __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
-                               vmx->nested.vpid02, operand.gla);
+                               vpid02, operand.gla);
                } else
-                       __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+                       __vmx_flush_tlb(vcpu, vpid02, false);
                break;
        case VMX_VPID_EXTENT_SINGLE_CONTEXT:
        case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
@@ -9067,10 +9075,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
                        return kvm_skip_emulated_instruction(vcpu);
                }
-               __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+               __vmx_flush_tlb(vcpu, vpid02, false);
                break;
        case VMX_VPID_EXTENT_ALL_CONTEXT:
-               __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+               __vmx_flush_tlb(vcpu, vpid02, false);
                break;
        default:
                WARN_ON_ONCE(1);
@@ -9208,7 +9216,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
 
 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
 {
-       kvm_lapic_expired_hv_timer(vcpu);
+       if (!to_vmx(vcpu)->req_immediate_exit)
+               kvm_lapic_expired_hv_timer(vcpu);
        return 1;
 }
 
@@ -9639,9 +9648,6 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
                        return false;
                else if (is_page_fault(intr_info))
                        return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
-               else if (is_no_device(intr_info) &&
-                        !(vmcs12->guest_cr0 & X86_CR0_TS))
-                       return false;
                else if (is_debug(intr_info) &&
                         vcpu->guest_debug &
                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -10214,15 +10220,16 @@ static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
        if (!lapic_in_kernel(vcpu))
                return;
 
+       if (!flexpriority_enabled &&
+           !cpu_has_vmx_virtualize_x2apic_mode())
+               return;
+
        /* Postpone execution until vmcs01 is the current VMCS. */
        if (is_guest_mode(vcpu)) {
                to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
                return;
        }
 
-       if (!cpu_need_tpr_shadow(vcpu))
-               return;
-
        sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
        sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
@@ -10344,6 +10351,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
        return max_irr;
 }
 
+static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
+{
+       u8 rvi = vmx_get_rvi();
+       u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
+
+       return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
        if (!kvm_vcpu_apicv_active(vcpu))
@@ -10595,24 +10610,43 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
                                        msrs[i].host, false);
 }
 
-static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
+{
+       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
+       if (!vmx->loaded_vmcs->hv_timer_armed)
+               vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                             PIN_BASED_VMX_PREEMPTION_TIMER);
+       vmx->loaded_vmcs->hv_timer_armed = true;
+}
+
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 tscl;
        u32 delta_tsc;
 
-       if (vmx->hv_deadline_tsc == -1)
+       if (vmx->req_immediate_exit) {
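+               /*
+                * A timer value of zero guarantees a VM-exit before the
+                * first guest instruction executes.
+                */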
+               vmx_arm_hv_timer(vmx, 0);
                return;
+       }
 
-       tscl = rdtsc();
-       if (vmx->hv_deadline_tsc > tscl)
-               /* sure to be 32 bit only because checked on set_hv_timer */
-               delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
-                       cpu_preemption_timer_multi);
-       else
-               delta_tsc = 0;
+       if (vmx->hv_deadline_tsc != -1) {
+               tscl = rdtsc();
+               if (vmx->hv_deadline_tsc > tscl)
+                       /* set_hv_timer ensures the delta fits in 32-bits */
+                       delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+                               cpu_preemption_timer_multi);
+               else
+                       delta_tsc = 0;
 
-       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+               vmx_arm_hv_timer(vmx, delta_tsc);
+               return;
+       }
+
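+       /* No immediate exit and no deadline pending; disarm the timer. */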
+       if (vmx->loaded_vmcs->hv_timer_armed)
+               vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                               PIN_BASED_VMX_PREEMPTION_TIMER);
+       vmx->loaded_vmcs->hv_timer_armed = false;
 }
 
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
@@ -10672,7 +10706,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        atomic_switch_perf_msrs(vmx);
 
-       vmx_arm_hv_timer(vcpu);
+       vmx_update_hv_timer(vcpu);
 
        /*
         * If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -10917,6 +10951,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
        vmx->loaded_vmcs = vmcs;
        vmx_vcpu_load(vcpu, cpu);
        put_cpu();
+
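+       /*
+        * The entry/exit controls shadows and the segment cache still track
+        * the previously loaded VMCS; invalidate them for the new VMCS.
+        */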
+       vm_entry_controls_reset_shadow(vmx);
+       vm_exit_controls_reset_shadow(vmx);
+       vmx_segment_cache_clear(vmx);
 }
 
 /*
@@ -11214,6 +11252,23 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
 #undef cr4_fixed1_update
 }
 
+static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (kvm_mpx_supported()) {
+               bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
+
+               if (mpx_enabled) {
+                       vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+                       vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+               } else {
+                       vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
+                       vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
+               }
+       }
+}
+
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -11230,8 +11285,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
                to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
                        ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 
-       if (nested_vmx_allowed(vcpu))
+       if (nested_vmx_allowed(vcpu)) {
                nested_vmx_cr_fixed1_bits_update(vcpu);
+               nested_vmx_entry_exit_ctls_update(vcpu);
+       }
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -11274,11 +11331,9 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
        return get_vmcs12(vcpu)->ept_pointer;
 }
 
-static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
        WARN_ON(mmu_is_nested(vcpu));
-       if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
-               return 1;
 
        kvm_init_shadow_ept_mmu(vcpu,
                        to_vmx(vcpu)->nested.msrs.ept_caps &
@@ -11290,7 +11345,6 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
        vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
 
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
-       return 0;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
@@ -11427,16 +11481,18 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
        u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       if (vcpu->arch.virtual_tsc_khz == 0)
-               return;
-
-       /* Make sure short timeouts reliably trigger an immediate vmexit.
-        * hrtimer_start does not guarantee this. */
-       if (preemption_timeout <= 1) {
+       /*
+        * A timer value of zero is architecturally guaranteed to cause
+        * a VMExit prior to executing any instructions in the guest.
+        */
+       if (preemption_timeout == 0) {
                vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
                return;
        }
 
+       if (vcpu->arch.virtual_tsc_khz == 0)
+               return;
+
        preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
        preemption_timeout *= 1000000;
        do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
@@ -11646,11 +11702,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
         * bits 15:8 should be zero in posted_intr_nv,
         * the descriptor address has been already checked
         * in nested_get_vmcs12_pages.
+        *
+        * bits 5:0 of posted_intr_desc_addr should be zero.
         */
        if (nested_cpu_has_posted_intr(vmcs12) &&
           (!nested_cpu_has_vid(vmcs12) ||
            !nested_exit_intr_ack_set(vcpu) ||
-           vmcs12->posted_intr_nv & 0xff00))
+           (vmcs12->posted_intr_nv & 0xff00) ||
+           (vmcs12->posted_intr_desc_addr & 0x3f) ||
+           (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr))))
                return -EINVAL;
 
        /* tpr shadow is needed by all apicv features. */
@@ -11706,15 +11766,12 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
                                         struct vmcs12 *vmcs12)
 {
-       u64 address = vmcs12->pml_address;
-       int maxphyaddr = cpuid_maxphyaddr(vcpu);
+       if (!nested_cpu_has_pml(vmcs12))
+               return 0;
 
-       if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
-               if (!nested_cpu_has_ept(vmcs12) ||
-                   !IS_ALIGNED(address, 4096)  ||
-                   address >> maxphyaddr)
-                       return -EINVAL;
-       }
+       if (!nested_cpu_has_ept(vmcs12) ||
+           !page_address_valid(vcpu, vmcs12->pml_address))
+               return -EINVAL;
 
        return 0;
 }
@@ -11894,6 +11951,25 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
        return 0;
 }
 
+/*
+ * Returns true if KVM is able to configure the CPU to tag TLB entries
+ * populated by L2 differently than TLB entries populated
+ * by L1.
+ *
+ * If L1 uses EPT, then TLB entries are tagged with different EPTP.
+ *
+ * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
+ * with different VPID (L1 entries are tagged with vmx->vpid
+ * while L2 entries are tagged with vmx->nested.vpid02).
+ */
+static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+       return nested_cpu_has_ept(vmcs12) ||
+              (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
+}
+
 static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -11993,8 +12069,13 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        set_cr4_guest_host_mask(vmx);
 
-       if (vmx_mpx_supported())
-               vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+       if (kvm_mpx_supported()) {
+               if (vmx->nested.nested_run_pending &&
+                       (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+                       vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+               else
+                       vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+       }
 
        if (enable_vpid) {
                if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
@@ -12076,11 +12157,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
        exec_control = vmcs12->pin_based_vm_exec_control;
 
-       /* Preemption timer setting is only taken from vmcs01.  */
-       exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       /* Preemption timer setting is computed directly in vmx_vcpu_run.  */
        exec_control |= vmcs_config.pin_based_exec_ctrl;
-       if (vmx->hv_deadline_tsc == -1)
-               exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       vmx->loaded_vmcs->hv_timer_armed = false;
 
        /* Posted interrupts setting is only taken from vmcs12.  */
        if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -12186,7 +12266,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
         * bits are further modified by vmx_set_efer() below.
         */
-       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+       vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
 
        /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
         * emulated by vmx_set_efer(), below.
@@ -12218,13 +12298,21 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                 * influence global bitmap(for vpid01 and vpid02 allocation)
                 * even if spawn a lot of nested vCPUs.
                 */
-               if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+               if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
                        if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
                                vmx->nested.last_vpid = vmcs12->virtual_processor_id;
-                               __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+                               __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
                        }
                } else {
-                       vmx_flush_tlb(vcpu, true);
+                        * If L1 uses EPT, then L0 needs to execute INVEPT on
+                        * If L1 use EPT, then L0 needs to execute INVEPT on
+                        * EPTP02 instead of EPTP01. Therefore, delay TLB
+                        * flush until vmcs02->eptp is fully updated by
+                        * KVM_REQ_LOAD_CR3. Note that this assumes
+                        * KVM_REQ_TLB_FLUSH is evaluated after
+                        * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
+                        */
+                       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
                }
        }
 
@@ -12240,15 +12328,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }
 
-       if (nested_cpu_has_ept(vmcs12)) {
-               if (nested_ept_init_mmu_context(vcpu)) {
-                       *entry_failure_code = ENTRY_FAIL_DEFAULT;
-                       return 1;
-               }
-       } else if (nested_cpu_has2(vmcs12,
-                                  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+       if (nested_cpu_has_ept(vmcs12))
+               nested_ept_init_mmu_context(vcpu);
+       else if (nested_cpu_has2(vmcs12,
+                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                vmx_flush_tlb(vcpu, true);
-       }
 
        /*
         * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
@@ -12313,11 +12397,15 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
 static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       bool ia32e;
 
        if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
            vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+       if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
        if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
@@ -12383,6 +12471,21 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
            !nested_cr3_valid(vcpu, vmcs12->host_cr3))
                return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
 
+       /*
+        * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+        * IA32_EFER MSR must be 0 in the field for that register. In addition,
+        * the values of the LMA and LME bits in the field must each be that of
+        * the host address-space size VM-exit control.
+        */
+       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+               ia32e = (vmcs12->vm_exit_controls &
+                        VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+               if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
+                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
+                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
+                       return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+       }
+
        /*
         * From the Intel SDM, volume 3:
         * Fields relevant to VM-entry event injection must be set properly.
@@ -12439,6 +12542,10 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                }
        }
 
+       if (nested_cpu_has_ept(vmcs12) &&
+           !valid_ept_address(vcpu, vmcs12->ept_pointer))
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
        return 0;
 }
 
@@ -12504,21 +12611,6 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                        return 1;
        }
 
-       /*
-        * If the load IA32_EFER VM-exit control is 1, bits reserved in the
-        * IA32_EFER MSR must be 0 in the field for that register. In addition,
-        * the values of the LMA and LME bits in the field must each be that of
-        * the host address-space size VM-exit control.
-        */
-       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
-               ia32e = (vmcs12->vm_exit_controls &
-                        VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
-               if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
-                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
-                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
-                       return 1;
-       }
-
        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
                (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
                (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
@@ -12537,18 +12629,23 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        bool from_vmentry = !!exit_qual;
        u32 dummy_exit_qual;
-       u32 vmcs01_cpu_exec_ctrl;
+       bool evaluate_pending_interrupts;
        int r = 0;
 
-       vmcs01_cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+               (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
+       if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
+               evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
 
        enter_guest_mode(vcpu);
 
        if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
                vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+       if (kvm_mpx_supported() &&
+               !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+               vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 
        vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
-       vmx_segment_cache_clear(vmx);
 
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
                vcpu->arch.tsc_offset += vmcs12->tsc_offset;
@@ -12585,16 +12682,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
         * to L1 or delivered directly to L2 (e.g. In case L1 don't
         * intercept EXTERNAL_INTERRUPT).
         *
-        * Usually this would be handled by L0 requesting a
-        * IRQ/NMI window by setting VMCS accordingly. However,
-        * this setting was done on VMCS01 and now VMCS02 is active
-        * instead. Thus, we force L0 to perform pending event
-        * evaluation by requesting a KVM_REQ_EVENT.
-        */
-       if (vmcs01_cpu_exec_ctrl &
-               (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING)) {
+        * Usually this would be handled by the processor noticing an
+        * IRQ/NMI window request, or checking RVI during evaluation of
+        * pending virtual interrupts.  However, this setting was done
+        * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
+        * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
+        */
+       if (unlikely(evaluate_pending_interrupts))
                kvm_make_request(KVM_REQ_EVENT, vcpu);
-       }
 
        /*
         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -12863,6 +12958,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
        return 0;
 }
 
+static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+       to_vmx(vcpu)->req_immediate_exit = true;
+}
+
 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
 {
        ktime_t remaining =
@@ -13040,24 +13140,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        kvm_clear_interrupt_queue(vcpu);
 }
 
-static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu,
-                       struct vmcs12 *vmcs12)
-{
-       u32 entry_failure_code;
-
-       nested_ept_uninit_mmu_context(vcpu);
-
-       /*
-        * Only PDPTE load can fail as the value of cr3 was checked on entry and
-        * couldn't have changed.
-        */
-       if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
-               nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
-
-       if (!enable_ept)
-               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-}
-
 /*
  * A part of what we need to when the nested L2 guest exits and we want to
  * run its L1 parent, is to reset L1's guest state to the host state specified
@@ -13071,6 +13153,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                                   struct vmcs12 *vmcs12)
 {
        struct kvm_segment seg;
+       u32 entry_failure_code;
 
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
                vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -13097,23 +13180,35 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
        vmx_set_cr4(vcpu, vmcs12->host_cr4);
 
-       load_vmcs12_mmu_host_state(vcpu, vmcs12);
+       nested_ept_uninit_mmu_context(vcpu);
 
        /*
-        * If vmcs01 don't use VPID, CPU flushes TLB on every
+        * Only PDPTE load can fail as the value of cr3 was checked on entry and
+        * couldn't have changed.
+        */
+       if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+               nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
+
+       if (!enable_ept)
+               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+
+       /*
+        * If vmcs01 doesn't use VPID, CPU flushes TLB on every
         * VMEntry/VMExit. Thus, no need to flush TLB.
         *
-        * If vmcs12 uses VPID, TLB entries populated by L2 are
-        * tagged with vmx->nested.vpid02 while L1 entries are tagged
-        * with vmx->vpid. Thus, no need to flush TLB.
+        * If vmcs12 doesn't use VPID, L1 expects TLB to be
+        * flushed on every VMEntry/VMExit.
         *
-        * Therefore, flush TLB only in case vmcs01 uses VPID and
-        * vmcs12 don't use VPID as in this case L1 & L2 TLB entries
-        * are both tagged with vmx->vpid.
+        * Otherwise, we can preserve TLB entries as long as we are
+        * able to tag L1 TLB entries differently than L2 TLB entries.
+        *
+        * If vmcs12 uses EPT, we need to execute this flush on EPTP01
+        * and therefore we request the TLB flush to happen only after VMCS EPTP
+        * has been set by KVM_REQ_LOAD_CR3.
         */
        if (enable_vpid &&
-           !(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02)) {
-               vmx_flush_tlb(vcpu, true);
+           (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
+               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
        }
 
        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
@@ -13193,6 +13288,140 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
 }
 
+static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
+{
+       struct shared_msr_entry *efer_msr;
+       unsigned int i;
+
+       if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
+               return vmcs_read64(GUEST_IA32_EFER);
+
+       if (cpu_has_load_ia32_efer)
+               return host_efer;
+
+       for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
+               if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
+                       return vmx->msr_autoload.guest.val[i].value;
+       }
+
+       efer_msr = find_msr_entry(vmx, MSR_EFER);
+       if (efer_msr)
+               return efer_msr->data;
+
+       return host_efer;
+}
+
+static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct vmx_msr_entry g, h;
+       struct msr_data msr;
+       gpa_t gpa;
+       u32 i, j;
+
+       vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
+
+       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+               /*
+                * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
+                * as vmcs01.GUEST_DR7 contains a userspace defined value
+                * and vcpu->arch.dr7 is not squirreled away before the
+                * nested VMENTER (not worth adding a variable in nested_vmx).
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+                       kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+               else
+                       WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+       }
+
+       /*
+        * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+        * handle a variety of side effects to KVM's software model.
+        */
+       vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
+
+       vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+       vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
+
+       vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+       vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
+
+       nested_ept_uninit_mmu_context(vcpu);
+       vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+       __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+       /*
+        * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
+        * from vmcs01 (if necessary).  The PDPTRs are not loaded on
+        * VMFail; like everything else, we just need to ensure our
+        * software model is up-to-date.
+        */
+       ept_save_pdptrs(vcpu);
+
+       kvm_mmu_reset_context(vcpu);
+
+       if (cpu_has_vmx_msr_bitmap())
+               vmx_update_msr_bitmap(vcpu);
+
+       /*
+        * This nasty bit of open coding is a compromise between blindly
+        * loading L1's MSRs using the exit load lists (incorrect emulation
+        * of VMFail), leaving the nested VM's MSRs in the software model
+        * (incorrect behavior) and snapshotting the modified MSRs (too
+        * expensive since the lists are unbound by hardware).  For each
+        * MSR that was (prematurely) loaded from the nested VMEntry load
+        * list, reload it from the exit load list if it exists and differs
+        * from the guest value.  The intent is to stuff host state as
+        * silently as possible, not to fully process the exit load list.
+        */
+       msr.host_initiated = false;
+       for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
+               gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
+               if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
+                       pr_debug_ratelimited(
+                               "%s read MSR index failed (%u, 0x%08llx)\n",
+                               __func__, i, gpa);
+                       goto vmabort;
+               }
+
+               for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
+                       gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
+                       if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
+                               pr_debug_ratelimited(
+                                       "%s read MSR failed (%u, 0x%08llx)\n",
+                                       __func__, j, gpa);
+                               goto vmabort;
+                       }
+                       if (h.index != g.index)
+                               continue;
+                       if (h.value == g.value)
+                               break;
+
+                       if (nested_vmx_load_msr_check(vcpu, &h)) {
+                               pr_debug_ratelimited(
+                                       "%s check failed (%u, 0x%x, 0x%x)\n",
+                                       __func__, j, h.index, h.reserved);
+                               goto vmabort;
+                       }
+
+                       msr.index = h.index;
+                       msr.data = h.value;
+                       if (kvm_set_msr(vcpu, &msr)) {
+                               pr_debug_ratelimited(
+                                       "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
+                                       __func__, j, h.index, h.value);
+                               goto vmabort;
+                       }
+               }
+       }
+
+       return;
+
+vmabort:
+       nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+}
+
 /*
  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
  * and modify vmcs12 to make it see what it would expect to see there if
@@ -13245,20 +13474,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        }
 
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-       vm_entry_controls_reset_shadow(vmx);
-       vm_exit_controls_reset_shadow(vmx);
-       vmx_segment_cache_clear(vmx);
 
        /* Update any VMCS fields that might have changed while L2 ran */
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
-       if (vmx->hv_deadline_tsc == -1)
-               vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
-                               PIN_BASED_VMX_PREEMPTION_TIMER);
-       else
-               vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
-                             PIN_BASED_VMX_PREEMPTION_TIMER);
+
        if (kvm_has_tsc_control)
                decache_tsc_multiplier(vmx);
 
@@ -13342,7 +13563,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
         */
        nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 
-       load_vmcs12_mmu_host_state(vcpu, vmcs12);
+       /*
+        * Restore L1's host state to KVM's software model.  We're here
+        * because hardware caught a consistency check failure, which
+        * means some amount of guest state has been propagated to KVM's
+        * model and needs to be unwound to the host's state.
+        */
+       nested_vmx_restore_host_state(vcpu);
 
        /*
         * The emulated instruction was already skipped in
@@ -13462,18 +13689,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
                return -ERANGE;
 
        vmx->hv_deadline_tsc = tscl + delta_tsc;
-       vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
-                       PIN_BASED_VMX_PREEMPTION_TIMER);
-
        return delta_tsc == 0;
 }
 
 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       vmx->hv_deadline_tsc = -1;
-       vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
-                       PIN_BASED_VMX_PREEMPTION_TIMER);
+       to_vmx(vcpu)->hv_deadline_tsc = -1;
 }
 #endif
 
@@ -13954,6 +14175,14 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
            ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
                return -EINVAL;
 
+       /*
+        * SMM temporarily disables VMX, so we cannot be in guest mode,
+        * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
+        * must be zero.
+        */
+       if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
+               return -EINVAL;
+
        if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
            !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
                return -EINVAL;
@@ -14097,6 +14326,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .apicv_post_state_restore = vmx_apicv_post_state_restore,
        .hwapic_irr_update = vmx_hwapic_irr_update,
        .hwapic_isr_update = vmx_hwapic_isr_update,
+       .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
        .sync_pir_to_irr = vmx_sync_pir_to_irr,
        .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
 
@@ -14130,6 +14360,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .umip_emulated = vmx_umip_emulated,
 
        .check_nested_events = vmx_check_nested_events,
+       .request_immediate_exit = vmx_request_immediate_exit,
 
        .sched_in = vmx_sched_in,