x86/KVM/VMX: Use MSR save list for IA32_FLUSH_CMD if required
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 40aa292..eb7c207 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
 };
 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 
+static bool __read_mostly nosmt;
+module_param(nosmt, bool, S_IRUGO);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -188,6 +191,54 @@ module_param(ple_window_max, uint, 0444);
 
 extern const ulong vmx_return;
 
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+
+/* These MUST be in sync with vmentry_l1d_param order. */
+enum vmx_l1d_flush_state {
+       VMENTER_L1D_FLUSH_NEVER,
+       VMENTER_L1D_FLUSH_COND,
+       VMENTER_L1D_FLUSH_ALWAYS,
+};
+
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush = VMENTER_L1D_FLUSH_COND;
+
+static const struct {
+       const char *option;
+       enum vmx_l1d_flush_state cmd;
+} vmentry_l1d_param[] = {
+       {"never",       VMENTER_L1D_FLUSH_NEVER},
+       {"cond",        VMENTER_L1D_FLUSH_COND},
+       {"always",      VMENTER_L1D_FLUSH_ALWAYS},
+};
+
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+       unsigned int i;
+
+       if (!s)
+               return -EINVAL;
+
+       for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+               if (!strcmp(s, vmentry_l1d_param[i].option)) {
+                       vmentry_l1d_flush = vmentry_l1d_param[i].cmd;
+                       return 0;
+               }
+       }
+
+       return -EINVAL;
+}
+
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+       return sprintf(s, "%s\n", vmentry_l1d_param[vmentry_l1d_flush].option);
+}
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+       .set = vmentry_l1d_flush_set,
+       .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, &vmentry_l1d_flush, S_IRUGO);
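The option table above is looked up positionally by vmentry_l1d_flush_get(), which is why the comment over the enum insists the two stay in the same order. With perm S_IRUGO the parameter is set at module load time (for example kvm-intel.vmentry_l1d_flush=never on the kernel command line) and only read back through /sys/module/kvm_intel/parameters/vmentry_l1d_flush. As an illustrative aside that is not part of this patch, the ordering requirement could be made self-enforcing with designated initializers keyed by the enum values defined above (vmentry_l1d_flush_options is a hypothetical name):

    /* Illustrative sketch only, not in this patch: tying each option
     * string to its enum value removes the ordering dependency. */
    static const char * const vmentry_l1d_flush_options[] = {
            [VMENTER_L1D_FLUSH_NEVER]  = "never",
            [VMENTER_L1D_FLUSH_COND]   = "cond",
            [VMENTER_L1D_FLUSH_ALWAYS] = "always",
    };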
+
 struct kvm_vmx {
        struct kvm kvm;
 
@@ -242,7 +293,11 @@ struct shared_msr_entry {
  * underlying hardware which will be used to run L2.
  * This structure is packed to ensure that its layout is identical across
  * machines (necessary for live migration).
- * If there are changes in this struct, VMCS12_REVISION must be changed.
+ *
+ * IMPORTANT: Changing the layout of existing fields in this structure
+ * will break save/restore compatibility with older kvm releases. When
+ * adding new fields, either use space in the reserved padding* arrays
+ * or add the new fields to the end of the structure.
  */
 typedef u64 natural_width;
 struct __packed vmcs12 {
@@ -265,17 +320,14 @@ struct __packed vmcs12 {
        u64 virtual_apic_page_addr;
        u64 apic_access_addr;
        u64 posted_intr_desc_addr;
-       u64 vm_function_control;
        u64 ept_pointer;
        u64 eoi_exit_bitmap0;
        u64 eoi_exit_bitmap1;
        u64 eoi_exit_bitmap2;
        u64 eoi_exit_bitmap3;
-       u64 eptp_list_address;
        u64 xss_exit_bitmap;
        u64 guest_physical_address;
        u64 vmcs_link_pointer;
-       u64 pml_address;
        u64 guest_ia32_debugctl;
        u64 guest_ia32_pat;
        u64 guest_ia32_efer;
@@ -288,7 +340,12 @@ struct __packed vmcs12 {
        u64 host_ia32_pat;
        u64 host_ia32_efer;
        u64 host_ia32_perf_global_ctrl;
-       u64 padding64[8]; /* room for future expansion */
+       u64 vmread_bitmap;
+       u64 vmwrite_bitmap;
+       u64 vm_function_control;
+       u64 eptp_list_address;
+       u64 pml_address;
+       u64 padding64[3]; /* room for future expansion */
        /*
         * To allow migration of L1 (complete with its L2 guests) between
         * machines of different natural widths (32 or 64 bit), we cannot have
@@ -397,7 +454,6 @@ struct __packed vmcs12 {
        u16 guest_ldtr_selector;
        u16 guest_tr_selector;
        u16 guest_intr_status;
-       u16 guest_pml_index;
        u16 host_es_selector;
        u16 host_cs_selector;
        u16 host_ss_selector;
@@ -405,12 +461,172 @@ struct __packed vmcs12 {
        u16 host_fs_selector;
        u16 host_gs_selector;
        u16 host_tr_selector;
+       u16 guest_pml_index;
 };
 
+/*
+ * For save/restore compatibility, the vmcs12 field offsets must not change.
+ */
+#define CHECK_OFFSET(field, loc)                               \
+       BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc),       \
+               "Offset of " #field " in struct vmcs12 has changed.")
+
+static inline void vmx_check_vmcs12_offsets(void) {
+       CHECK_OFFSET(revision_id, 0);
+       CHECK_OFFSET(abort, 4);
+       CHECK_OFFSET(launch_state, 8);
+       CHECK_OFFSET(io_bitmap_a, 40);
+       CHECK_OFFSET(io_bitmap_b, 48);
+       CHECK_OFFSET(msr_bitmap, 56);
+       CHECK_OFFSET(vm_exit_msr_store_addr, 64);
+       CHECK_OFFSET(vm_exit_msr_load_addr, 72);
+       CHECK_OFFSET(vm_entry_msr_load_addr, 80);
+       CHECK_OFFSET(tsc_offset, 88);
+       CHECK_OFFSET(virtual_apic_page_addr, 96);
+       CHECK_OFFSET(apic_access_addr, 104);
+       CHECK_OFFSET(posted_intr_desc_addr, 112);
+       CHECK_OFFSET(ept_pointer, 120);
+       CHECK_OFFSET(eoi_exit_bitmap0, 128);
+       CHECK_OFFSET(eoi_exit_bitmap1, 136);
+       CHECK_OFFSET(eoi_exit_bitmap2, 144);
+       CHECK_OFFSET(eoi_exit_bitmap3, 152);
+       CHECK_OFFSET(xss_exit_bitmap, 160);
+       CHECK_OFFSET(guest_physical_address, 168);
+       CHECK_OFFSET(vmcs_link_pointer, 176);
+       CHECK_OFFSET(guest_ia32_debugctl, 184);
+       CHECK_OFFSET(guest_ia32_pat, 192);
+       CHECK_OFFSET(guest_ia32_efer, 200);
+       CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
+       CHECK_OFFSET(guest_pdptr0, 216);
+       CHECK_OFFSET(guest_pdptr1, 224);
+       CHECK_OFFSET(guest_pdptr2, 232);
+       CHECK_OFFSET(guest_pdptr3, 240);
+       CHECK_OFFSET(guest_bndcfgs, 248);
+       CHECK_OFFSET(host_ia32_pat, 256);
+       CHECK_OFFSET(host_ia32_efer, 264);
+       CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
+       CHECK_OFFSET(vmread_bitmap, 280);
+       CHECK_OFFSET(vmwrite_bitmap, 288);
+       CHECK_OFFSET(vm_function_control, 296);
+       CHECK_OFFSET(eptp_list_address, 304);
+       CHECK_OFFSET(pml_address, 312);
+       CHECK_OFFSET(cr0_guest_host_mask, 344);
+       CHECK_OFFSET(cr4_guest_host_mask, 352);
+       CHECK_OFFSET(cr0_read_shadow, 360);
+       CHECK_OFFSET(cr4_read_shadow, 368);
+       CHECK_OFFSET(cr3_target_value0, 376);
+       CHECK_OFFSET(cr3_target_value1, 384);
+       CHECK_OFFSET(cr3_target_value2, 392);
+       CHECK_OFFSET(cr3_target_value3, 400);
+       CHECK_OFFSET(exit_qualification, 408);
+       CHECK_OFFSET(guest_linear_address, 416);
+       CHECK_OFFSET(guest_cr0, 424);
+       CHECK_OFFSET(guest_cr3, 432);
+       CHECK_OFFSET(guest_cr4, 440);
+       CHECK_OFFSET(guest_es_base, 448);
+       CHECK_OFFSET(guest_cs_base, 456);
+       CHECK_OFFSET(guest_ss_base, 464);
+       CHECK_OFFSET(guest_ds_base, 472);
+       CHECK_OFFSET(guest_fs_base, 480);
+       CHECK_OFFSET(guest_gs_base, 488);
+       CHECK_OFFSET(guest_ldtr_base, 496);
+       CHECK_OFFSET(guest_tr_base, 504);
+       CHECK_OFFSET(guest_gdtr_base, 512);
+       CHECK_OFFSET(guest_idtr_base, 520);
+       CHECK_OFFSET(guest_dr7, 528);
+       CHECK_OFFSET(guest_rsp, 536);
+       CHECK_OFFSET(guest_rip, 544);
+       CHECK_OFFSET(guest_rflags, 552);
+       CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
+       CHECK_OFFSET(guest_sysenter_esp, 568);
+       CHECK_OFFSET(guest_sysenter_eip, 576);
+       CHECK_OFFSET(host_cr0, 584);
+       CHECK_OFFSET(host_cr3, 592);
+       CHECK_OFFSET(host_cr4, 600);
+       CHECK_OFFSET(host_fs_base, 608);
+       CHECK_OFFSET(host_gs_base, 616);
+       CHECK_OFFSET(host_tr_base, 624);
+       CHECK_OFFSET(host_gdtr_base, 632);
+       CHECK_OFFSET(host_idtr_base, 640);
+       CHECK_OFFSET(host_ia32_sysenter_esp, 648);
+       CHECK_OFFSET(host_ia32_sysenter_eip, 656);
+       CHECK_OFFSET(host_rsp, 664);
+       CHECK_OFFSET(host_rip, 672);
+       CHECK_OFFSET(pin_based_vm_exec_control, 744);
+       CHECK_OFFSET(cpu_based_vm_exec_control, 748);
+       CHECK_OFFSET(exception_bitmap, 752);
+       CHECK_OFFSET(page_fault_error_code_mask, 756);
+       CHECK_OFFSET(page_fault_error_code_match, 760);
+       CHECK_OFFSET(cr3_target_count, 764);
+       CHECK_OFFSET(vm_exit_controls, 768);
+       CHECK_OFFSET(vm_exit_msr_store_count, 772);
+       CHECK_OFFSET(vm_exit_msr_load_count, 776);
+       CHECK_OFFSET(vm_entry_controls, 780);
+       CHECK_OFFSET(vm_entry_msr_load_count, 784);
+       CHECK_OFFSET(vm_entry_intr_info_field, 788);
+       CHECK_OFFSET(vm_entry_exception_error_code, 792);
+       CHECK_OFFSET(vm_entry_instruction_len, 796);
+       CHECK_OFFSET(tpr_threshold, 800);
+       CHECK_OFFSET(secondary_vm_exec_control, 804);
+       CHECK_OFFSET(vm_instruction_error, 808);
+       CHECK_OFFSET(vm_exit_reason, 812);
+       CHECK_OFFSET(vm_exit_intr_info, 816);
+       CHECK_OFFSET(vm_exit_intr_error_code, 820);
+       CHECK_OFFSET(idt_vectoring_info_field, 824);
+       CHECK_OFFSET(idt_vectoring_error_code, 828);
+       CHECK_OFFSET(vm_exit_instruction_len, 832);
+       CHECK_OFFSET(vmx_instruction_info, 836);
+       CHECK_OFFSET(guest_es_limit, 840);
+       CHECK_OFFSET(guest_cs_limit, 844);
+       CHECK_OFFSET(guest_ss_limit, 848);
+       CHECK_OFFSET(guest_ds_limit, 852);
+       CHECK_OFFSET(guest_fs_limit, 856);
+       CHECK_OFFSET(guest_gs_limit, 860);
+       CHECK_OFFSET(guest_ldtr_limit, 864);
+       CHECK_OFFSET(guest_tr_limit, 868);
+       CHECK_OFFSET(guest_gdtr_limit, 872);
+       CHECK_OFFSET(guest_idtr_limit, 876);
+       CHECK_OFFSET(guest_es_ar_bytes, 880);
+       CHECK_OFFSET(guest_cs_ar_bytes, 884);
+       CHECK_OFFSET(guest_ss_ar_bytes, 888);
+       CHECK_OFFSET(guest_ds_ar_bytes, 892);
+       CHECK_OFFSET(guest_fs_ar_bytes, 896);
+       CHECK_OFFSET(guest_gs_ar_bytes, 900);
+       CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
+       CHECK_OFFSET(guest_tr_ar_bytes, 908);
+       CHECK_OFFSET(guest_interruptibility_info, 912);
+       CHECK_OFFSET(guest_activity_state, 916);
+       CHECK_OFFSET(guest_sysenter_cs, 920);
+       CHECK_OFFSET(host_ia32_sysenter_cs, 924);
+       CHECK_OFFSET(vmx_preemption_timer_value, 928);
+       CHECK_OFFSET(virtual_processor_id, 960);
+       CHECK_OFFSET(posted_intr_nv, 962);
+       CHECK_OFFSET(guest_es_selector, 964);
+       CHECK_OFFSET(guest_cs_selector, 966);
+       CHECK_OFFSET(guest_ss_selector, 968);
+       CHECK_OFFSET(guest_ds_selector, 970);
+       CHECK_OFFSET(guest_fs_selector, 972);
+       CHECK_OFFSET(guest_gs_selector, 974);
+       CHECK_OFFSET(guest_ldtr_selector, 976);
+       CHECK_OFFSET(guest_tr_selector, 978);
+       CHECK_OFFSET(guest_intr_status, 980);
+       CHECK_OFFSET(host_es_selector, 982);
+       CHECK_OFFSET(host_cs_selector, 984);
+       CHECK_OFFSET(host_ss_selector, 986);
+       CHECK_OFFSET(host_ds_selector, 988);
+       CHECK_OFFSET(host_fs_selector, 990);
+       CHECK_OFFSET(host_gs_selector, 992);
+       CHECK_OFFSET(host_tr_selector, 994);
+       CHECK_OFFSET(guest_pml_index, 996);
+}
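Because CHECK_OFFSET() expands to BUILD_BUG_ON_MSG(offsetof(...) != loc, ...), every line in vmx_check_vmcs12_offsets() is evaluated at compile time; moving or resizing a vmcs12 field now breaks the build instead of silently breaking save/restore. Outside the kernel the same guard can be written with C11 _Static_assert. A minimal sketch with a hypothetical struct (not the real vmcs12):

    #include <stddef.h>

    struct ex_state {                       /* hypothetical example struct */
            unsigned int revision_id;       /* expected at offset 0 */
            unsigned int abort;             /* expected at offset 4 */
            unsigned long long launch;      /* expected at offset 8 */
    };

    /* The build fails if a field is moved, resized, or the struct is
     * repacked -- the same effect CHECK_OFFSET() has for vmcs12. */
    _Static_assert(offsetof(struct ex_state, abort) == 4,
                   "offset of abort changed");
    _Static_assert(offsetof(struct ex_state, launch) == 8,
                   "offset of launch changed");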
+
 /*
  * VMCS12_REVISION is an arbitrary id that should be changed if the content or
  * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
  * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ *
+ * IMPORTANT: Changing this value will break save/restore compatibility with
+ * older kvm releases.
  */
 #define VMCS12_REVISION 0x11e57ed0
 
@@ -481,7 +697,8 @@ struct nested_vmx {
        bool sync_shadow_vmcs;
        bool dirty_vmcs12;
 
-       bool change_vmcs01_virtual_x2apic_mode;
+       bool change_vmcs01_virtual_apic_mode;
+
        /* L2 must run next, and mustn't decide to exit to L1. */
        bool nested_run_pending;
 
@@ -591,6 +808,11 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
                        (unsigned long *)&pi_desc->control);
 }
 
+struct vmx_msrs {
+       unsigned int            nr;
+       struct vmx_msr_entry    val[NR_AUTOLOAD_MSRS];
+};
+
 struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        unsigned long         host_rsp;
@@ -624,9 +846,8 @@ struct vcpu_vmx {
        struct loaded_vmcs   *loaded_vmcs;
        bool                  __launched; /* temporary, used in vmx_vcpu_run */
        struct msr_autoload {
-               unsigned nr;
-               struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
-               struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+               struct vmx_msrs guest;
+               struct vmx_msrs host;
        } msr_autoload;
        struct {
                int           loaded;
@@ -761,6 +982,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
        FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
        FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
+       FIELD64(PML_ADDRESS, pml_address),
        FIELD64(TSC_OFFSET, tsc_offset),
        FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
        FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
@@ -772,10 +994,11 @@ static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
        FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
        FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
+       FIELD64(VMREAD_BITMAP, vmread_bitmap),
+       FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
        FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
        FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
        FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
-       FIELD64(PML_ADDRESS, pml_address),
        FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
        FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
        FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
@@ -1089,6 +1312,16 @@ static inline u16 evmcs_read16(unsigned long field)
        return *(u16 *)((char *)current_evmcs + offset);
 }
 
+static inline void evmcs_touch_msr_bitmap(void)
+{
+       if (unlikely(!current_evmcs))
+               return;
+
+       if (current_evmcs->hv_enlightenments_control.msr_bitmap)
+               current_evmcs->hv_clean_fields &=
+                       ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+}
+
 static void evmcs_load(u64 phys_addr)
 {
        struct hv_vp_assist_page *vp_ap =
@@ -1173,6 +1406,7 @@ static inline u32 evmcs_read32(unsigned long field) { return 0; }
 static inline u16 evmcs_read16(unsigned long field) { return 0; }
 static inline void evmcs_load(u64 phys_addr) {}
 static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
+static inline void evmcs_touch_msr_bitmap(void) {}
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
 static inline bool is_exception_n(u32 intr_info, u8 vector)
@@ -1393,6 +1627,11 @@ static inline bool cpu_has_vmx_invept_global(void)
        return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
 }
 
+static inline bool cpu_has_vmx_invvpid_individual_addr(void)
+{
+       return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
+}
+
 static inline bool cpu_has_vmx_invvpid_single(void)
 {
        return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
@@ -1510,6 +1749,17 @@ static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
        return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
 }
 
+/*
+ * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
+ * to modify any valid field of the VMCS, or are the VM-exit
+ * information fields read-only?
+ */
+static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
+{
+       return to_vmx(vcpu)->nested.msrs.misc_low &
+               MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
+}
+
 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
 {
        return vmcs12->cpu_based_vm_exec_control & bit;
@@ -2171,9 +2421,20 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
        vm_exit_controls_clearbit(vmx, exit);
 }
 
+static int find_msr(struct vmx_msrs *m, unsigned int msr)
+{
+       unsigned int i;
+
+       for (i = 0; i < m->nr; ++i) {
+               if (m->val[i].index == msr)
+                       return i;
+       }
+       return -ENOENT;
+}
+
 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 {
-       unsigned i;
+       int i;
        struct msr_autoload *m = &vmx->msr_autoload;
 
        switch (msr) {
@@ -2194,18 +2455,21 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
                }
                break;
        }
+       i = find_msr(&m->guest, msr);
+       if (i < 0)
+               goto skip_guest;
+       --m->guest.nr;
+       m->guest.val[i] = m->guest.val[m->guest.nr];
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
 
-       for (i = 0; i < m->nr; ++i)
-               if (m->guest[i].index == msr)
-                       break;
-
-       if (i == m->nr)
+skip_guest:
+       i = find_msr(&m->host, msr);
+       if (i < 0)
                return;
-       --m->nr;
-       m->guest[i] = m->guest[m->nr];
-       m->host[i] = m->host[m->nr];
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+
+       --m->host.nr;
+       m->host.val[i] = m->host.val[m->host.nr];
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
 }
 
 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
@@ -2220,9 +2484,9 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
 }
 
 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
-                                 u64 guest_val, u64 host_val)
+                                 u64 guest_val, u64 host_val, bool entry_only)
 {
-       unsigned i;
+       int i, j = 0;
        struct msr_autoload *m = &vmx->msr_autoload;
 
        switch (msr) {
@@ -2257,24 +2521,31 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
        }
 
-       for (i = 0; i < m->nr; ++i)
-               if (m->guest[i].index == msr)
-                       break;
+       i = find_msr(&m->guest, msr);
+       if (!entry_only)
+               j = find_msr(&m->host, msr);
 
-       if (i == NR_AUTOLOAD_MSRS) {
+       if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
                printk_once(KERN_WARNING "Not enough msr switch entries. "
                                "Can't add msr %x\n", msr);
                return;
-       } else if (i == m->nr) {
-               ++m->nr;
-               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
-               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
        }
+       if (i < 0) {
+               i = m->guest.nr++;
+               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+       }
+       m->guest.val[i].index = msr;
+       m->guest.val[i].value = guest_val;
 
-       m->guest[i].index = msr;
-       m->guest[i].value = guest_val;
-       m->host[i].index = msr;
-       m->host[i].value = host_val;
+       if (entry_only)
+               return;
+
+       if (j < 0) {
+               j = m->host.nr++;
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+       }
+       m->host.val[j].index = msr;
+       m->host.val[j].value = host_val;
 }
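With this change the atomic-switch area is two independent lists: m->guest feeds the VM-entry MSR-load list and m->host feeds the VM-exit MSR-load list, each with its own count written to the VMCS. find_msr() returns a slot index or -ENOENT, an addition for an already-present MSR reuses its slot, removal (in clear_atomic_switch_msr() above) swaps the last entry into the freed slot so the arrays stay dense, and entry_only lets a caller add a guest-side entry with no host counterpart. A standalone sketch of that list discipline, using hypothetical miniature types rather than the real vmx structures:

    #include <stdio.h>

    struct mini_msr  { unsigned int index; unsigned long long value; };
    struct mini_list { unsigned int nr; struct mini_msr val[8]; };

    static int mini_find(const struct mini_list *m, unsigned int msr)
    {
            for (unsigned int i = 0; i < m->nr; i++)
                    if (m->val[i].index == msr)
                            return i;
            return -1;                      /* stands in for -ENOENT */
    }

    static void mini_add(struct mini_list *m, unsigned int msr,
                         unsigned long long value)
    {
            int i = mini_find(m, msr);

            if (i < 0)
                    i = m->nr++;            /* append a new slot */
            m->val[i].index = msr;          /* otherwise update in place */
            m->val[i].value = value;
    }

    static void mini_del(struct mini_list *m, unsigned int msr)
    {
            int i = mini_find(m, msr);

            if (i >= 0)
                    m->val[i] = m->val[--m->nr];    /* swap last into hole */
    }

    int main(void)
    {
            struct mini_list guest = { 0 };

            mini_add(&guest, 0x10b, 1);     /* entry-only style addition */
            mini_add(&guest, 0x10b, 1);     /* duplicate reuses the slot */
            mini_del(&guest, 0x10b);
            printf("entries: %u\n", guest.nr);      /* prints 0 */
            return 0;
    }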
 
 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
@@ -2318,7 +2589,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
                        guest_efer &= ~EFER_LME;
                if (guest_efer != host_efer)
                        add_atomic_switch_msr(vmx, MSR_EFER,
-                                             guest_efer, host_efer);
+                                             guest_efer, host_efer, false);
                return false;
        } else {
                guest_efer &= ~ignore_bits;
@@ -3127,6 +3398,7 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
                msrs->misc_high);
        msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
        msrs->misc_low |=
+               MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
                VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
                VMX_MISC_ACTIVITY_HLT;
        msrs->misc_high = 0;
@@ -3300,6 +3572,15 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
 
        vmx->nested.msrs.misc_low = data;
        vmx->nested.msrs.misc_high = data >> 32;
+
+       /*
+        * If L1 has read-only VM-exit information fields, use the
+        * less permissive vmx_vmwrite_bitmap to specify write
+        * permissions for the shadow VMCS.
+        */
+       if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
+               vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
+
        return 0;
 }
 
@@ -3354,6 +3635,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+       /*
+        * Don't allow changes to the VMX capability MSRs while the vCPU
+        * is in VMX operation.
+        */
+       if (vmx->nested.vmxon)
+               return -EBUSY;
+
        switch (msr_index) {
        case MSR_IA32_VMX_BASIC:
                return vmx_restore_vmx_basic(vmx, data);
@@ -3755,7 +4043,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vcpu->arch.ia32_xss = data;
                if (vcpu->arch.ia32_xss != host_xss)
                        add_atomic_switch_msr(vmx, MSR_IA32_XSS,
-                               vcpu->arch.ia32_xss, host_xss);
+                               vcpu->arch.ia32_xss, host_xss, false);
                else
                        clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
                break;
@@ -4216,6 +4504,15 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
                if (!loaded_vmcs->msr_bitmap)
                        goto out_vmcs;
                memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
+
+               if (IS_ENABLED(CONFIG_HYPERV) &&
+                   static_branch_unlikely(&enable_evmcs) &&
+                   (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+                       struct hv_enlightened_vmcs *evmcs =
+                               (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
+
+                       evmcs->hv_enlightenments_control.msr_bitmap = 1;
+               }
        }
        return 0;
 
@@ -5329,6 +5626,9 @@ static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bit
        if (!cpu_has_vmx_msr_bitmap())
                return;
 
+       if (static_branch_unlikely(&enable_evmcs))
+               evmcs_touch_msr_bitmap();
+
        /*
         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
         * have the write-low and read-high bitmap offsets the wrong way round.
@@ -5364,6 +5664,9 @@ static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitm
        if (!cpu_has_vmx_msr_bitmap())
                return;
 
+       if (static_branch_unlikely(&enable_evmcs))
+               evmcs_touch_msr_bitmap();
+
        /*
         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
         * have the write-low and read-high bitmap offsets the wrong way round.
@@ -5934,6 +6237,16 @@ static void ept_set_mmio_spte_mask(void)
                                   VMX_EPT_MISCONFIG_WX_VALUE);
 }
 
+static bool vmx_l1d_use_msr_save_list(void)
+{
+       if (!enable_ept || !boot_cpu_has_bug(X86_BUG_L1TF) ||
+           static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+           !static_cpu_has(X86_FEATURE_FLUSH_L1D))
+               return false;
+
+       return vmentry_l1d_flush == VMENTER_L1D_FLUSH_ALWAYS;
+}
+
 #define VMX_XSS_EXIT_BITMAP 0
 /*
  * Sets up the vmcs for emulated real mode.
@@ -5946,8 +6259,14 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
        int i;
 
        if (enable_shadow_vmcs) {
+               /*
+                * At vCPU creation, "VMWRITE to any supported field
+                * in the VMCS" is supported, so use the more
+                * permissive vmx_vmread_bitmap to specify both read
+                * and write permissions for the shadow VMCS.
+                */
                vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
-               vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
+               vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
        }
        if (cpu_has_vmx_msr_bitmap())
                vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
@@ -6006,9 +6325,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
-       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
 
        if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
@@ -6049,6 +6368,12 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }
+       /*
+        * If flushing the L1D cache on every VMENTER is enforced and the
+        * MSR is available, use the MSR save list.
+        */
+       if (vmx_l1d_use_msr_save_list())
+               add_atomic_switch_msr(vmx, MSR_IA32_FLUSH_CMD, L1D_FLUSH, 0, true);
 }
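vmx_l1d_use_msr_save_list() (added above) is true only when EPT is enabled, the host is affected bare metal, the CPU exposes IA32_FLUSH_CMD, and the flush mode is 'always'; in that case the flush is offloaded here to the VM-entry MSR-load list, so the CPU itself loads L1D_FLUSH into MSR_IA32_FLUSH_CMD on every VM entry and no software path can miss it. entry_only is true because IA32_FLUSH_CMD is a write-only command MSR with nothing to restore on VM exit, so no host-list entry is needed. For illustration, the 16-byte entry format the hardware walks on entry, as a hypothetical standalone mirror of struct vmx_msr_entry (MSR index and command bit per the SDM and kernel headers):

    #include <stdint.h>

    struct msr_load_entry {         /* mirrors struct vmx_msr_entry */
            uint32_t index;         /* MSR number */
            uint32_t reserved;
            uint64_t value;         /* value loaded on VM entry */
    };

    static const struct msr_load_entry l1d_flush_entry = {
            .index = 0x10b,         /* MSR_IA32_FLUSH_CMD */
            .value = 1ull << 0,     /* L1D_FLUSH command bit */
    };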
 
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -7588,8 +7913,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
                        vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
                return 1;
 
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
-                               sizeof(*vmpointer), &e)) {
+       if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@ -7670,6 +7994,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                return 1;
        }
 
+       /* CPL=0 must be checked manually. */
+       if (vmx_get_cpl(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
        if (vmx->nested.vmxon) {
                nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
                return kvm_skip_emulated_instruction(vcpu);
@@ -7729,6 +8059,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
  */
 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 {
+       if (vmx_get_cpl(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 0;
+       }
+
        if (!to_vmx(vcpu)->nested.vmxon) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 0;
@@ -7928,23 +8263,42 @@ static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
 
 }
 
+/*
+ * Copy the writable VMCS shadow fields back to the VMCS12, in case
+ * they have been modified by the L1 guest. Note that the "read-only"
+ * VM-exit information fields are actually writable if the vCPU is
+ * configured to support "VMWRITE to any supported field in the VMCS."
+ */
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 {
-       int i;
+       const u16 *fields[] = {
+               shadow_read_write_fields,
+               shadow_read_only_fields
+       };
+       const int max_fields[] = {
+               max_shadow_read_write_fields,
+               max_shadow_read_only_fields
+       };
+       int i, q;
        unsigned long field;
        u64 field_value;
        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
-       const u16 *fields = shadow_read_write_fields;
-       const int num_fields = max_shadow_read_write_fields;
 
        preempt_disable();
 
        vmcs_load(shadow_vmcs);
 
-       for (i = 0; i < num_fields; i++) {
-               field = fields[i];
-               field_value = __vmcs_readl(field);
-               vmcs12_write_any(&vmx->vcpu, field, field_value);
+       for (q = 0; q < ARRAY_SIZE(fields); q++) {
+               for (i = 0; i < max_fields[q]; i++) {
+                       field = fields[q][i];
+                       field_value = __vmcs_readl(field);
+                       vmcs12_write_any(&vmx->vcpu, field, field_value);
+               }
+               /*
+                * Skip the VM-exit information fields if they are read-only.
+                */
+               if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
+                       break;
        }
 
        vmcs_clear(shadow_vmcs);
@@ -8029,9 +8383,9 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                if (get_vmx_mem_address(vcpu, exit_qualification,
                                vmx_instruction_info, true, &gva))
                        return 1;
-               /* _system ok, as hardware has verified cpl=0 */
-               kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
-                            &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
+               /* _system ok, nested_vmx_check_permission has verified cpl=0 */
+               kvm_write_guest_virt_system(vcpu, gva, &field_value,
+                                           (is_long_mode(vcpu) ? 8 : 4), NULL);
        }
 
        nested_vmx_succeed(vcpu);
@@ -8069,8 +8423,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                if (get_vmx_mem_address(vcpu, exit_qualification,
                                vmx_instruction_info, false, &gva))
                        return 1;
-               if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
-                          &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
+               if (kvm_read_guest_virt(vcpu, gva, &field_value,
+                                       (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
                        kvm_inject_page_fault(vcpu, &e);
                        return 1;
                }
@@ -8078,7 +8432,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 
 
        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
-       if (vmcs_field_readonly(field)) {
+       /*
+        * If the vCPU supports "VMWRITE to any supported field in the
+        * VMCS," then the "read-only" fields are actually read/write.
+        */
+       if (vmcs_field_readonly(field) &&
+           !nested_cpu_has_vmwrite_any_field(vcpu)) {
                nested_vmx_failValid(vcpu,
                        VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
                return kvm_skip_emulated_instruction(vcpu);
@@ -8189,10 +8548,10 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
        if (get_vmx_mem_address(vcpu, exit_qualification,
                        vmx_instruction_info, true, &vmcs_gva))
                return 1;
-       /* ok to use *_system, as hardware has verified cpl=0 */
-       if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
-                                (void *)&to_vmx(vcpu)->nested.current_vmptr,
-                                sizeof(u64), &e)) {
+       /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
+       if (kvm_write_guest_virt_system(vcpu, vmcs_gva,
+                                       (void *)&to_vmx(vcpu)->nested.current_vmptr,
+                                       sizeof(u64), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@ -8239,8 +8598,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
                        vmx_instruction_info, false, &gva))
                return 1;
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
-                               sizeof(operand), &e)) {
+       if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@ -8304,8 +8662,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
                        vmx_instruction_info, false, &gva))
                return 1;
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
-                               sizeof(operand), &e)) {
+       if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@ -8317,12 +8674,19 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 
        switch (type) {
        case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
-               if (is_noncanonical_address(operand.gla, vcpu)) {
+               if (!operand.vpid ||
+                   is_noncanonical_address(operand.gla, vcpu)) {
                        nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
                        return kvm_skip_emulated_instruction(vcpu);
                }
-               /* fall through */
+               if (cpu_has_vmx_invvpid_individual_addr() &&
+                   vmx->nested.vpid02) {
+                       __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
+                               vmx->nested.vpid02, operand.gla);
+               } else
+                       __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+               break;
        case VMX_VPID_EXTENT_SINGLE_CONTEXT:
        case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
                if (!operand.vpid) {
@@ -8330,15 +8694,16 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
                        return kvm_skip_emulated_instruction(vcpu);
                }
+               __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
                break;
        case VMX_VPID_EXTENT_ALL_CONTEXT:
+               __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
                break;
        default:
                WARN_ON_ONCE(1);
                return kvm_skip_emulated_instruction(vcpu);
        }
 
-       __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
        nested_vmx_succeed(vcpu);
 
        return kvm_skip_emulated_instruction(vcpu);
@@ -8842,11 +9207,13 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
        case EXIT_REASON_TPR_BELOW_THRESHOLD:
                return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
        case EXIT_REASON_APIC_ACCESS:
-               return nested_cpu_has2(vmcs12,
-                       SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
        case EXIT_REASON_APIC_WRITE:
        case EXIT_REASON_EOI_INDUCED:
-               /* apic_write and eoi_induced should exit unconditionally. */
+               /*
+                * The controls for "virtualize APIC accesses," "APIC-
+                * register virtualization," and "virtual-interrupt
+                * delivery" only come from vmcs12.
+                */
                return true;
        case EXIT_REASON_EPT_VIOLATION:
                /*
@@ -9237,6 +9604,77 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        }
 }
 
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
+ * flushing it requires reading 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information, but as all relevant affected CPUs have a 32 KiB L1D cache
+ * there is no point in doing so.
+ */
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+       int size = PAGE_SIZE << L1D_CACHE_ORDER;
+       bool always;
+
+       /*
+        * This code is only executed when:
+        * - the flush mode is 'cond'
+        * - the flush mode is 'always' and the flush MSR is not
+        *   available
+        *
+        * If the CPU has the flush MSR then clear the flush bit because
+        * 'always' mode is handled via the MSR save list.
+        *
+        * If the MSR is not available then act depending on the mitigation
+        * mode: If 'flush always', keep the flush bit set, otherwise clear
+        * it.
+        *
+        * The flush bit gets set again either from vcpu_run() or from one
+        * of the unsafe VMEXIT handlers.
+        */
+       if (static_cpu_has(X86_FEATURE_FLUSH_L1D))
+               always = false;
+       else
+               always = vmentry_l1d_flush == VMENTER_L1D_FLUSH_ALWAYS;
+
+       vcpu->arch.l1tf_flush_l1d = always;
+
+       vcpu->stat.l1d_flush++;
+
+       if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+               wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+               return;
+       }
+
+       asm volatile(
+               /* First ensure the pages are in the TLB */
+               "xorl   %%eax, %%eax\n"
+               ".Lpopulate_tlb:\n\t"
+               "movzbl (%[empty_zp], %%" _ASM_AX "), %%ecx\n\t"
+               "addl   $4096, %%eax\n\t"
+               "cmpl   %%eax, %[size]\n\t"
+               "jne    .Lpopulate_tlb\n\t"
+               "xorl   %%eax, %%eax\n\t"
+               "cpuid\n\t"
+               /* Now fill the cache */
+               "xorl   %%eax, %%eax\n"
+               ".Lfill_cache:\n"
+               "movzbl (%[empty_zp], %%" _ASM_AX "), %%ecx\n\t"
+               "addl   $64, %%eax\n\t"
+               "cmpl   %%eax, %[size]\n\t"
+               "jne    .Lfill_cache\n\t"
+               "lfence\n"
+               :: [empty_zp] "r" (vmx_l1d_flush_pages),
+                   [size] "r" (size)
+               : "eax", "ebx", "ecx", "edx");
+}
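The buffer read here is PAGE_SIZE << L1D_CACHE_ORDER = 4096 << 4 = 64 KiB, allocated in vmx_setup_l1d_flush() below. The first loop touches one byte in every 4 KiB page so all pages are present in the TLB, cpuid serializes, and the second loop reads one byte per 64-byte cache line so the whole buffer passes through the L1D and displaces whatever was cached; the trailing lfence orders the loads (cpuid is also why ebx and edx appear in the clobber list). A rough C paraphrase, illustrative only since the real code must stay in inline asm so the compiler cannot elide or reorder the loads:

    /* Illustrative paraphrase of the asm above, not a drop-in replacement. */
    static void l1d_flush_sw(const void *buf, int size)
    {
            const volatile unsigned char *p = buf;
            int i;

            /* Populate the TLB: touch one byte in every 4 KiB page. */
            for (i = 0; i < size; i += 4096)
                    (void)p[i];

            /* (The asm serializes with cpuid at this point.) */

            /* Fill the cache: read one byte per 64-byte cache line. */
            for (i = 0; i < size; i += 64)
                    (void)p[i];

            /* (The asm ends with lfence to order the loads.) */
    }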
+
 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -9253,31 +9691,43 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
        vmcs_write32(TPR_THRESHOLD, irr);
 }
 
-static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
 {
        u32 sec_exec_control;
 
+       if (!lapic_in_kernel(vcpu))
+               return;
+
        /* Postpone execution until vmcs01 is the current VMCS. */
        if (is_guest_mode(vcpu)) {
-               to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
+               to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
                return;
        }
 
-       if (!cpu_has_vmx_virtualize_x2apic_mode())
-               return;
-
        if (!cpu_need_tpr_shadow(vcpu))
                return;
 
        sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+       sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+                             SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 
-       if (set) {
-               sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-               sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
-       } else {
-               sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
-               sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-               vmx_flush_tlb(vcpu, true);
+       switch (kvm_get_apic_mode(vcpu)) {
+       case LAPIC_MODE_INVALID:
+               WARN_ONCE(true, "Invalid local APIC state");
+       case LAPIC_MODE_DISABLED:
+               break;
+       case LAPIC_MODE_XAPIC:
+               if (flexpriority_enabled) {
+                       sec_exec_control |=
+                               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+                       vmx_flush_tlb(vcpu, true);
+               }
+               break;
+       case LAPIC_MODE_X2APIC:
+               if (cpu_has_vmx_virtualize_x2apic_mode())
+                       sec_exec_control |=
+                               SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+               break;
        }
        vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
 
@@ -9286,24 +9736,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 
 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       /*
-        * Currently we do not handle the nested case where L2 has an
-        * APIC access page of its own; that page is still pinned.
-        * Hence, we skip the case where the VCPU is in guest mode _and_
-        * L1 prepared an APIC access page for L2.
-        *
-        * For the case where L1 and L2 share the same APIC access page
-        * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear
-        * in the vmcs12), this function will only update either the vmcs01
-        * or the vmcs02.  If the former, the vmcs02 will be updated by
-        * prepare_vmcs02.  If the latter, the vmcs01 will be updated in
-        * the next L2->L1 exit.
-        */
-       if (!is_guest_mode(vcpu) ||
-           !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
-                            SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+       if (!is_guest_mode(vcpu)) {
                vmcs_write64(APIC_ACCESS_ADDR, hpa);
                vmx_flush_tlb(vcpu, true);
        }
@@ -9481,6 +9914,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                        [ss]"i"(__KERNEL_DS),
                        [cs]"i"(__KERNEL_CS)
                        );
+               vcpu->arch.l1tf_flush_l1d = true;
        }
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
@@ -9643,7 +10077,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
                        clear_atomic_switch_msr(vmx, msrs[i].msr);
                else
                        add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
-                                       msrs[i].host);
+                                       msrs[i].host, false);
 }
 
 static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
@@ -9738,6 +10172,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
                (unsigned long)&current_evmcs->host_rsp : 0;
 
+       if (static_branch_unlikely(&vmx_l1d_should_flush)) {
+               if (vcpu->arch.l1tf_flush_l1d)
+                       vmx_l1d_flush(vcpu);
+       }
+
        asm(
                /* Store host registers */
                "push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -9943,13 +10382,13 @@ STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
 
 static struct kvm *vmx_vm_alloc(void)
 {
-       struct kvm_vmx *kvm_vmx = kzalloc(sizeof(struct kvm_vmx), GFP_KERNEL);
+       struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
        return &kvm_vmx->kvm;
 }
 
 static void vmx_vm_free(struct kvm *kvm)
 {
-       kfree(to_kvm_vmx(kvm));
+       vfree(to_kvm_vmx(kvm));
 }
 
 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
@@ -10100,10 +10539,20 @@ free_vcpu:
        return ERR_PTR(err);
 }
 
+#define L1TF_MSG "SMT enabled with L1TF CPU bug present. Refer to CVE-2018-3620 for details.\n"
+
 static int vmx_vm_init(struct kvm *kvm)
 {
        if (!ple_gap)
                kvm->arch.pause_in_guest = true;
+
+       if (boot_cpu_has(X86_BUG_L1TF) && cpu_smt_control == CPU_SMT_ENABLED) {
+               if (nosmt) {
+                       pr_err(L1TF_MSG);
+                       return -EOPNOTSUPP;
+               }
+               pr_warn(L1TF_MSG);
+       }
        return 0;
 }
 
@@ -10387,11 +10836,6 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                        vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
                                        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
                }
-       } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
-                  cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
-               vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
-                             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
-               kvm_vcpu_reload_apic_access_page(vcpu);
        }
 
        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
@@ -10871,8 +11315,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
        return 0;
 }
 
-static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-                              bool from_vmentry)
+static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -10963,10 +11406,10 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
         * Set the MSR load/store lists to match L0's settings.
         */
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
 
        set_cr4_guest_host_mask(vmx);
 
@@ -11006,13 +11449,13 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
  * is assigned to entry_failure_code on failure.
  */
 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-                         bool from_vmentry, u32 *entry_failure_code)
+                         u32 *entry_failure_code)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exec_control, vmcs12_exec_ctrl;
 
        if (vmx->nested.dirty_vmcs12) {
-               prepare_vmcs02_full(vcpu, vmcs12, from_vmentry);
+               prepare_vmcs02_full(vcpu, vmcs12);
                vmx->nested.dirty_vmcs12 = false;
        }
 
@@ -11032,7 +11475,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
         * HOST_FS_BASE, HOST_GS_BASE.
         */
 
-       if (from_vmentry &&
+       if (vmx->nested.nested_run_pending &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
                kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
                vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
@@ -11040,7 +11483,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
                vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
        }
-       if (from_vmentry) {
+       if (vmx->nested.nested_run_pending) {
                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                             vmcs12->vm_entry_intr_info_field);
                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
@@ -11172,7 +11615,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                        ~VM_ENTRY_IA32E_MODE) |
                (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
 
-       if (from_vmentry &&
+       if (vmx->nested.nested_run_pending &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
                vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
                vcpu->arch.pat = vmcs12->guest_ia32_pat;
@@ -11197,7 +11640,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
                        if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
                                vmx->nested.last_vpid = vmcs12->virtual_processor_id;
-                               __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true);
+                               __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
                        }
                } else {
                        vmx_flush_tlb(vcpu, true);
@@ -11240,7 +11683,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmx_set_cr4(vcpu, vmcs12->guest_cr4);
        vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
 
-       if (from_vmentry &&
+       if (vmx->nested.nested_run_pending &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
                vcpu->arch.efer = vmcs12->guest_ia32_efer;
        else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -11418,7 +11861,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        return 0;
 }
 
-static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
+static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -11438,7 +11881,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
                vcpu->arch.tsc_offset += vmcs12->tsc_offset;
 
        r = EXIT_REASON_INVALID_STATE;
-       if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual))
+       if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
                goto fail;
 
        nested_get_vmcs12_pages(vcpu, vmcs12);
@@ -11540,20 +11983,25 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         * the nested entry.
         */
 
-       ret = enter_vmx_non_root_mode(vcpu, true);
-       if (ret)
+       vmx->nested.nested_run_pending = 1;
+       ret = enter_vmx_non_root_mode(vcpu);
+       if (ret) {
+               vmx->nested.nested_run_pending = 0;
                return ret;
+       }
+
+       /* Hide L1D cache contents from the nested guest.  */
+       vmx->vcpu.arch.l1tf_flush_l1d = true;
 
        /*
         * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
         * by event injection, halt vcpu.
         */
        if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
-           !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK))
+           !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) {
+               vmx->nested.nested_run_pending = 0;
                return kvm_vcpu_halt(vcpu);
-
-       vmx->nested.nested_run_pending = 1;
-
+       }
        return 1;
 
 out:
@@ -11925,12 +12373,20 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 
        load_vmcs12_mmu_host_state(vcpu, vmcs12);
 
-       if (enable_vpid) {
-               /*
-                * Trivially support vpid by letting L2s share their parent
-                * L1's vpid. TODO: move to a more elaborate solution, giving
-                * each L2 its own vpid and exposing the vpid feature to L1.
-                */
+       /*
+        * If vmcs01 doesn't use VPID, the CPU flushes the TLB on every
+        * VMEntry/VMExit. Thus, there is no need to flush the TLB.
+        *
+        * If vmcs12 uses VPID, TLB entries populated by L2 are
+        * tagged with vmx->nested.vpid02 while L1 entries are tagged
+        * with vmx->vpid. Thus, there is no need to flush the TLB.
+        *
+        * Therefore, flush the TLB only if vmcs01 uses VPID and
+        * vmcs12 doesn't, since in that case the L1 and L2 TLB entries
+        * are both tagged with vmx->vpid.
+        */
+       if (enable_vpid &&
+           !(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02)) {
                vmx_flush_tlb(vcpu, true);
        }
 
@@ -12057,8 +12513,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        vmx_segment_cache_clear(vmx);
 
        /* Update any VMCS fields that might have changed while L2 ran */
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
        if (vmx->hv_deadline_tsc == -1)
                vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
@@ -12069,10 +12525,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        if (kvm_has_tsc_control)
                decache_tsc_multiplier(vmx);
 
-       if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
-               vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
-               vmx_set_virtual_x2apic_mode(vcpu,
-                               vcpu->arch.apic_base & X2APIC_ENABLE);
+       if (vmx->nested.change_vmcs01_virtual_apic_mode) {
+               vmx->nested.change_vmcs01_virtual_apic_mode = false;
+               vmx_set_virtual_apic_mode(vcpu);
        } else if (!nested_cpu_has_ept(vmcs12) &&
                   nested_cpu_has2(vmcs12,
                                   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -12236,7 +12691,7 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
 {
        struct vcpu_vmx *vmx;
-       u64 tscl, guest_tscl, delta_tsc;
+       u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
 
        if (kvm_mwait_in_guest(vcpu->kvm))
                return -EOPNOTSUPP;
@@ -12245,6 +12700,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
        tscl = rdtsc();
        guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
        delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+       lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
+
+       if (delta_tsc > lapic_timer_advance_cycles)
+               delta_tsc -= lapic_timer_advance_cycles;
+       else
+               delta_tsc = 0;
 
        /* Convert to host delta tsc if tsc scaling is enabled */
        if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
@@ -12615,7 +13076,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
 
        if (vmx->nested.smm.guest_mode) {
                vcpu->arch.hflags &= ~HF_SMM_MASK;
-               ret = enter_vmx_non_root_mode(vcpu, false);
+               ret = enter_vmx_non_root_mode(vcpu);
                vcpu->arch.hflags |= HF_SMM_MASK;
                if (ret)
                        return ret;
@@ -12700,7 +13161,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .enable_nmi_window = enable_nmi_window,
        .enable_irq_window = enable_irq_window,
        .update_cr8_intercept = update_cr8_intercept,
-       .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
+       .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
        .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
        .get_enable_apicv = vmx_get_enable_apicv,
        .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
@@ -12770,6 +13231,34 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .enable_smi_window = enable_smi_window,
 };
 
+static int __init vmx_setup_l1d_flush(void)
+{
+       struct page *page;
+
+       if (vmentry_l1d_flush == VMENTER_L1D_FLUSH_NEVER ||
+           !boot_cpu_has_bug(X86_BUG_L1TF) ||
+           vmx_l1d_use_msr_save_list())
+               return 0;
+
+       if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+               page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+               if (!page)
+                       return -ENOMEM;
+               vmx_l1d_flush_pages = page_address(page);
+       }
+
+       static_branch_enable(&vmx_l1d_should_flush);
+       return 0;
+}
+
+static void vmx_free_l1d_flush_pages(void)
+{
+       if (vmx_l1d_flush_pages) {
+               free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+               vmx_l1d_flush_pages = NULL;
+       }
+}
+
 static int __init vmx_init(void)
 {
        int r;
@@ -12803,15 +13292,22 @@ static int __init vmx_init(void)
        }
 #endif
 
-       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
-                     __alignof__(struct vcpu_vmx), THIS_MODULE);
+       r = vmx_setup_l1d_flush();
        if (r)
                return r;
 
+       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+                    __alignof__(struct vcpu_vmx), THIS_MODULE);
+       if (r) {
+               vmx_free_l1d_flush_pages();
+               return r;
+       }
+
 #ifdef CONFIG_KEXEC_CORE
        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                           crash_vmclear_local_loaded_vmcss);
 #endif
+       vmx_check_vmcs12_offsets();
 
        return 0;
 }
@@ -12847,6 +13343,7 @@ static void __exit vmx_exit(void)
                static_branch_disable(&enable_evmcs);
        }
 #endif
+       vmx_free_l1d_flush_pages();
 }
 
 module_init(vmx_init)