KVM/VMX: Invoke NMI non-IST entry instead of IST entry
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index bcbf0d2..b21d751 100644
@@ -36,6 +36,7 @@
 #include <asm/debugreg.h>
 #include <asm/desc.h>
 #include <asm/fpu/internal.h>
+#include <asm/idtentry.h>
 #include <asm/io.h>
 #include <asm/irq_remapping.h>
 #include <asm/kexec.h>
@@ -57,6 +58,7 @@
 #include "mmu.h"
 #include "nested.h"
 #include "pmu.h"
+#include "sgx.h"
 #include "trace.h"
 #include "vmcs.h"
 #include "vmcs12.h"
@@ -156,9 +158,11 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
        MSR_IA32_SPEC_CTRL,
        MSR_IA32_PRED_CMD,
        MSR_IA32_TSC,
+#ifdef CONFIG_X86_64
        MSR_FS_BASE,
        MSR_GS_BASE,
        MSR_KERNEL_GS_BASE,
+#endif
        MSR_IA32_SYSENTER_CS,
        MSR_IA32_SYSENTER_ESP,
        MSR_IA32_SYSENTER_EIP,
@@ -361,8 +365,6 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = {
 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
-                                                         u32 msr, int type);
 
 void vmx_vmexit(void);
 
@@ -472,26 +474,6 @@ static const u32 vmx_uret_msrs_list[] = {
 static bool __read_mostly enlightened_vmcs = true;
 module_param(enlightened_vmcs, bool, 0444);
 
-/* check_ept_pointer() should be under protection of ept_pointer_lock. */
-static void check_ept_pointer_match(struct kvm *kvm)
-{
-       struct kvm_vcpu *vcpu;
-       u64 tmp_eptp = INVALID_PAGE;
-       int i;
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (!VALID_PAGE(tmp_eptp)) {
-                       tmp_eptp = to_vmx(vcpu)->ept_pointer;
-               } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
-                       to_kvm_vmx(kvm)->ept_pointers_match
-                               = EPT_POINTERS_MISMATCH;
-                       return;
-               }
-       }
-
-       to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
-}
-
 static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
                void *data)
 {
@@ -501,47 +483,70 @@ static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush
                        range->pages);
 }
 
-static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
-               struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
+static inline int hv_remote_flush_root_ept(hpa_t root_ept,
+                                          struct kvm_tlb_range *range)
 {
-       u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
-
-       /*
-        * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
-        * of the base of EPT PML4 table, strip off EPT configuration
-        * information.
-        */
        if (range)
-               return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
+               return hyperv_flush_guest_mapping_range(root_ept,
                                kvm_fill_hv_flush_list_func, (void *)range);
        else
-               return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
+               return hyperv_flush_guest_mapping(root_ept);
 }
 
 static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
                struct kvm_tlb_range *range)
 {
+       struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
        struct kvm_vcpu *vcpu;
-       int ret = 0, i;
+       int ret = 0, i, nr_unique_valid_roots;
+       hpa_t root;
 
-       spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+       spin_lock(&kvm_vmx->hv_root_ept_lock);
 
-       if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
-               check_ept_pointer_match(kvm);
+       if (!VALID_PAGE(kvm_vmx->hv_root_ept)) {
+               nr_unique_valid_roots = 0;
 
-       if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+               /*
+                * Flush all valid roots, and see if all vCPUs have converged
+                * on a common root, in which case future flushes can skip the
+                * loop and flush the common root.
+                */
                kvm_for_each_vcpu(i, vcpu, kvm) {
-                       /* If ept_pointer is invalid pointer, bypass flush request. */
-                       if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
-                               ret |= __hv_remote_flush_tlb_with_range(
-                                       kvm, vcpu, range);
+                       root = to_vmx(vcpu)->hv_root_ept;
+                       if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept)
+                               continue;
+
+                       /*
+                        * Set the tracked root to the first valid root.  Keep
+                        * this root for the entirety of the loop even if more
+                        * roots are encountered as a low effort optimization
+                        * to avoid flushing the same (first) root again.
+                        */
+                       if (++nr_unique_valid_roots == 1)
+                               kvm_vmx->hv_root_ept = root;
+
+                       if (!ret)
+                               ret = hv_remote_flush_root_ept(root, range);
+
+                       /*
+                        * Stop processing roots if a failure occurred and
+                        * multiple valid roots have already been detected.
+                        */
+                       if (ret && nr_unique_valid_roots > 1)
+                               break;
                }
+
+               /*
+                * The optimized flush of a single root can't be used if there
+                * are multiple valid roots (obviously).
+                */
+               if (nr_unique_valid_roots > 1)
+                       kvm_vmx->hv_root_ept = INVALID_PAGE;
        } else {
-               ret = __hv_remote_flush_tlb_with_range(kvm,
-                               kvm_get_vcpu(kvm, 0), range);
+               ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range);
        }
 
-       spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+       spin_unlock(&kvm_vmx->hv_root_ept_lock);
        return ret;
 }
 static int hv_remote_flush_tlb(struct kvm *kvm)
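
The loop above converges the per-vCPU EPT roots so that later flushes can hit a single common root. Below is a minimal standalone C model of that convergence (a user-space sketch, not kernel code); hpa_t, INVALID_PAGE, VALID_PAGE and the model_* helpers are stand-ins for the KVM definitions and the Hyper-V flush hypercall.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t hpa_t;
#define INVALID_PAGE   (~(hpa_t)0)
#define VALID_PAGE(x)  ((x) != INVALID_PAGE)

static void model_flush_root(hpa_t root)        /* stands in for the hypercall */
{
        printf("flush root 0x%llx\n", (unsigned long long)root);
}

/* Flush all valid roots; return the common root, or INVALID_PAGE if they differ. */
static hpa_t model_remote_flush(const hpa_t *vcpu_root, int nr_vcpus)
{
        hpa_t tracked = INVALID_PAGE;
        int nr_diverging = 0;
        int i;

        for (i = 0; i < nr_vcpus; i++) {
                hpa_t root = vcpu_root[i];

                if (!VALID_PAGE(root) || root == tracked)
                        continue;

                /* Track the first valid root as the convergence candidate. */
                if (++nr_diverging == 1)
                        tracked = root;

                model_flush_root(root);
        }

        return nr_diverging > 1 ? INVALID_PAGE : tracked;
}

int main(void)
{
        hpa_t roots[] = { 0x1000, 0x1000, INVALID_PAGE, 0x2000 };
        hpa_t common = model_remote_flush(roots, 4);

        printf("common root: 0x%llx\n", (unsigned long long)common);
        return 0;
}
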
@@ -559,7 +564,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
         * evmcs in single VM shares same assist page.
         */
        if (!*p_hv_pa_pg)
-               *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+               *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
 
        if (!*p_hv_pa_pg)
                return -ENOMEM;
@@ -576,6 +581,21 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
 
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
+static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept)
+{
+#if IS_ENABLED(CONFIG_HYPERV)
+       struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+
+       if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+               spin_lock(&kvm_vmx->hv_root_ept_lock);
+               to_vmx(vcpu)->hv_root_ept = root_ept;
+               if (root_ept != kvm_vmx->hv_root_ept)
+                       kvm_vmx->hv_root_ept = INVALID_PAGE;
+               spin_unlock(&kvm_vmx->hv_root_ept_lock);
+       }
+#endif
+}
+
 /*
  * Comment's format: document - errata name - stepping - processor name.
  * Refer from
@@ -1570,12 +1590,25 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
 
 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
 {
+       /*
+        * Emulation of instructions in SGX enclaves is impossible as RIP does
+        * not point at the failing instruction, and even if it did, the code
+        * stream is inaccessible.  Inject #UD instead of exiting to userspace
+        * so that guest userspace can't DoS the guest simply by triggering
+        * emulation (enclaves are CPL3 only).
+        */
+       if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return false;
+       }
        return true;
 }
 
 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
+       union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
        unsigned long rip, orig_rip;
+       u32 instr_len;
 
        /*
         * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
@@ -1586,9 +1619,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
         * i.e. we end up advancing IP with some random value.
         */
        if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
-           to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+           exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+               instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+               /*
+                * Emulating an enclave's instructions isn't supported as KVM
+                * cannot access the enclave's memory or its true RIP, e.g. the
+                * vmcs.GUEST_RIP points at the exit point of the enclave, not
+                * the RIP that actually triggered the VM-Exit.  But, because
+                * most instructions that cause VM-Exit will #UD in an enclave,
+                * most instruction-based VM-Exits simply do not occur.
+                *
+                * There are a few exceptions, notably the debug instructions
+                * INT1ICEBRK and INT3, as they are allowed in debug enclaves
+                * and generate #DB/#BP as expected, which KVM might intercept.
+                * But again, the CPU does the dirty work and saves an instr
+                * length of zero so VMMs don't shoot themselves in the foot.
+                * WARN if KVM tries to skip a non-zero length instruction on
+                * a VM-Exit from an enclave.
+                */
+               if (!instr_len)
+                       goto rip_updated;
+
+               WARN(exit_reason.enclave_mode,
+                    "KVM: skipping instruction after SGX enclave VM-Exit");
+
                orig_rip = kvm_rip_read(vcpu);
-               rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+               rip = orig_rip + instr_len;
 #ifdef CONFIG_X86_64
                /*
                 * We need to mask out the high 32 bits of RIP if not in 64-bit
@@ -1604,6 +1661,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
                        return 0;
        }
 
+rip_updated:
        /* skipping an emulated instruction also counts */
        vmx_set_interrupt_shadow(vcpu, 0);
 
@@ -1865,6 +1923,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_FEAT_CTL:
                msr_info->data = vmx->msr_ia32_feature_control;
                break;
+       case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+                       return 1;
+               msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+                       [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+               break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!nested_vmx_allowed(vcpu))
                        return 1;
@@ -2158,6 +2223,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vmx->msr_ia32_feature_control = data;
                if (msr_info->host_initiated && data == 0)
                        vmx_leave_nested(vcpu);
+
+               /* SGX may be enabled/disabled by guest's firmware */
+               vmx_write_encls_bitmap(vcpu, NULL);
+               break;
+       case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+               /*
+                * On real hardware, the LE hash MSRs are writable before
+                * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
+                * at which point SGX related bits in IA32_FEATURE_CONTROL
+                * become writable.
+                *
+                * KVM does not emulate SGX activation for simplicity, so
+                * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
+                * is unlocked.  This is technically not architectural
+                * behavior, but it's close enough.
+                */
+               if (!msr_info->host_initiated &&
+                   (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+                   ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
+                   !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
+                       return 1;
+               vmx->msr_ia32_sgxlepubkeyhash
+                       [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
                break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!msr_info->host_initiated)
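
A standalone sketch (not kernel code) of the write-permission check above for the LE hash MSRs; the FEAT_CTL_* bit positions are assumed to mirror arch/x86/include/asm/msr-index.h, and lepubkeyhash_writable is a hypothetical helper used only for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FEAT_CTL_LOCKED           (1ULL << 0)
#define FEAT_CTL_SGX_LC_ENABLED   (1ULL << 17)

/* Host writes always pass; guest writes need SGX_LC in CPUID and either an
 * unlocked IA32_FEATURE_CONTROL or SGX_LC enabled in the locked value. */
static bool lepubkeyhash_writable(bool host_initiated, bool guest_has_sgx_lc,
                                  uint64_t feature_control)
{
        if (host_initiated)
                return true;
        if (!guest_has_sgx_lc)
                return false;
        return !(feature_control & FEAT_CTL_LOCKED) ||
               (feature_control & FEAT_CTL_SGX_LC_ENABLED);
}

int main(void)
{
        /* Locked without SGX_LC: guest write rejected (prints 0). */
        printf("%d\n", lepubkeyhash_writable(false, true, FEAT_CTL_LOCKED));
        /* Locked with SGX_LC enabled: guest write allowed (prints 1). */
        printf("%d\n", lepubkeyhash_writable(false, true,
                       FEAT_CTL_LOCKED | FEAT_CTL_SGX_LC_ENABLED));
        return 0;
}
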
@@ -3088,8 +3176,7 @@ static int vmx_get_max_tdp_level(void)
        return 4;
 }
 
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
-                  int root_level)
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
 {
        u64 eptp = VMX_EPTP_MT_WB;
 
@@ -3098,13 +3185,13 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
        if (enable_ept_ad_bits &&
            (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
                eptp |= VMX_EPTP_AD_ENABLE_BIT;
-       eptp |= (root_hpa & PAGE_MASK);
+       eptp |= root_hpa;
 
        return eptp;
 }
 
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
-                            int pgd_level)
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+                            int root_level)
 {
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
@@ -3112,16 +3199,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
        u64 eptp;
 
        if (enable_ept) {
-               eptp = construct_eptp(vcpu, pgd, pgd_level);
+               eptp = construct_eptp(vcpu, root_hpa, root_level);
                vmcs_write64(EPT_POINTER, eptp);
 
-               if (kvm_x86_ops.tlb_remote_flush) {
-                       spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-                       to_vmx(vcpu)->ept_pointer = eptp;
-                       to_kvm_vmx(kvm)->ept_pointers_match
-                               = EPT_POINTERS_CHECK;
-                       spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-               }
+               hv_track_root_ept(vcpu, root_hpa);
 
                if (!enable_unrestricted_guest && !is_paging(vcpu))
                        guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
@@ -3131,7 +3212,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
                        update_guest_cr3 = false;
                vmx_ept_load_pdptrs(vcpu);
        } else {
-               guest_cr3 = pgd;
+               guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
        }
 
        if (update_guest_cr3)
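
For reference, a worked user-space example (not kernel code) of the EPTP value construct_eptp() produces: bits 2:0 hold the memory type (6 = write-back), bits 5:3 the page-walk length minus one, bit 6 enables accessed/dirty bits, and the upper bits are the root HPA, which is already page aligned (hence the dropped PAGE_MASK above). The constants are assumed to mirror arch/x86/include/asm/vmx.h.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VMX_EPTP_MT_WB         0x6ULL
#define VMX_EPTP_PWL_4         0x18ULL
#define VMX_EPTP_PWL_5         0x20ULL
#define VMX_EPTP_AD_ENABLE_BIT (1ULL << 6)

static uint64_t model_construct_eptp(uint64_t root_hpa, int root_level, bool ad_bits)
{
        uint64_t eptp = VMX_EPTP_MT_WB;

        eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
        if (ad_bits)
                eptp |= VMX_EPTP_AD_ENABLE_BIT;

        return eptp | root_hpa;         /* root_hpa is already page aligned */
}

int main(void)
{
        /* 4-level EPT, A/D bits on, root at 0x12345000 -> 0x1234505e */
        printf("EPTP = 0x%llx\n",
               (unsigned long long)model_construct_eptp(0x12345000ULL, 4, true));
        return 0;
}
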
@@ -3738,8 +3819,7 @@ static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
                __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
 }
 
-static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
-                                                         u32 msr, int type)
+void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
@@ -3784,8 +3864,7 @@ static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
                vmx_clear_msr_bitmap_write(msr_bitmap, msr);
 }
 
-static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
-                                                        u32 msr, int type)
+void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
@@ -3818,15 +3897,6 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
                vmx_set_msr_bitmap_write(msr_bitmap, msr);
 }
 
-void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
-                                                     u32 msr, int type, bool value)
-{
-       if (value)
-               vmx_enable_intercept_for_msr(vcpu, msr, type);
-       else
-               vmx_disable_intercept_for_msr(vcpu, msr, type);
-}
-
 static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
 {
        u8 mode = 0;
@@ -4314,15 +4384,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        vmx->secondary_exec_control = exec_control;
 }
 
-static void ept_set_mmio_spte_mask(void)
-{
-       /*
-        * EPT Misconfigurations can be generated if the value of bits 2:0
-        * of an EPT paging-structure entry is 110b (write/execute).
-        */
-       kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
-}
-
 #define VMX_XSS_EXIT_BITMAP 0
 
 /*
@@ -4410,8 +4471,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }
 
-       if (cpu_has_vmx_encls_vmexit())
-               vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+       vmx_write_encls_bitmap(&vmx->vcpu, NULL);
 
        if (vmx_pt_mode_is_host_guest()) {
                memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
@@ -5020,7 +5080,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
        reg = (exit_qualification >> 8) & 15;
        switch ((exit_qualification >> 4) & 3) {
        case 0: /* mov to cr */
-               val = kvm_register_readl(vcpu, reg);
+               val = kvm_register_read(vcpu, reg);
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
@@ -5143,7 +5203,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                kvm_register_write(vcpu, reg, val);
                err = 0;
        } else {
-               err = kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg));
+               err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
        }
 
 out:
@@ -5184,17 +5244,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-static int handle_vmcall(struct kvm_vcpu *vcpu)
-{
-       return kvm_emulate_hypercall(vcpu);
-}
-
-static int handle_invd(struct kvm_vcpu *vcpu)
-{
-       /* Treat an INVD instruction as a NOP and just skip it. */
-       return kvm_skip_emulated_instruction(vcpu);
-}
-
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
@@ -5203,28 +5252,6 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int handle_rdpmc(struct kvm_vcpu *vcpu)
-{
-       int err;
-
-       err = kvm_rdpmc(vcpu);
-       return kvm_complete_insn_gp(vcpu, err);
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu)
-{
-       return kvm_emulate_wbinvd(vcpu);
-}
-
-static int handle_xsetbv(struct kvm_vcpu *vcpu)
-{
-       u64 new_bv = kvm_read_edx_eax(vcpu);
-       u32 index = kvm_rcx_read(vcpu);
-
-       int err = kvm_set_xcr(vcpu, index, new_bv);
-       return kvm_complete_insn_gp(vcpu, err);
-}
-
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
        if (likely(fasteoi)) {
@@ -5361,7 +5388,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
                        EPT_VIOLATION_EXECUTABLE))
                      ? PFERR_PRESENT_MASK : 0;
 
-       error_code |= (exit_qualification & 0x100) != 0 ?
+       error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
               PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
 
        vcpu->arch.exit_qualification = exit_qualification;
@@ -5384,6 +5411,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
        gpa_t gpa;
 
+       if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+               return 1;
+
        /*
         * A nested guest cannot optimize MMIO vmexits, because we have an
         * nGPA here instead of the required GPA.
@@ -5485,18 +5515,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
        }
 }
 
-static void vmx_enable_tdp(void)
-{
-       kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
-               enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
-               enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
-               0ull, VMX_EPT_EXECUTABLE_MASK,
-               cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
-               VMX_EPT_RWX_MASK, 0ull);
-
-       ept_set_mmio_spte_mask();
-}
-
 /*
  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
  * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -5516,34 +5534,11 @@ static int handle_pause(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int handle_nop(struct kvm_vcpu *vcpu)
-{
-       return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_mwait(struct kvm_vcpu *vcpu)
-{
-       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
-       return handle_nop(vcpu);
-}
-
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
-{
-       kvm_queue_exception(vcpu, UD_VECTOR);
-       return 1;
-}
-
 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
 {
        return 1;
 }
 
-static int handle_monitor(struct kvm_vcpu *vcpu)
-{
-       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
-       return handle_nop(vcpu);
-}
-
 static int handle_invpcid(struct kvm_vcpu *vcpu)
 {
        u32 vmx_instruction_info;
@@ -5560,7 +5555,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
        }
 
        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-       type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+       type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
 
        if (type > 3) {
                kvm_inject_gp(vcpu, 0);
@@ -5632,16 +5627,18 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+#ifndef CONFIG_X86_SGX_KVM
 static int handle_encls(struct kvm_vcpu *vcpu)
 {
        /*
-        * SGX virtualization is not yet supported.  There is no software
-        * enable bit for SGX, so we have to trap ENCLS and inject a #UD
-        * to prevent the guest from executing ENCLS.
+        * SGX virtualization is disabled.  There is no software enable bit for
+        * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
+        * the guest from executing ENCLS (when SGX is supported by hardware).
         */
        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
 }
+#endif /* CONFIG_X86_SGX_KVM */
 
 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
 {
@@ -5668,10 +5665,10 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
        [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
        [EXIT_REASON_HLT]                     = kvm_emulate_halt,
-       [EXIT_REASON_INVD]                    = handle_invd,
+       [EXIT_REASON_INVD]                    = kvm_emulate_invd,
        [EXIT_REASON_INVLPG]                  = handle_invlpg,
-       [EXIT_REASON_RDPMC]                   = handle_rdpmc,
-       [EXIT_REASON_VMCALL]                  = handle_vmcall,
+       [EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
+       [EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
        [EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
        [EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
        [EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
@@ -5685,8 +5682,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
        [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
        [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
-       [EXIT_REASON_WBINVD]                  = handle_wbinvd,
-       [EXIT_REASON_XSETBV]                  = handle_xsetbv,
+       [EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
+       [EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
        [EXIT_REASON_GDTR_IDTR]               = handle_desc,
@@ -5694,13 +5691,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
-       [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_mwait,
+       [EXIT_REASON_MWAIT_INSTRUCTION]       = kvm_emulate_mwait,
        [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
-       [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
+       [EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
        [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
        [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
-       [EXIT_REASON_RDRAND]                  = handle_invalid_op,
-       [EXIT_REASON_RDSEED]                  = handle_invalid_op,
+       [EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
+       [EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
        [EXIT_REASON_INVPCID]                 = handle_invpcid,
        [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
@@ -5787,12 +5784,23 @@ static void vmx_dump_dtsel(char *name, uint32_t limit)
               vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
 }
 
-void dump_vmcs(void)
+static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
 {
+       unsigned int i;
+       struct vmx_msr_entry *e;
+
+       pr_err("MSR %s:\n", name);
+       for (i = 0, e = m->val; i < m->nr; ++i, ++e)
+               pr_err("  %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
+}
+
+void dump_vmcs(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 vmentry_ctl, vmexit_ctl;
        u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
        unsigned long cr4;
-       u64 efer;
+       int efer_slot;
 
        if (!dump_invalid_vmcs) {
                pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
@@ -5804,7 +5812,6 @@ void dump_vmcs(void)
        cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
        cr4 = vmcs_readl(GUEST_CR4);
-       efer = vmcs_read64(GUEST_IA32_EFER);
        secondary_exec_control = 0;
        if (cpu_has_secondary_exec_ctrls())
                secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -5816,9 +5823,7 @@ void dump_vmcs(void)
        pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
               cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
        pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
-       if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
-           (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
-       {
+       if (cpu_has_vmx_ept()) {
                pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
                       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
                pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
@@ -5841,10 +5846,20 @@ void dump_vmcs(void)
        vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
        vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
        vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
-       if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
-           (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
-               pr_err("EFER =     0x%016llx  PAT = 0x%016llx\n",
-                      efer, vmcs_read64(GUEST_IA32_PAT));
+       efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
+       if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
+               pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
+       else if (efer_slot >= 0)
+               pr_err("EFER= 0x%016llx (autoload)\n",
+                      vmx->msr_autoload.guest.val[efer_slot].value);
+       else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
+               pr_err("EFER= 0x%016llx (effective)\n",
+                      vcpu->arch.efer | (EFER_LMA | EFER_LME));
+       else
+               pr_err("EFER= 0x%016llx (effective)\n",
+                      vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
+       if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
+               pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
        pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
               vmcs_read64(GUEST_IA32_DEBUGCTL),
               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
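
The EFER reporting added above follows a fixed priority: the dedicated VMCS field, then the MSR autoload slot, then an "effective" value derived from the vCPU's EFER and the IA32E-mode entry control. A compact sketch (not kernel code) of that order; model_guest_efer is an illustrative helper, and the EFER bit positions are the architectural ones.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EFER_LME (1ULL << 8)
#define EFER_LMA (1ULL << 10)

static uint64_t model_guest_efer(bool load_efer_ctl, uint64_t vmcs_efer,
                                 bool have_autoload_slot, uint64_t autoload_efer,
                                 bool ia32e_mode, uint64_t vcpu_efer,
                                 const char **src)
{
        if (load_efer_ctl) {
                *src = "VMCS";
                return vmcs_efer;
        }
        if (have_autoload_slot) {
                *src = "autoload";
                return autoload_efer;
        }
        *src = "effective";
        return ia32e_mode ? vcpu_efer | (EFER_LMA | EFER_LME)
                          : vcpu_efer & ~(EFER_LMA | EFER_LME);
}

int main(void)
{
        const char *src;
        uint64_t efer = model_guest_efer(false, 0, false, 0, true, 0xd01, &src);

        printf("EFER = 0x%llx (%s)\n", (unsigned long long)efer, src);
        return 0;
}
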
@@ -5860,6 +5875,10 @@ void dump_vmcs(void)
        if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
                pr_err("InterruptStatus = %04x\n",
                       vmcs_read16(GUEST_INTR_STATUS));
+       if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
+               vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
+       if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
+               vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
 
        pr_err("*** Host State ***\n");
        pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
@@ -5881,14 +5900,16 @@ void dump_vmcs(void)
               vmcs_readl(HOST_IA32_SYSENTER_ESP),
               vmcs_read32(HOST_IA32_SYSENTER_CS),
               vmcs_readl(HOST_IA32_SYSENTER_EIP));
-       if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
-               pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
-                      vmcs_read64(HOST_IA32_EFER),
-                      vmcs_read64(HOST_IA32_PAT));
+       if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
+               pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
+       if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
+               pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
        if (cpu_has_load_perf_global_ctrl() &&
            vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
                pr_err("PerfGlobCtl = 0x%016llx\n",
                       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+       if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
+               vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
 
        pr_err("*** Control State ***\n");
        pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
@@ -5997,7 +6018,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        }
 
        if (exit_reason.failed_vmentry) {
-               dump_vmcs();
+               dump_vmcs(vcpu);
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = exit_reason.full;
@@ -6006,7 +6027,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        }
 
        if (unlikely(vmx->fail)) {
-               dump_vmcs();
+               dump_vmcs(vcpu);
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = vmcs_read32(VM_INSTRUCTION_ERROR);
@@ -6092,7 +6113,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 unexpected_vmexit:
        vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
                    exit_reason.full);
-       dump_vmcs();
+       dump_vmcs(vcpu);
        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
        vcpu->run->internal.suberror =
                        KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
@@ -6395,18 +6416,17 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 
 void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
 
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
+                                       unsigned long entry)
 {
-       unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
-       gate_desc *desc = (gate_desc *)host_idt_base + vector;
-
        kvm_before_interrupt(vcpu);
-       vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+       vmx_do_interrupt_nmi_irqoff(entry);
        kvm_after_interrupt(vcpu);
 }
 
 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
 {
+       const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
        u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
 
        /* if exit due to PF check for async PF */
@@ -6417,18 +6437,20 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
                kvm_machine_check();
        /* We need to handle NMIs before interrupts are enabled */
        else if (is_nmi(intr_info))
-               handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
+               handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
 }
 
 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 {
        u32 intr_info = vmx_get_intr_info(vcpu);
+       unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+       gate_desc *desc = (gate_desc *)host_idt_base + vector;
 
        if (WARN_ONCE(!is_external_intr(intr_info),
            "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
                return;
 
-       handle_interrupt_nmi_irqoff(vcpu, intr_info);
+       handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
 }
 
 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
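
Both paths above funnel into vmx_do_interrupt_nmi_irqoff(); what changes is how the entry address is chosen: an NMI VM-Exit is routed to the fixed non-IST entry (asm_exc_nmi_noist), while an external interrupt VM-Exit still looks up the host IDT gate for its vector. A standalone sketch (not kernel code) of that selection, using illustrative stand-in addresses.

#include <stdint.h>
#include <stdio.h>

#define NR_VECTORS              256
#define INTR_INFO_VECTOR_MASK   0xffU

static uintptr_t idt_entry[NR_VECTORS];         /* stand-in for the host IDT */
static uintptr_t nmi_noist_entry;               /* stand-in for asm_exc_nmi_noist */

static uintptr_t model_pick_entry(uint32_t intr_info, int is_nmi)
{
        if (is_nmi)
                return nmi_noist_entry;         /* fixed non-IST NMI entry */

        return idt_entry[intr_info & INTR_INFO_VECTOR_MASK];
}

int main(void)
{
        nmi_noist_entry = 0x1000;
        idt_entry[0x20] = 0x2000;

        printf("NMI   -> %#lx\n", (unsigned long)model_pick_entry(2, 1));
        printf("IRQ32 -> %#lx\n", (unsigned long)model_pick_entry(0x20, 0));
        return 0;
}
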
@@ -6938,9 +6960,11 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 
        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
        vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
@@ -6976,6 +7000,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        else
                memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
 
+       vcpu_setup_sgx_lepubkeyhash(vcpu);
+
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
 
@@ -6989,8 +7015,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        vmx->pi_desc.nv = POSTED_INTR_VECTOR;
        vmx->pi_desc.sn = 1;
 
-       vmx->ept_pointer = INVALID_PAGE;
-
+#if IS_ENABLED(CONFIG_HYPERV)
+       vmx->hv_root_ept = INVALID_PAGE;
+#endif
        return 0;
 
 free_vmcs:
@@ -7007,7 +7034,9 @@ free_vpid:
 
 static int vmx_vm_init(struct kvm *kvm)
 {
-       spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+#if IS_ENABLED(CONFIG_HYPERV)
+       spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock);
+#endif
 
        if (!ple_gap)
                kvm->arch.pause_in_guest = true;
@@ -7302,6 +7331,19 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 
        set_cr4_guest_host_mask(vmx);
 
+       vmx_write_encls_bitmap(vcpu, NULL);
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+               vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
+       else
+               vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
+
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+               vmx->msr_ia32_feature_control_valid_bits |=
+                       FEAT_CTL_SGX_LC_ENABLED;
+       else
+               vmx->msr_ia32_feature_control_valid_bits &=
+                       ~FEAT_CTL_SGX_LC_ENABLED;
+
        /* Refresh #PF interception to account for MAXPHYADDR changes. */
        vmx_update_exception_bitmap(vcpu);
 }
@@ -7322,6 +7364,13 @@ static __init void vmx_set_cpu_caps(void)
        if (vmx_pt_mode_is_host_guest())
                kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
 
+       if (!enable_sgx) {
+               kvm_cpu_cap_clear(X86_FEATURE_SGX);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX1);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+       }
+
        if (vmx_umip_emulated())
                kvm_cpu_cap_set(X86_FEATURE_UMIP);
 
@@ -7848,7 +7897,8 @@ static __init int hardware_setup(void)
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
        if (enable_ept)
-               vmx_enable_tdp();
+               kvm_mmu_set_ept_masks(enable_ept_ad_bits,
+                                     cpu_has_vmx_ept_execute_only());
 
        if (!enable_ept)
                ept_lpage_level = 0;
@@ -7909,6 +7959,8 @@ static __init int hardware_setup(void)
        if (!enable_ept || !cpu_has_vmx_intel_pt())
                pt_mode = PT_MODE_SYSTEM;
 
+       setup_default_sgx_lepubkeyhash();
+
        if (nested) {
                nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
                                           vmx_capability.ept);