Merge branch 'kvm-tdp-fix-rcu' into HEAD
author    Paolo Bonzini <pbonzini@redhat.com>
Fri, 2 Apr 2021 11:25:32 +0000 (07:25 -0400)
committer Paolo Bonzini <pbonzini@redhat.com>
Fri, 2 Apr 2021 11:25:32 +0000 (07:25 -0400)
25 files changed:
Documentation/virt/kvm/locking.rst
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/svm.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmu_audit.c
arch/x86/kvm/mmu/mmu_internal.h
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/mmu/spte.c
arch/x86/kvm/mmu/spte.h
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/mmu/tdp_mmu.h
arch/x86/kvm/svm/avic.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/svm/vmenter.S
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h

diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index 0aa4817..1fc860c 100644
@@ -38,25 +38,24 @@ the mmu-lock on x86. Currently, the page fault can be fast in one of the
 following two cases:
 
 1. Access Tracking: The SPTE is not present, but it is marked for access
-   tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to
-   restore the saved R/X bits. This is described in more detail later below.
+   tracking. That means we need to restore the saved R/X bits. This is
+   described in more detail later below.
 
-2. Write-Protection: The SPTE is present and the fault is
-   caused by write-protect. That means we just need to change the W bit of
-   the spte.
+2. Write-Protection: The SPTE is present and the fault is caused by
+   write-protect. That means we just need to change the W bit of the spte.
 
-What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and
-SPTE_MMU_WRITEABLE bit on the spte:
+What we use to avoid all the race is the Host-writable bit and MMU-writable bit
+on the spte:
 
-- SPTE_HOST_WRITEABLE means the gfn is writable on host.
-- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when
-  the gfn is writable on guest mmu and it is not write-protected by shadow
-  page write-protection.
+- Host-writable means the gfn is writable in the host kernel page tables and in
+  its KVM memslot.
+- MMU-writable means the gfn is writable in the guest's mmu and it is not
+  write-protected by shadow page write-protection.
 
 On fast page fault path, we will use cmpxchg to atomically set the spte W
-bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or
-restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This
-is safe because whenever changing these bits can be detected by cmpxchg.
+bit if spte.HOST_WRITEABLE = 1 and spte.WRITE_PROTECT = 1, or restore the saved
+R/X bits for an access-tracked spte, or both. This is safe because any change
+to these bits can be detected by cmpxchg.
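
A minimal standalone sketch of the write-protection half of the lockless update
just described, using illustrative bit positions and helper names rather than
the kernel's real masks (in the tree this is roughly what
fast_pf_fix_direct_spte() does with cmpxchg64() on the actual SPTE):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative bit positions; the real masks live in arch/x86/kvm/mmu/spte.h. */
#define SPTE_W        (1ull << 1)   /* hardware writable bit */
#define SPTE_HOST_WR  (1ull << 9)   /* "Host-writable" */
#define SPTE_MMU_WR   (1ull << 10)  /* "MMU-writable" */

/*
 * Grant W only if both writable bits were set in the snapshot, and only if
 * the SPTE has not changed since the snapshot was taken.  A concurrent
 * update (e.g. write-protection for dirty logging) makes the exchange fail,
 * and the fault falls back to the slow path under mmu_lock.
 */
bool fast_fix_w_bit(uint64_t *sptep, uint64_t old_spte)
{
        if (!(old_spte & SPTE_HOST_WR) || !(old_spte & SPTE_MMU_WR))
                return false;

        return __sync_bool_compare_and_swap(sptep, old_spte, old_spte | SPTE_W);
}
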
 
 But we need carefully check these cases:
 
@@ -185,17 +184,17 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update().
 Lockless Access Tracking:
 
 This is used for Intel CPUs that are using EPT but do not support the EPT A/D
-bits. In this case, when the KVM MMU notifier is called to track accesses to a
-page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present
-by clearing the RWX bits in the PTE and storing the original R & X bits in
-some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the
-PTE (using the ignored bit 62). When the VM tries to access the page later on,
-a fault is generated and the fast page fault mechanism described above is used
-to atomically restore the PTE to a Present state. The W bit is not saved when
-the PTE is marked for access tracking and during restoration to the Present
-state, the W bit is set depending on whether or not it was a write access. If
-it wasn't, then the W bit will remain clear until a write access happens, at
-which time it will be set using the Dirty tracking mechanism described above.
+bits. In this case, PTEs are tagged as A/D disabled (using ignored bits), and
+when the KVM MMU notifier is called to track accesses to a page (via
+kvm_mmu_notifier_clear_flush_young), it marks the PTE not-present in hardware
+by clearing the RWX bits in the PTE and storing the original R & X bits in more
+unused/ignored bits. When the VM tries to access the page later on, a fault is
+generated and the fast page fault mechanism described above is used to
+atomically restore the PTE to a Present state. The W bit is not saved when the
+PTE is marked for access tracking and during restoration to the Present state,
+the W bit is set depending on whether or not it was a write access. If it
+wasn't, then the W bit will remain clear until a write access happens, at which
+time it will be set using the Dirty tracking mechanism described above.
 
 3. Reference
 ------------
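
The mark/restore cycle above is easiest to follow as plain bit manipulation.
The sketch below is standalone and simplified: the shift of 54 matches this
series' SHADOW_ACC_TRACK_SAVED_BITS_SHIFT, but the macro and function names are
illustrative; in the kernel the work is done by mark_spte_for_access_track()
and its restore counterpart on the fast page fault path.

#include <stdint.h>

/* EPT R/W/X permission bits (bits 2:0); W is deliberately not saved. */
#define EPT_R                  (1ull << 0)
#define EPT_W                  (1ull << 1)
#define EPT_X                  (1ull << 2)
#define ACC_TRACK_SAVED_SHIFT  54
#define ACC_TRACK_SAVED_MASK   ((EPT_R | EPT_X) << ACC_TRACK_SAVED_SHIFT)

/* Mark for access tracking: stash R/X in ignored high bits, clear RWX. */
uint64_t mark_for_access_track(uint64_t spte)
{
        spte |= (spte & (EPT_R | EPT_X)) << ACC_TRACK_SAVED_SHIFT;
        return spte & ~(EPT_R | EPT_W | EPT_X);
}

/* Restore on the fast page fault path: bring R/X back, drop the saved copy. */
uint64_t restore_access_track(uint64_t spte)
{
        spte |= (spte & ACC_TRACK_SAVED_MASK) >> ACC_TRACK_SAVED_SHIFT;
        return spte & ~ACC_TRACK_SAVED_MASK;
}
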
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index cc96e26..12c4a13 100644
 #define X86_FEATURE_AVIC               (15*32+13) /* Virtual Interrupt Controller */
 #define X86_FEATURE_V_VMSAVE_VMLOAD    (15*32+15) /* Virtual VMSAVE VMLOAD */
 #define X86_FEATURE_VGIF               (15*32+16) /* Virtual GIF */
+#define X86_FEATURE_V_SPEC_CTRL                (15*32+20) /* Virtual SPEC_CTRL */
 #define X86_FEATURE_SVME_ADDR_CHK      (15*32+28) /* "" SVME addr check */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9bc091e..a52f973 100644
@@ -1125,6 +1125,7 @@ struct kvm_vcpu_stat {
        u64 req_event;
        u64 halt_poll_success_ns;
        u64 halt_poll_fail_ns;
+       u64 nested_run;
 };
 
 struct x86_instruction_info;
@@ -1251,8 +1252,8 @@ struct kvm_x86_ops {
        int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
        u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 
-       void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd,
-                            int pgd_level);
+       void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+                            int root_level);
 
        bool (*has_wbinvd_exit)(void);
 
@@ -1339,6 +1340,7 @@ struct kvm_x86_ops {
 struct kvm_x86_nested_ops {
        int (*check_events)(struct kvm_vcpu *vcpu);
        bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+       void (*triple_fault)(struct kvm_vcpu *vcpu);
        int (*get_state)(struct kvm_vcpu *vcpu,
                         struct kvm_nested_state __user *user_kvm_nested_state,
                         unsigned user_data_size);
@@ -1410,9 +1412,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-               u64 acc_track_mask, u64 me_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -1520,6 +1519,11 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
 int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
+int kvm_emulate_invd(struct kvm_vcpu *vcpu);
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
 
 int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
 int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -1548,14 +1552,14 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-bool kvm_rdpmc(struct kvm_vcpu *vcpu);
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
@@ -1596,9 +1600,6 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                        ulong roots_to_free);
 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1c56194..772e60e 100644
@@ -269,7 +269,9 @@ struct vmcb_save_area {
         * SEV-ES guests when referenced through the GHCB or for
         * saving to the host save area.
         */
-       u8 reserved_7[80];
+       u8 reserved_7[72];
+       u32 spec_ctrl;          /* Guest version of SPEC_CTRL at 0x2E0 */
+       u8 reserved_7b[4];
        u32 pkru;
        u8 reserved_7a[20];
        u64 reserved_8;         /* rax already available at 0x01f8 */
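
Because the comment above pins the guest SPEC_CTRL field at offset 0x2E0 of the
VMCB save area, carving it out of reserved_7 must not move pkru or any later
field.  A hedged, standalone check of that size bookkeeping (not the kernel's
own assertion, which would instead verify
offsetof(struct vmcb_save_area, spec_ctrl) == 0x2e0):

#include <assert.h>
#include <stdint.h>

/* 72 reserved bytes + 4 bytes of spec_ctrl + 4 trailing reserved bytes must
 * add back up to the original 80-byte reserved_7 block. */
static_assert(72 + sizeof(uint32_t) + 4 == 80,
              "spec_ctrl carve-out must preserve the VMCB save area layout");
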
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index cc369b9..0050f39 100644
@@ -2869,7 +2869,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
                return;
 
        if (is_guest_mode(vcpu)) {
-               r = kvm_x86_ops.nested_ops->check_events(vcpu);
+               r = kvm_check_nested_events(vcpu);
                if (r < 0)
                        return;
                /*
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c68bfc3..88d0ed5 100644
@@ -59,7 +59,8 @@ static __always_inline u64 rsvd_bits(int s, int e)
        return ((2ULL << (e - s)) - 1) << s;
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
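
As a quick sanity check of the rsvd_bits() formula shown above, which builds a
mask covering bits s..e inclusive, here is a standalone snippet (names are
illustrative; the kernel expresses the same results with GENMASK_ULL()):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same formula as rsvd_bits(): set bits s..e, inclusive. */
static uint64_t rsvd_bits(int s, int e)
{
        return ((2ull << (e - s)) - 1) << s;
}

int main(void)
{
        /* Bits 62:52, the range reserved in PAE SPTEs: eleven bits at 52. */
        assert(rsvd_bits(52, 62) == (((1ull << 11) - 1) << 52));
        /* A single-bit range degenerates to one set bit. */
        assert(rsvd_bits(51, 51) == (1ull << 51));

        printf("rsvd_bits(52, 62) = %#llx\n",
               (unsigned long long)rsvd_bits(52, 62));
        return 0;
}
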
@@ -73,6 +74,10 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
                                u64 fault_address, char *insn, int insn_len);
 
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
        if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
@@ -102,8 +107,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
        if (!VALID_PAGE(root_hpa))
                return;
 
-       static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
-                                vcpu->arch.mmu->shadow_root_level);
+       static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
+                                         vcpu->arch.mmu->shadow_root_level);
 }
 
 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
@@ -124,7 +129,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
  * write-protects guest page to sync the guest modification, b) another one is
  * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
  * between these two sorts are:
- * 1) the first case clears SPTE_MMU_WRITEABLE bit.
+ * 1) the first case clears MMU-writable bit.
  * 2) the first case requires flushing tlb immediately avoiding corrupting
  *    shadow page table between all vcpus so it should be in the protection of
  *    mmu-lock. And the another case does not need to flush tlb until returning
@@ -135,17 +140,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
  * So, there is the problem: the first case can meet the corrupted tlb caused
  * by another case which write-protects pages but without flush tlb
  * immediately. In order to making the first case be aware this problem we let
- * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
- * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
+ * it flush tlb if we try to write-protect a spte whose MMU-writable bit
+ * is set, it works since another case never touches MMU-writable bit.
  *
  * Anyway, whenever a spte is updated (only permission and status bits are
- * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
+ * changed) we need to check whether the spte with MMU-writable becomes
  * readonly, if that happens, we need to flush tlb. Fortunately,
  * mmu_spte_update() has already handled it perfectly.
  *
- * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
+ * The rules to use MMU-writable and PT_WRITABLE_MASK:
  * - if we want to see if it has writable tlb entry or if the spte can be
- *   writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
+ *   writable on the mmu mapping, check MMU-writable, this is the most
  *   case, otherwise
  * - if we fix page fault on the spte or do write-protection by dirty logging,
  *   check PT_WRITABLE_MASK.
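
Condensed into code, the flushing rule in the comment above is a single
predicate.  The sketch below uses illustrative bit names; the real decision is
made inside mmu_spte_update():

#include <stdbool.h>
#include <stdint.h>

#define PT_WRITABLE   (1ull << 1)   /* hardware W bit */
#define MMU_WRITABLE  (1ull << 10)  /* illustrative "MMU-writable" bit */

/*
 * If the old SPTE could have been made writable locklessly (MMU-writable
 * set), a writable TLB entry may exist, so dropping the W bit must be
 * accompanied by a TLB flush.
 */
bool update_needs_tlb_flush(uint64_t old_spte, uint64_t new_spte)
{
        return (old_spte & MMU_WRITABLE) && !(new_spte & PT_WRITABLE);
}
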
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d75524b..7a99e59 100644
@@ -48,6 +48,7 @@
 #include <asm/memtype.h>
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
+#include <asm/set_memory.h>
 #include <asm/vmx.h>
 #include <asm/kvm_page_track.h>
 #include "trace.h"
@@ -215,10 +216,10 @@ bool is_nx_huge_page_enabled(void)
 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
                           unsigned int access)
 {
-       u64 mask = make_mmio_spte(vcpu, gfn, access);
+       u64 spte = make_mmio_spte(vcpu, gfn, access);
 
-       trace_mark_mmio_spte(sptep, gfn, mask);
-       mmu_spte_set(sptep, mask);
+       trace_mark_mmio_spte(sptep, gfn, spte);
+       mmu_spte_set(sptep, spte);
 }
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -236,17 +237,6 @@ static unsigned get_mmio_spte_access(u64 spte)
        return spte & shadow_mmio_access_mask;
 }
 
-static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
-                         kvm_pfn_t pfn, unsigned int access)
-{
-       if (unlikely(is_noslot_pfn(pfn))) {
-               mark_mmio_spte(vcpu, sptep, gfn, access);
-               return true;
-       }
-
-       return false;
-}
-
 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 {
        u64 kvm_gen, spte_gen, gen;
@@ -1118,7 +1108,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
        rmap_printk("spte %p %llx\n", sptep, *sptep);
 
        if (pt_protect)
-               spte &= ~SPTE_MMU_WRITEABLE;
+               spte &= ~shadow_mmu_writable_mask;
        spte = spte & ~PT_WRITABLE_MASK;
 
        return mmu_spte_update(sptep, spte);
@@ -1424,17 +1414,15 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
             slot_rmap_walk_okay(_iter_);                               \
             slot_rmap_walk_next(_iter_))
 
-static __always_inline int
-kvm_handle_hva_range(struct kvm *kvm,
-                    unsigned long start,
-                    unsigned long end,
-                    unsigned long data,
-                    int (*handler)(struct kvm *kvm,
-                                   struct kvm_rmap_head *rmap_head,
-                                   struct kvm_memory_slot *slot,
-                                   gfn_t gfn,
-                                   int level,
-                                   unsigned long data))
+typedef int (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                             struct kvm_memory_slot *slot, gfn_t gfn,
+                             int level, unsigned long data);
+
+static __always_inline int kvm_handle_hva_range(struct kvm *kvm,
+                                               unsigned long start,
+                                               unsigned long end,
+                                               unsigned long data,
+                                               rmap_handler_t handler)
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
@@ -1473,12 +1461,7 @@ kvm_handle_hva_range(struct kvm *kvm,
 }
 
 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-                         unsigned long data,
-                         int (*handler)(struct kvm *kvm,
-                                        struct kvm_rmap_head *rmap_head,
-                                        struct kvm_memory_slot *slot,
-                                        gfn_t gfn, int level,
-                                        unsigned long data))
+                         unsigned long data, rmap_handler_t handler)
 {
        return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
 }
@@ -2421,6 +2404,15 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
 
        kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
 
+       /*
+        * Note, this check is intentionally soft, it only guarantees that one
+        * page is available, while the caller may end up allocating as many as
+        * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
+        * exceeding the (arbitrary by default) limit will not harm the host,
+        * being too aggressive may unnecessarily kill the guest, and getting an
+        * exact count is far more trouble than it's worth, especially in the
+        * page fault paths.
+        */
        if (!kvm_mmu_available_pages(vcpu->kvm))
                return -ENOSPC;
        return 0;
@@ -2561,9 +2553,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        struct kvm_mmu_page *sp;
        int ret;
 
-       if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
-               return 0;
-
        sp = sptep_to_sp(sptep);
 
        ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
@@ -2593,6 +2582,11 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
                 *sptep, write_fault, gfn);
 
+       if (unlikely(is_noslot_pfn(pfn))) {
+               mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+               return RET_PF_EMULATE;
+       }
+
        if (is_shadow_present_pte(*sptep)) {
                /*
                 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
@@ -2626,9 +2620,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
                                KVM_PAGES_PER_HPAGE(level));
 
-       if (unlikely(is_mmio_spte(*sptep)))
-               ret = RET_PF_EMULATE;
-
        /*
         * The fault is fully spurious if and only if the new SPTE and old SPTE
         * are identical, and emulation is not required.
@@ -2946,9 +2937,19 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                return true;
        }
 
-       if (unlikely(is_noslot_pfn(pfn)))
+       if (unlikely(is_noslot_pfn(pfn))) {
                vcpu_cache_mmio_info(vcpu, gva, gfn,
                                     access & shadow_mmio_access_mask);
+               /*
+                * If MMIO caching is disabled, emulate immediately without
+                * touching the shadow page tables as attempting to install an
+                * MMIO SPTE will just be an expensive nop.
+                */
+               if (unlikely(!shadow_mmio_value)) {
+                       *ret_val = RET_PF_EMULATE;
+                       return true;
+               }
+       }
 
        return false;
 }
@@ -3061,6 +3062,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                        if (!is_shadow_present_pte(spte))
                                break;
 
+               if (!is_shadow_present_pte(spte))
+                       break;
+
                sp = sptep_to_sp(iterator.sptep);
                if (!is_last_spte(spte, sp->role.level))
                        break;
@@ -3193,14 +3197,17 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
                    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
                        mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
-               } else {
-                       for (i = 0; i < 4; ++i)
-                               if (mmu->pae_root[i] != 0)
-                                       mmu_free_root_page(kvm,
-                                                          &mmu->pae_root[i],
-                                                          &invalid_list);
-                       mmu->root_hpa = INVALID_PAGE;
+               } else if (mmu->pae_root) {
+                       for (i = 0; i < 4; ++i) {
+                               if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
+                                       continue;
+
+                               mmu_free_root_page(kvm, &mmu->pae_root[i],
+                                                  &invalid_list);
+                               mmu->pae_root[i] = INVALID_PAE_ROOT;
+                       }
                }
+               mmu->root_hpa = INVALID_PAGE;
                mmu->root_pgd = 0;
        }
 
@@ -3226,155 +3233,182 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
 {
        struct kvm_mmu_page *sp;
 
-       write_lock(&vcpu->kvm->mmu_lock);
-
-       if (make_mmu_pages_available(vcpu)) {
-               write_unlock(&vcpu->kvm->mmu_lock);
-               return INVALID_PAGE;
-       }
        sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
        ++sp->root_count;
 
-       write_unlock(&vcpu->kvm->mmu_lock);
        return __pa(sp->spt);
 }
 
 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 {
-       u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u8 shadow_root_level = mmu->shadow_root_level;
        hpa_t root;
        unsigned i;
 
        if (is_tdp_mmu_enabled(vcpu->kvm)) {
                root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
-
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->root_hpa = root;
+               mmu->root_hpa = root;
        } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
-               root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
-                                     true);
-
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->root_hpa = root;
+               root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+               mmu->root_hpa = root;
        } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
+               if (WARN_ON_ONCE(!mmu->pae_root))
+                       return -EIO;
+
                for (i = 0; i < 4; ++i) {
-                       MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
+                       WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
 
                        root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
                                              i << 30, PT32_ROOT_LEVEL, true);
-                       if (!VALID_PAGE(root))
-                               return -ENOSPC;
-                       vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
+                       mmu->pae_root[i] = root | PT_PRESENT_MASK |
+                                          shadow_me_mask;
                }
-               vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
-       } else
-               BUG();
+               mmu->root_hpa = __pa(mmu->pae_root);
+       } else {
+               WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
+               return -EIO;
+       }
 
        /* root_pgd is ignored for direct MMUs. */
-       vcpu->arch.mmu->root_pgd = 0;
+       mmu->root_pgd = 0;
 
        return 0;
 }
 
 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
-       u64 pdptr, pm_mask;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u64 pdptrs[4], pm_mask;
        gfn_t root_gfn, root_pgd;
        hpa_t root;
        int i;
 
-       root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
+       root_pgd = mmu->get_guest_pgd(vcpu);
        root_gfn = root_pgd >> PAGE_SHIFT;
 
        if (mmu_check_root(vcpu, root_gfn))
                return 1;
 
+       if (mmu->root_level == PT32E_ROOT_LEVEL) {
+               for (i = 0; i < 4; ++i) {
+                       pdptrs[i] = mmu->get_pdptr(vcpu, i);
+                       if (!(pdptrs[i] & PT_PRESENT_MASK))
+                               continue;
+
+                       if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
+                               return 1;
+               }
+       }
+
        /*
         * Do we shadow a long mode page table? If so we need to
         * write-protect the guests page table root.
         */
-       if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
-               MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
-
+       if (mmu->root_level >= PT64_ROOT_4LEVEL) {
                root = mmu_alloc_root(vcpu, root_gfn, 0,
-                                     vcpu->arch.mmu->shadow_root_level, false);
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->root_hpa = root;
+                                     mmu->shadow_root_level, false);
+               mmu->root_hpa = root;
                goto set_root_pgd;
        }
 
+       if (WARN_ON_ONCE(!mmu->pae_root))
+               return -EIO;
+
        /*
         * We shadow a 32 bit page table. This may be a legacy 2-level
         * or a PAE 3-level page table. In either case we need to be aware that
         * the shadow page table may be a PAE or a long mode page table.
         */
-       pm_mask = PT_PRESENT_MASK;
-       if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+       pm_mask = PT_PRESENT_MASK | shadow_me_mask;
+       if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
                pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
+               if (WARN_ON_ONCE(!mmu->lm_root))
+                       return -EIO;
+
+               mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask;
+       }
+
        for (i = 0; i < 4; ++i) {
-               MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
-               if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
-                       pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
-                       if (!(pdptr & PT_PRESENT_MASK)) {
-                               vcpu->arch.mmu->pae_root[i] = 0;
+               WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+               if (mmu->root_level == PT32E_ROOT_LEVEL) {
+                       if (!(pdptrs[i] & PT_PRESENT_MASK)) {
+                               mmu->pae_root[i] = INVALID_PAE_ROOT;
                                continue;
                        }
-                       root_gfn = pdptr >> PAGE_SHIFT;
-                       if (mmu_check_root(vcpu, root_gfn))
-                               return 1;
+                       root_gfn = pdptrs[i] >> PAGE_SHIFT;
                }
 
                root = mmu_alloc_root(vcpu, root_gfn, i << 30,
                                      PT32_ROOT_LEVEL, false);
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->pae_root[i] = root | pm_mask;
+               mmu->pae_root[i] = root | pm_mask;
        }
-       vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
+
+       if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+               mmu->root_hpa = __pa(mmu->lm_root);
+       else
+               mmu->root_hpa = __pa(mmu->pae_root);
+
+set_root_pgd:
+       mmu->root_pgd = root_pgd;
+
+       return 0;
+}
+
+static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u64 *lm_root, *pae_root;
 
        /*
-        * If we shadow a 32 bit page table with a long mode page
-        * table we enter this path.
+        * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
+        * tables are allocated and initialized at root creation as there is no
+        * equivalent level in the guest's NPT to shadow.  Allocate the tables
+        * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
         */
-       if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
-               if (vcpu->arch.mmu->lm_root == NULL) {
-                       /*
-                        * The additional page necessary for this is only
-                        * allocated on demand.
-                        */
+       if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
+           mmu->shadow_root_level < PT64_ROOT_4LEVEL)
+               return 0;
 
-                       u64 *lm_root;
+       /*
+        * This mess only works with 4-level paging and needs to be updated to
+        * work with 5-level paging.
+        */
+       if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
+               return -EIO;
 
-                       lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
-                       if (lm_root == NULL)
-                               return 1;
+       if (mmu->pae_root && mmu->lm_root)
+               return 0;
 
-                       lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
+       /*
+        * The special roots should always be allocated in concert.  Yell and
+        * bail if KVM ends up in a state where only one of the roots is valid.
+        */
+       if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root))
+               return -EIO;
 
-                       vcpu->arch.mmu->lm_root = lm_root;
-               }
+       /*
+        * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
+        * doesn't need to be decrypted.
+        */
+       pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+       if (!pae_root)
+               return -ENOMEM;
 
-               vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
+       lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+       if (!lm_root) {
+               free_page((unsigned long)pae_root);
+               return -ENOMEM;
        }
 
-set_root_pgd:
-       vcpu->arch.mmu->root_pgd = root_pgd;
+       mmu->pae_root = pae_root;
+       mmu->lm_root = lm_root;
 
        return 0;
 }
 
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->arch.mmu->direct_map)
-               return mmu_alloc_direct_roots(vcpu);
-       else
-               return mmu_alloc_shadow_roots(vcpu);
-}
-
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
        int i;
@@ -3422,7 +3456,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->arch.mmu->pae_root[i];
 
-               if (root && VALID_PAGE(root)) {
+               if (IS_VALID_PAE_ROOT(root)) {
                        root &= PT64_BASE_ADDR_MASK;
                        sp = to_shadow_page(root);
                        mmu_sync_children(vcpu, sp);
@@ -3554,11 +3588,12 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
                            __is_rsvd_bits_set(rsvd_check, sptes[level], level);
 
        if (reserved) {
-               pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
+               pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
                       __func__, addr);
                for (level = root; level >= leaf; level--)
-                       pr_err("------ spte 0x%llx level %d.\n",
-                              sptes[level], level);
+                       pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx\n",
+                              sptes[level], level,
+                              rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]);
        }
 
        return reserved;
@@ -3653,6 +3688,14 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        bool async;
 
+       /*
+        * Retry the page fault if the gfn hit a memslot that is being deleted
+        * or moved.  This ensures any existing SPTEs for the old memslot will
+        * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+        */
+       if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
+               return true;
+
        /* Don't expose private memslots to L2. */
        if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
                *pfn = KVM_PFN_NOSLOT;
@@ -4615,12 +4658,17 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
        struct kvm_mmu *context = &vcpu->arch.guest_mmu;
        union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
 
-       context->shadow_root_level = new_role.base.level;
-
        __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
 
-       if (new_role.as_u64 != context->mmu_role.as_u64)
+       if (new_role.as_u64 != context->mmu_role.as_u64) {
                shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+
+               /*
+                * Override the level set by the common init helper, nested TDP
+                * always uses the host's TDP configuration.
+                */
+               context->shadow_root_level = new_role.base.level;
+       }
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
 
@@ -4802,16 +4850,27 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
        r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
        if (r)
                goto out;
-       r = mmu_alloc_roots(vcpu);
-       kvm_mmu_sync_roots(vcpu);
+       r = mmu_alloc_special_roots(vcpu);
        if (r)
                goto out;
+       write_lock(&vcpu->kvm->mmu_lock);
+       if (make_mmu_pages_available(vcpu))
+               r = -ENOSPC;
+       else if (vcpu->arch.mmu->direct_map)
+               r = mmu_alloc_direct_roots(vcpu);
+       else
+               r = mmu_alloc_shadow_roots(vcpu);
+       write_unlock(&vcpu->kvm->mmu_lock);
+       if (r)
+               goto out;
+
+       kvm_mmu_sync_roots(vcpu);
+
        kvm_mmu_load_pgd(vcpu);
        static_call(kvm_x86_tlb_flush_current)(vcpu);
 out:
        return r;
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
@@ -4820,7 +4879,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
        kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
        WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
 static bool need_remote_flush(u64 old, u64 new)
 {
@@ -5220,6 +5278,8 @@ slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
 static void free_mmu_pages(struct kvm_mmu *mmu)
 {
+       if (!tdp_enabled && mmu->pae_root)
+               set_memory_encrypted((unsigned long)mmu->pae_root, 1);
        free_page((unsigned long)mmu->pae_root);
        free_page((unsigned long)mmu->lm_root);
 }
@@ -5240,9 +5300,11 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
         * while the PDP table is a per-vCPU construct that's allocated at MMU
         * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
         * x86_64.  Therefore we need to allocate the PDP table in the first
-        * 4GB of memory, which happens to fit the DMA32 zone.  Except for
-        * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
-        * skip allocating the PDP table.
+        * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
+        * generally doesn't use PAE paging and can skip allocating the PDP
+        * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
+        * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
+        * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
         */
        if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
                return 0;
@@ -5252,8 +5314,22 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
                return -ENOMEM;
 
        mmu->pae_root = page_address(page);
+
+       /*
+        * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+        * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
+        * that KVM's writes and the CPU's reads get along.  Note, this is
+        * only necessary when using shadow paging, as 64-bit NPT can get at
+        * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+        * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+        */
+       if (!tdp_enabled)
+               set_memory_decrypted((unsigned long)mmu->pae_root, 1);
+       else
+               WARN_ON_ONCE(shadow_me_mask);
+
        for (i = 0; i < 4; ++i)
-               mmu->pae_root[i] = INVALID_PAGE;
+               mmu->pae_root[i] = INVALID_PAE_ROOT;
 
        return 0;
 }
@@ -5476,9 +5552,9 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
         * spte from present to present (changing the spte from present
         * to nonpresent will flush all the TLBs immediately), in other
         * words, the only case we care is mmu_spte_update() where we
-        * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
-        * instead of PT_WRITABLE_MASK, that means it does not depend
-        * on PT_WRITABLE_MASK anymore.
+        * have checked Host-writable | MMU-writable instead of
+        * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
+        * anymore.
         */
        if (flush)
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5701,25 +5777,6 @@ static void mmu_destroy_caches(void)
        kmem_cache_destroy(mmu_page_header_cache);
 }
 
-static void kvm_set_mmio_spte_mask(void)
-{
-       u64 mask;
-
-       /*
-        * Set a reserved PA bit in MMIO SPTEs to generate page faults with
-        * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
-        * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
-        * 52-bit physical addresses then there are no reserved PA bits in the
-        * PTEs and so the reserved PA approach must be disabled.
-        */
-       if (shadow_phys_bits < 52)
-               mask = BIT_ULL(51) | PT_PRESENT_MASK;
-       else
-               mask = 0;
-
-       kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
-}
-
 static bool get_nx_auto_mode(void)
 {
        /* Return true when CPU has the bug, and mitigations are ON */
@@ -5785,8 +5842,6 @@ int kvm_mmu_module_init(void)
 
        kvm_mmu_reset_all_pte_masks();
 
-       kvm_set_mmio_spte_mask();
-
        pte_list_desc_cache = kmem_cache_create("pte_list_desc",
                                            sizeof(struct pte_list_desc),
                                            0, SLAB_ACCOUNT, NULL);
@@ -5884,6 +5939,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
        struct kvm_mmu_page *sp;
        unsigned int ratio;
        LIST_HEAD(invalid_list);
+       bool flush = false;
        ulong to_zap;
 
        rcu_idx = srcu_read_lock(&kvm->srcu);
@@ -5905,19 +5961,19 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
                                      lpage_disallowed_link);
                WARN_ON_ONCE(!sp->lpage_disallowed);
                if (is_tdp_mmu_page(sp)) {
-                       kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
-                               sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level));
+                       flush = kvm_tdp_mmu_zap_sp(kvm, sp);
                } else {
                        kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
                        WARN_ON_ONCE(sp->lpage_disallowed);
                }
 
                if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
-                       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+                       kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
                        cond_resched_rwlock_write(&kvm->mmu_lock);
+                       flush = false;
                }
        }
-       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+       kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
 
        write_unlock(&kvm->mmu_lock);
        srcu_read_unlock(&kvm->srcu, rcu_idx);
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
index ced15fd..cedc17b 100644
@@ -70,7 +70,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->arch.mmu->pae_root[i];
 
-               if (root && VALID_PAGE(root)) {
+               if (IS_VALID_PAE_ROOT(root)) {
                        root &= PT64_BASE_ADDR_MASK;
                        sp = to_shadow_page(root);
                        __mmu_spte_walk(vcpu, sp, fn, 2);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 1f6f98c..e03267e 100644
@@ -20,6 +20,16 @@ extern bool dbg;
 #define MMU_WARN_ON(x) do { } while (0)
 #endif
 
+/*
+ * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
+ * bit, and thus are guaranteed to be non-zero when valid.  And, when a guest
+ * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
+ * as the CPU would treat that as PRESENT PDPTR with reserved bits set.  Use
+ * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
+ */
+#define INVALID_PAE_ROOT       0
+#define IS_VALID_PAE_ROOT(x)   (!!(x))
+
 struct kvm_mmu_page {
        struct list_head link;
        struct hlist_node hash_link;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 55d7b47..70b7e44 100644
@@ -503,6 +503,7 @@ error:
 #endif
        walker->fault.address = addr;
        walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+       walker->fault.async_page_fault = false;
 
        trace_kvm_mmu_walker_error(walker->fault.error_code);
        return 0;
@@ -1084,7 +1085,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
                nr_present++;
 
-               host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+               host_writable = sp->spt[i] & shadow_host_writable_mask;
 
                set_spte_ret |= set_spte(vcpu, &sp->spt[i],
                                         pte_access, PG_LEVEL_4K,
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index ef55f0b..66d43ce 100644
 #include "spte.h"
 
 #include <asm/e820/api.h>
+#include <asm/vmx.h>
 
+static bool __read_mostly enable_mmio_caching = true;
+module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
+
+u64 __read_mostly shadow_host_writable_mask;
+u64 __read_mostly shadow_mmu_writable_mask;
 u64 __read_mostly shadow_nx_mask;
 u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 u64 __read_mostly shadow_user_mask;
 u64 __read_mostly shadow_accessed_mask;
 u64 __read_mostly shadow_dirty_mask;
 u64 __read_mostly shadow_mmio_value;
+u64 __read_mostly shadow_mmio_mask;
 u64 __read_mostly shadow_mmio_access_mask;
 u64 __read_mostly shadow_present_mask;
 u64 __read_mostly shadow_me_mask;
@@ -38,7 +45,6 @@ static u64 generation_mmio_spte_mask(u64 gen)
        u64 mask;
 
        WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
-       BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
 
        mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
        mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -48,16 +54,18 @@ static u64 generation_mmio_spte_mask(u64 gen)
 u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
 {
        u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
-       u64 mask = generation_mmio_spte_mask(gen);
+       u64 spte = generation_mmio_spte_mask(gen);
        u64 gpa = gfn << PAGE_SHIFT;
 
+       WARN_ON_ONCE(!shadow_mmio_value);
+
        access &= shadow_mmio_access_mask;
-       mask |= shadow_mmio_value | access;
-       mask |= gpa | shadow_nonpresent_or_rsvd_mask;
-       mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+       spte |= shadow_mmio_value | access;
+       spte |= gpa | shadow_nonpresent_or_rsvd_mask;
+       spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
                << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
 
-       return mask;
+       return spte;
 }
 
 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@ -86,13 +94,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                     bool can_unsync, bool host_writable, bool ad_disabled,
                     u64 *new_spte)
 {
-       u64 spte = 0;
+       u64 spte = SPTE_MMU_PRESENT_MASK;
        int ret = 0;
 
        if (ad_disabled)
-               spte |= SPTE_AD_DISABLED_MASK;
+               spte |= SPTE_TDP_AD_DISABLED_MASK;
        else if (kvm_vcpu_ad_need_write_protect(vcpu))
-               spte |= SPTE_AD_WRPROT_ONLY_MASK;
+               spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
+
+       /*
+        * Bits 62:52 of PAE SPTEs are reserved.  WARN if said bits are set
+        * if PAE paging may be employed (shadow paging or any 32-bit KVM).
+        */
+       WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) &&
+                    (spte & SPTE_TDP_AD_MASK));
 
        /*
         * For the EPT case, shadow_present_mask is 0 if hardware
@@ -124,7 +139,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                        kvm_is_mmio_pfn(pfn));
 
        if (host_writable)
-               spte |= SPTE_HOST_WRITEABLE;
+               spte |= shadow_host_writable_mask;
        else
                pte_access &= ~ACC_WRITE_MASK;
 
@@ -134,7 +149,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
        spte |= (u64)pfn << PAGE_SHIFT;
 
        if (pte_access & ACC_WRITE_MASK) {
-               spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+               spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;
 
                /*
                 * Optimization: for pte sync, if spte was writable the hash
@@ -150,7 +165,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                                 __func__, gfn);
                        ret |= SET_SPTE_WRITE_PROTECTED_PT;
                        pte_access &= ~ACC_WRITE_MASK;
-                       spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+                       spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
                }
        }
 
@@ -161,19 +176,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                spte = mark_spte_for_access_track(spte);
 
 out:
+       WARN_ON(is_mmio_spte(spte));
        *new_spte = spte;
        return ret;
 }
 
 u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
 {
-       u64 spte;
+       u64 spte = SPTE_MMU_PRESENT_MASK;
 
-       spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
-              shadow_user_mask | shadow_x_mask | shadow_me_mask;
+       spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+               shadow_user_mask | shadow_x_mask | shadow_me_mask;
 
        if (ad_disabled)
-               spte |= SPTE_AD_DISABLED_MASK;
+               spte |= SPTE_TDP_AD_DISABLED_MASK;
        else
                spte |= shadow_accessed_mask;
 
@@ -188,7 +204,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
        new_spte |= (u64)new_pfn << PAGE_SHIFT;
 
        new_spte &= ~PT_WRITABLE_MASK;
-       new_spte &= ~SPTE_HOST_WRITEABLE;
+       new_spte &= ~shadow_host_writable_mask;
 
        new_spte = mark_spte_for_access_track(new_spte);
 
@@ -242,53 +258,68 @@ u64 mark_spte_for_access_track(u64 spte)
        return spte;
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
 {
        BUG_ON((u64)(unsigned)access_mask != access_mask);
-       WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
        WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
-       shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
+
+       if (!enable_mmio_caching)
+               mmio_value = 0;
+
+       /*
+        * Disable MMIO caching if the MMIO value collides with the bits that
+        * are used to hold the relocated GFN when the L1TF mitigation is
+        * enabled.  This should never fire as there is no known hardware that
+        * can trigger this condition, e.g. SME/SEV CPUs that require a custom
+        * MMIO value are not susceptible to L1TF.
+        */
+       if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask <<
+                                 SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)))
+               mmio_value = 0;
+
+       /*
+        * The masked MMIO value must obviously match itself and a removed SPTE
+        * must not get a false positive.  Removed SPTEs and MMIO SPTEs should
+        * never collide as MMIO must set some RWX bits, and removed SPTEs must
+        * not set any RWX bits.
+        */
+       if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
+           WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+               mmio_value = 0;
+
+       shadow_mmio_value = mmio_value;
+       shadow_mmio_mask  = mmio_mask;
        shadow_mmio_access_mask = access_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
-/*
- * Sets the shadow PTE masks used by the MMU.
- *
- * Assumptions:
- *  - Setting either @accessed_mask or @dirty_mask requires setting both
- *  - At least one of @accessed_mask or @acc_track_mask must be set
- */
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-               u64 acc_track_mask, u64 me_mask)
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
 {
-       BUG_ON(!dirty_mask != !accessed_mask);
-       BUG_ON(!accessed_mask && !acc_track_mask);
-       BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
-
-       shadow_user_mask = user_mask;
-       shadow_accessed_mask = accessed_mask;
-       shadow_dirty_mask = dirty_mask;
-       shadow_nx_mask = nx_mask;
-       shadow_x_mask = x_mask;
-       shadow_present_mask = p_mask;
-       shadow_acc_track_mask = acc_track_mask;
-       shadow_me_mask = me_mask;
+       shadow_user_mask        = VMX_EPT_READABLE_MASK;
+       shadow_accessed_mask    = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
+       shadow_dirty_mask       = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
+       shadow_nx_mask          = 0ull;
+       shadow_x_mask           = VMX_EPT_EXECUTABLE_MASK;
+       shadow_present_mask     = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+       shadow_acc_track_mask   = VMX_EPT_RWX_MASK;
+       shadow_me_mask          = 0ull;
+
+       shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
+       shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE;
+
+       /*
+        * EPT Misconfigurations are generated if the value of bits 2:0
+        * of an EPT paging-structure entry is 110b (write/execute).
+        */
+       kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
+                                  VMX_EPT_RWX_MASK, 0);
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
 
 void kvm_mmu_reset_all_pte_masks(void)
 {
        u8 low_phys_bits;
-
-       shadow_user_mask = 0;
-       shadow_accessed_mask = 0;
-       shadow_dirty_mask = 0;
-       shadow_nx_mask = 0;
-       shadow_x_mask = 0;
-       shadow_present_mask = 0;
-       shadow_acc_track_mask = 0;
+       u64 mask;
 
        shadow_phys_bits = kvm_get_shadow_phys_bits();
 
@@ -315,4 +346,30 @@ void kvm_mmu_reset_all_pte_masks(void)
 
        shadow_nonpresent_or_rsvd_lower_gfn_mask =
                GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
+
+       shadow_user_mask        = PT_USER_MASK;
+       shadow_accessed_mask    = PT_ACCESSED_MASK;
+       shadow_dirty_mask       = PT_DIRTY_MASK;
+       shadow_nx_mask          = PT64_NX_MASK;
+       shadow_x_mask           = 0;
+       shadow_present_mask     = PT_PRESENT_MASK;
+       shadow_acc_track_mask   = 0;
+       shadow_me_mask          = sme_me_mask;
+
+       shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
+       shadow_mmu_writable_mask  = DEFAULT_SPTE_MMU_WRITEABLE;
+
+       /*
+        * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+        * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
+        * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+        * 52-bit physical addresses then there are no reserved PA bits in the
+        * PTEs and so the reserved PA approach must be disabled.
+        */
+       if (shadow_phys_bits < 52)
+               mask = BIT_ULL(51) | PT_PRESENT_MASK;
+       else
+               mask = 0;
+
+       kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
 }
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 6de3950..bca0ba1 100644
@@ -5,18 +5,33 @@
 
 #include "mmu_internal.h"
 
-#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 54
+/*
+ * A MMU present SPTE is backed by actual memory and may or may not be present
+ * in hardware.  E.g. MMIO SPTEs are not considered present.  Use bit 11, as it
+ * is ignored by all flavors of SPTEs and checking a low bit often generates
+ * better code than for a high bit, e.g. 56+.  MMU present checks are pervasive
+ * enough that the improved code generation is noticeable in KVM's footprint.
+ */
+#define SPTE_MMU_PRESENT_MASK          BIT_ULL(11)
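
Assuming the bit defined above, the pervasive present check the comment refers
to compiles down to a test of a single low bit.  A standalone stand-in for
is_shadow_present_pte() (illustrative, not the kernel's definition):

#include <stdbool.h>
#include <stdint.h>

#define SPTE_MMU_PRESENT  (1ull << 11)

/* MMIO SPTEs never set this bit and are therefore not "MMU present". */
bool spte_is_mmu_present(uint64_t spte)
{
        return spte & SPTE_MMU_PRESENT;
}
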
 
 /*
- * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
- * Access Tracking SPTEs.
+ * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also
+ * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
+ * PML, is enabled).  Use bits 52 and 53 to hold the type of A/D tracking that
+ * must be employed for a given TDP SPTE.
+ *
+ * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
+ * paging, including NPT PAE.  This scheme works because legacy shadow paging
+ * is guaranteed to have A/D bits and write-protection is forced only for
+ * TDP with CPU dirty logging (PML).  If NPT ever gains PML-like support, it
+ * must be restricted to 64-bit KVM.
  */
-#define SPTE_SPECIAL_MASK (3ULL << 52)
-#define SPTE_AD_ENABLED_MASK (0ULL << 52)
-#define SPTE_AD_DISABLED_MASK (1ULL << 52)
-#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
-#define SPTE_MMIO_MASK (3ULL << 52)
+#define SPTE_TDP_AD_SHIFT              52
+#define SPTE_TDP_AD_MASK               (3ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_ENABLED_MASK       (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED_MASK      (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY_MASK   (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
 
 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
 #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITEABLE    BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITEABLE     BIT_ULL(10)
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page.  This mask obviously
+ * must not overlap the A/D type mask.
+ */
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
+                                         PT64_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_MASK    (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
+                                        SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/*
+ * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
+ * to not overlap the A/D type mask or the saved access bits of access-tracked
+ * SPTEs when A/D bits are disabled.
+ */
+#define EPT_SPTE_HOST_WRITABLE         BIT_ULL(57)
+#define EPT_SPTE_MMU_WRITABLE          BIT_ULL(58)
 
-#define SPTE_HOST_WRITEABLE    (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-#define SPTE_MMU_WRITEABLE     (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/* Defined only to keep the above static asserts readable. */
+#undef SHADOW_ACC_TRACK_SAVED_MASK
 
 /*
- * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
  * the memslots generation and is derived as follows:
  *
- * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
- * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
+ * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
+ * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
  *
  * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
  * the MMIO generation number, as doing so would require stealing a bit from
  */
 
 #define MMIO_SPTE_GEN_LOW_START                3
-#define MMIO_SPTE_GEN_LOW_END          11
+#define MMIO_SPTE_GEN_LOW_END          10
 
-#define MMIO_SPTE_GEN_HIGH_START       PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_START       52
 #define MMIO_SPTE_GEN_HIGH_END         62
 
 #define MMIO_SPTE_GEN_LOW_MASK         GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
                                                    MMIO_SPTE_GEN_LOW_START)
 #define MMIO_SPTE_GEN_HIGH_MASK                GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
                                                    MMIO_SPTE_GEN_HIGH_START)
+static_assert(!(SPTE_MMU_PRESENT_MASK &
+               (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
 
 #define MMIO_SPTE_GEN_LOW_BITS         (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
 #define MMIO_SPTE_GEN_HIGH_BITS                (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
 
 /* remember to adjust the comment above as well if you change these */
-static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
 
 #define MMIO_SPTE_GEN_LOW_SHIFT                (MMIO_SPTE_GEN_LOW_START - 0)
 #define MMIO_SPTE_GEN_HIGH_SHIFT       (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
 
 #define MMIO_SPTE_GEN_MASK             GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
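Using only the macros above, the 19-bit generation packs into an SPTE with two shift-and-mask pairs; get_mmio_spte_generation() further down in this header performs the inverse.  The pack-side helper name here is hypothetical:

	static inline u64 mmio_gen_to_spte_bits(u64 gen)
	{
		u64 spte = 0;

		/* Generation bits 0-7 land in SPTE bits 3-10 ... */
		spte |= (gen << MMIO_SPTE_GEN_LOW_SHIFT)  & MMIO_SPTE_GEN_LOW_MASK;
		/* ... and generation bits 8-18 land in SPTE bits 52-62. */
		spte |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;

		return spte;
	}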
 
+extern u64 __read_mostly shadow_host_writable_mask;
+extern u64 __read_mostly shadow_mmu_writable_mask;
 extern u64 __read_mostly shadow_nx_mask;
 extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 extern u64 __read_mostly shadow_user_mask;
 extern u64 __read_mostly shadow_accessed_mask;
 extern u64 __read_mostly shadow_dirty_mask;
 extern u64 __read_mostly shadow_mmio_value;
+extern u64 __read_mostly shadow_mmio_mask;
 extern u64 __read_mostly shadow_mmio_access_mask;
 extern u64 __read_mostly shadow_present_mask;
 extern u64 __read_mostly shadow_me_mask;
 
 /*
- * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
  * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
  * pages.
  */
@@ -120,29 +170,22 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
  */
 #define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
 
-/*
- * The mask/shift to use for saving the original R/X bits when marking the PTE
- * as not-present for access tracking purposes. We do not save the W bit as the
- * PTEs being access tracked also need to be dirty tracked, so the W bit will be
- * restored only when a write is attempted to the page.
- */
-#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
-                                         PT64_EPT_EXECUTABLE_MASK)
-#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
-
 /*
  * If a thread running without exclusive control of the MMU lock must perform a
  * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
  * non-present intermediate value. Other threads which encounter this value
  * should not modify the SPTE.
  *
- * This constant works because it is considered non-present on both AMD and
- * Intel CPUs and does not create a L1TF vulnerability because the pfn section
- * is zeroed out.
+ * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
+ * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create an
+ * L1TF vulnerability.  Use only low bits to avoid 64-bit immediates.
  *
  * Only used by the TDP MMU.
  */
-#define REMOVED_SPTE (1ull << 59)
+#define REMOVED_SPTE   0x5a0ULL
+
+/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
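As a worked check of the comment above: 0x5a0 is 0b0101_1010_0000, so only bits 5, 7, 8 and 10 are set.  The additional asserts below are illustrative (not part of the patch) and all hold:

	/* No R/W/X bits => not-present on both Intel (EPT) and AMD. */
	static_assert(!(REMOVED_SPTE & 0x7));
	/* No PFN bits (51:12) => no L1TF exposure. */
	static_assert(!(REMOVED_SPTE & GENMASK_ULL(51, 12)));
	/* Fits in the low 12 bits => no 64-bit immediate in generated code. */
	static_assert(REMOVED_SPTE < BIT_ULL(12));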
 
 static inline bool is_removed_spte(u64 spte)
 {
@@ -167,7 +210,13 @@ extern u8 __read_mostly shadow_phys_bits;
 
 static inline bool is_mmio_spte(u64 spte)
 {
-       return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
+       return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+              likely(shadow_mmio_value);
+}
+
+static inline bool is_shadow_present_pte(u64 pte)
+{
+       return !!(pte & SPTE_MMU_PRESENT_MASK);
 }
 
 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
@@ -177,25 +226,30 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 
 static inline bool spte_ad_enabled(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
-       return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
+       return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK;
 }
 
 static inline bool spte_ad_need_write_protect(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
-       return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
+       /*
+        * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0',
+        * and non-TDP SPTEs will never set these bits.  Optimize for 64-bit
+        * TDP and do the A/D type check unconditionally.
+        */
+       return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK;
 }
 
 static inline u64 spte_shadow_accessed_mask(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
        return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
 }
 
 static inline u64 spte_shadow_dirty_mask(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
        return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
 }
 
@@ -204,11 +258,6 @@ static inline bool is_access_track_spte(u64 spte)
        return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
 }
 
-static inline bool is_shadow_present_pte(u64 pte)
-{
-       return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);
-}
-
 static inline bool is_large_pte(u64 pte)
 {
        return pte & PT_PAGE_SIZE_MASK;
@@ -246,8 +295,8 @@ static inline bool is_dirty_spte(u64 spte)
 
 static inline bool spte_can_locklessly_be_made_writable(u64 spte)
 {
-       return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
-               (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
+       return (spte & shadow_host_writable_mask) &&
+              (spte & shadow_mmu_writable_mask);
 }
 
 static inline u64 get_mmio_spte_generation(u64 spte)
index 462b1f7..fd50008 100644 (file)
@@ -86,7 +86,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
 
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield);
+                         gfn_t start, gfn_t end, bool can_yield, bool flush);
 
 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
 {
@@ -99,7 +99,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
 
        list_del(&root->link);
 
-       zap_gfn_range(kvm, root, 0, max_gfn, false);
+       zap_gfn_range(kvm, root, 0, max_gfn, false, false);
 
        free_page((unsigned long)root->spt);
        kmem_cache_free(mmu_page_header_cache, root);
@@ -137,22 +137,21 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
        return sp;
 }
 
-static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 {
        union kvm_mmu_page_role role;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;
 
-       role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
+       lockdep_assert_held_write(&kvm->mmu_lock);
 
-       write_lock(&kvm->mmu_lock);
+       role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
 
        /* Check for an existing root before allocating a new one. */
        for_each_tdp_mmu_root(kvm, root) {
                if (root->role.word == role.word) {
                        kvm_mmu_get_root(kvm, root);
-                       write_unlock(&kvm->mmu_lock);
-                       return root;
+                       goto out;
                }
        }
 
@@ -161,19 +160,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
 
        list_add(&root->link, &kvm->arch.tdp_mmu_roots);
 
-       write_unlock(&kvm->mmu_lock);
-
-       return root;
-}
-
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu_page *root;
-
-       root = get_tdp_mmu_vcpu_root(vcpu);
-       if (!root)
-               return INVALID_PAGE;
-
+out:
        return __pa(root->spt);
 }
 
@@ -205,13 +192,12 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 
 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
 {
-       bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
        if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
                return;
 
        if (is_accessed_spte(old_spte) &&
-           (!is_accessed_spte(new_spte) || pfn_changed))
+           (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
+            spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
                kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 }
 
@@ -455,7 +441,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 
 
        if (was_leaf && is_dirty_spte(old_spte) &&
-           (!is_dirty_spte(new_spte) || pfn_changed))
+           (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 
        /*
@@ -498,7 +484,7 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
         * Do not change removed SPTEs. Only the thread that froze the SPTE
         * may modify it.
         */
-       if (iter->old_spte == REMOVED_SPTE)
+       if (is_removed_spte(iter->old_spte))
                return false;
 
        if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
@@ -569,7 +555,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
         * should be used. If operating under the MMU lock in write mode, the
         * use of the removed SPTE should not be necessary.
         */
-       WARN_ON(iter->old_spte == REMOVED_SPTE);
+       WARN_ON(is_removed_spte(iter->old_spte));
 
        WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
@@ -668,20 +654,21 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
  * scheduler needs the CPU or there is contention on the MMU lock. If this
  * function cannot yield, it will not release the MMU lock or reschedule and
  * the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup.
+ * operation can cause a soft lockup.  Note, in some use cases a flush may be
+ * required by prior actions.  Ensure the pending flush is performed prior to
+ * yielding.
  */
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield)
+                         gfn_t start, gfn_t end, bool can_yield, bool flush)
 {
        struct tdp_iter iter;
-       bool flush_needed = false;
 
        rcu_read_lock();
 
        tdp_root_for_each_pte(iter, root, start, end) {
                if (can_yield &&
-                   tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
-                       flush_needed = false;
+                   tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+                       flush = false;
                        continue;
                }
 
@@ -699,11 +686,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                        continue;
 
                tdp_mmu_set_spte(kvm, &iter, 0);
-               flush_needed = true;
+               flush = true;
        }
 
        rcu_read_unlock();
-       return flush_needed;
+       return flush;
 }
 
 /*
@@ -712,13 +699,14 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
  */
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
+                                bool can_yield)
 {
        struct kvm_mmu_page *root;
        bool flush = false;
 
        for_each_tdp_mmu_root_yield_safe(kvm, root)
-               flush |= zap_gfn_range(kvm, root, start, end, true);
+               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
 
        return flush;
 }
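A hypothetical caller-side sketch of the contract (gfn_start/gfn_end are placeholders): the accumulated flush is performed once, while mmu_lock is still held:

	if (kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end))
		kvm_flush_remote_tlbs(kvm);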
@@ -775,12 +763,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
                trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
                                     new_spte);
                ret = RET_PF_EMULATE;
-       } else
+       } else {
                trace_kvm_mmu_set_spte(iter->level, iter->gfn,
                                       rcu_dereference(iter->sptep));
+       }
 
-       trace_kvm_mmu_set_spte(iter->level, iter->gfn,
-                              rcu_dereference(iter->sptep));
        if (!prefault)
                vcpu->stat.pf_fixed++;
 
@@ -880,17 +867,15 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        return ret;
 }
 
-static __always_inline int
-kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
-                            unsigned long start,
-                            unsigned long end,
-                            unsigned long data,
-                            int (*handler)(struct kvm *kvm,
-                                           struct kvm_memory_slot *slot,
-                                           struct kvm_mmu_page *root,
-                                           gfn_t start,
-                                           gfn_t end,
-                                           unsigned long data))
+typedef int (*tdp_handler_t)(struct kvm *kvm, struct kvm_memory_slot *slot,
+                            struct kvm_mmu_page *root, gfn_t start, gfn_t end,
+                            unsigned long data);
+
+static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
+                                                       unsigned long start,
+                                                       unsigned long end,
+                                                       unsigned long data,
+                                                       tdp_handler_t handler)
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
@@ -925,12 +910,20 @@ kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
        return ret;
 }
 
+static __always_inline int kvm_tdp_mmu_handle_hva(struct kvm *kvm,
+                                                 unsigned long addr,
+                                                 unsigned long data,
+                                                 tdp_handler_t handler)
+{
+       return kvm_tdp_mmu_handle_hva_range(kvm, addr, addr + 1, data, handler);
+}
+
 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
                                     struct kvm_memory_slot *slot,
                                     struct kvm_mmu_page *root, gfn_t start,
                                     gfn_t end, unsigned long unused)
 {
-       return zap_gfn_range(kvm, root, start, end, false);
+       return zap_gfn_range(kvm, root, start, end, false, false);
 }
 
 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
@@ -998,12 +991,12 @@ int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
 }
 
 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
-                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
-                       unsigned long unused2)
+                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
+                       unsigned long unused)
 {
        struct tdp_iter iter;
 
-       tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
+       tdp_root_for_each_leaf_pte(iter, root, gfn, end)
                if (is_accessed_spte(iter.old_spte))
                        return 1;
 
@@ -1012,8 +1005,7 @@ static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
-                                           test_age_gfn);
+       return kvm_tdp_mmu_handle_hva(kvm, hva, 0, test_age_gfn);
 }
 
 /*
@@ -1023,7 +1015,7 @@ int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
  * Returns non-zero if a flush is needed before releasing the MMU lock.
  */
 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
-                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
+                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
                        unsigned long data)
 {
        struct tdp_iter iter;
@@ -1034,7 +1026,7 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
 
        rcu_read_lock();
 
-       WARN_ON(pte_huge(*ptep));
+       WARN_ON(pte_huge(*ptep) || (gfn + 1) != end);
 
        new_pfn = pte_pfn(*ptep);
 
@@ -1045,10 +1037,14 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
                if (!is_shadow_present_pte(iter.old_spte))
                        break;
 
+               /*
+                * Note, when changing a read-only SPTE, it's not strictly
+                * necessary to zero the SPTE before setting the new PFN, but
+                * doing so preserves the invariant that the PFN of a present
+                * leaf SPTE can never change.  See __handle_changed_spte().
+                */
                tdp_mmu_set_spte(kvm, &iter, 0);
 
-               kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
-
                if (!pte_write(*ptep)) {
                        new_spte = kvm_mmu_changed_pte_notifier_make_spte(
                                        iter.old_spte, new_pfn);
@@ -1070,9 +1066,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
                             pte_t *host_ptep)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
-                                           (unsigned long)host_ptep,
-                                           set_tdp_spte);
+       return kvm_tdp_mmu_handle_hva(kvm, address, (unsigned long)host_ptep,
+                                     set_tdp_spte);
 }
 
 /*
@@ -1332,7 +1327,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
 
 /*
  * Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
  * Returns true if an SPTE was set and a TLB flush is needed.
  */
 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
@@ -1349,7 +1344,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
                        break;
 
                new_spte = iter.old_spte &
-                       ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+                       ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
 
                tdp_mmu_set_spte(kvm, &iter, new_spte);
                spte_set = true;
@@ -1362,7 +1357,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
 
 /*
  * Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
  * Returns true if an SPTE was set and a TLB flush is needed.
  */
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
index 3b761c1..31096ec 100644 (file)
@@ -8,7 +8,29 @@
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end);
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
+                                bool can_yield);
+static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start,
+                                            gfn_t end)
+{
+       return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true);
+}
+static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level);
+
+       /*
+        * Don't allow yielding, as the caller may have a flush pending.  Note,
+        * zapping will never yield in this case (mmu_lock is held for write),
+        * but explicitly disallow it for safety.  The TDP MMU does not yield
+        * until it has made forward progress (steps sideways), and when zapping
+        * a single shadow page that it's guaranteed to see (thus the mmu_lock
+        * requirement), its "step sideways" will always step beyond the bounds
+        * of the shadow page's gfn range and stop iterating before yielding.
+        */
+       lockdep_assert_held_write(&kvm->mmu_lock);
+       return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false);
+}
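A hypothetical caller sketch, assuming mmu_lock is held for write and the shadow page's tdp_mmu_page flag identifies TDP MMU pages (e.g. when reclaiming NX huge pages); the flush is accumulated and performed once later:

	if (sp->tdp_mmu_page)
		flush |= kvm_tdp_mmu_zap_sp(kvm, sp);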
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
 
 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
index 78bdcfa..cd0285f 100644 (file)
@@ -270,7 +270,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
        if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
                return -EINVAL;
 
-       if (!svm->vcpu.arch.apic->regs)
+       if (!vcpu->arch.apic->regs)
                return -EINVAL;
 
        if (kvm_apicv_activated(vcpu->kvm)) {
@@ -281,7 +281,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
                        return ret;
        }
 
-       svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+       svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
 
        /* Setting AVIC backing page address in the phy APIC ID table */
        entry = avic_get_physical_id_entry(vcpu, id);
@@ -315,15 +315,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
        }
 }
 
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
        u32 icrl = svm->vmcb->control.exit_info_1;
        u32 id = svm->vmcb->control.exit_info_2 >> 32;
        u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
-       struct kvm_lapic *apic = svm->vcpu.arch.apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
-       trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+       trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
 
        switch (id) {
        case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
@@ -347,11 +348,11 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
                 * set the appropriate IRR bits on the valid target
                 * vcpus. So, we just need to kick the appropriate vcpu.
                 */
-               avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh);
+               avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh);
                break;
        case AVIC_IPI_FAILURE_INVALID_TARGET:
                WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
-                         index, svm->vcpu.vcpu_id, icrh, icrl);
+                         index, vcpu->vcpu_id, icrh, icrl);
                break;
        case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
                WARN_ONCE(1, "Invalid backing page\n");
@@ -539,8 +540,9 @@ static bool is_avic_unaccelerated_access_trap(u32 offset)
        return ret;
 }
 
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret = 0;
        u32 offset = svm->vmcb->control.exit_info_1 &
                     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
@@ -550,7 +552,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
                     AVIC_UNACCEL_ACCESS_WRITE_MASK;
        bool trap = is_avic_unaccelerated_access_trap(offset);
 
-       trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+       trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
                                            trap, write, vector);
        if (trap) {
                /* Handling Trap */
@@ -558,7 +560,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
                ret = avic_unaccel_trap_write(svm);
        } else {
                /* Handling Fault */
-               ret = kvm_emulate_instruction(&svm->vcpu, 0);
+               ret = kvm_emulate_instruction(vcpu, 0);
        }
 
        return ret;
@@ -572,7 +574,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
        if (!avic || !irqchip_in_kernel(vcpu->kvm))
                return 0;
 
-       ret = avic_init_backing_page(&svm->vcpu);
+       ret = avic_init_backing_page(vcpu);
        if (ret)
                return ret;
 
index 35891d9..9bed484 100644 (file)
@@ -29,6 +29,8 @@
 #include "lapic.h"
 #include "svm.h"
 
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
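CC() wraps KVM_NESTED_VMENTER_CONSISTENCY_CHECK from x86.h (also touched by this series).  Roughly, the macro evaluates the check and reports the failing expression via a tracepoint; a sketch of its shape, not a verbatim quote:

	#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check)	\
	({									\
		bool failed = (consistency_check);				\
		if (failed)							\
			trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
		failed;								\
	})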
 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
                                       struct x86_exception *fault)
 {
@@ -92,12 +94,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *hsave = svm->nested.hsave;
 
        WARN_ON(mmu_is_nested(vcpu));
 
        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
-       kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
+       kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+                               svm->vmcb01.ptr->save.efer,
                                svm->nested.ctl.nested_cr3);
        vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
        vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
@@ -123,7 +125,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
                return;
 
        c = &svm->vmcb->control;
-       h = &svm->nested.hsave->control;
+       h = &svm->vmcb01.ptr->control;
        g = &svm->nested.ctl;
 
        for (i = 0; i < MAX_INTERCEPT; i++)
@@ -233,49 +235,71 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
 
 static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
 {
-       if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
+       if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN)))
                return false;
 
-       if (control->asid == 0)
+       if (CC(control->asid == 0))
                return false;
 
-       if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
-           !npt_enabled)
+       if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
                return false;
 
        return true;
 }
 
-static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
+static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu,
+                                     struct vmcb_save_area *save)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-       bool vmcb12_lma;
+       /*
+        * These checks are also performed by KVM_SET_SREGS,
+        * except that EFER.LMA is not checked by SVM against
+        * CR0.PG && EFER.LME.
+        */
+       if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+               if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+                   CC(!(save->cr0 & X86_CR0_PE)) ||
+                   CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
+                       return false;
+       }
 
-       if ((vmcb12->save.efer & EFER_SVME) == 0)
+       if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
                return false;
 
-       if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
+       return true;
+}
+
+/* Common checks that apply to both L1 and L2 state.  */
+static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu,
+                                   struct vmcb_save_area *save)
+{
+       /*
+        * FIXME: these should be done after copying the fields,
+        * to avoid TOC/TOU races.  For these save area checks
+        * the possible damage is limited since kvm_set_cr0 and
+        * kvm_set_cr4 handle failure; EFER_SVME is an exception
+        * so it is force-set later in nested_prepare_vmcb_save.
+        */
+       if (CC(!(save->efer & EFER_SVME)))
                return false;
 
-       if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
+       if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+           CC(save->cr0 & ~0xffffffffULL))
                return false;
 
-       vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
+       if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
+               return false;
 
-       if (vmcb12_lma) {
-               if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
-                   !(vmcb12->save.cr0 & X86_CR0_PE) ||
-                   kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
-                       return false;
-       }
-       if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
+       if (!nested_vmcb_check_cr3_cr4(vcpu, save))
+               return false;
+
+       if (CC(!kvm_valid_efer(vcpu, save->efer)))
                return false;
 
-       return nested_vmcb_check_controls(&vmcb12->control);
+       return true;
 }
 
-static void load_nested_vmcb_control(struct vcpu_svm *svm,
-                                    struct vmcb_control_area *control)
+static void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
+                                           struct vmcb_control_area *control)
 {
        copy_vmcb_control_area(&svm->nested.ctl, control);
 
@@ -287,9 +311,9 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm,
 
 /*
  * Synchronize fields that are written by the processor, so that
- * they can be copied back into the nested_vmcb.
+ * they can be copied back into the vmcb12.
  */
-void sync_nested_vmcb_control(struct vcpu_svm *svm)
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
 {
        u32 mask;
        svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
@@ -317,8 +341,8 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
  * Transfer any event that L0 or L1 wanted to inject into L2 to
  * EXIT_INT_INFO.
  */
-static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
-                                          struct vmcb *vmcb12)
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+                                               struct vmcb *vmcb12)
 {
        struct kvm_vcpu *vcpu = &svm->vcpu;
        u32 exit_int_info = 0;
@@ -362,12 +386,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
 static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
                               bool nested_npt)
 {
-       if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+       if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
                return -EINVAL;
 
        if (!nested_npt && is_pae_paging(vcpu) &&
            (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
-               if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+               if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
                        return -EINVAL;
        }
 
@@ -386,20 +410,57 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
        return 0;
 }
 
-static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
 {
+       if (!svm->nested.vmcb02.ptr)
+               return;
+
+       /* FIXME: merge g_pat from vmcb01 and vmcb12.  */
+       svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+       bool new_vmcb12 = false;
+
+       nested_vmcb02_compute_g_pat(svm);
+
        /* Load the nested guest state */
-       svm->vmcb->save.es = vmcb12->save.es;
-       svm->vmcb->save.cs = vmcb12->save.cs;
-       svm->vmcb->save.ss = vmcb12->save.ss;
-       svm->vmcb->save.ds = vmcb12->save.ds;
-       svm->vmcb->save.gdtr = vmcb12->save.gdtr;
-       svm->vmcb->save.idtr = vmcb12->save.idtr;
+
+       if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+               new_vmcb12 = true;
+               svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+       }
+
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+               svm->vmcb->save.es = vmcb12->save.es;
+               svm->vmcb->save.cs = vmcb12->save.cs;
+               svm->vmcb->save.ss = vmcb12->save.ss;
+               svm->vmcb->save.ds = vmcb12->save.ds;
+               svm->vmcb->save.cpl = vmcb12->save.cpl;
+               vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+       }
+
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+               svm->vmcb->save.gdtr = vmcb12->save.gdtr;
+               svm->vmcb->save.idtr = vmcb12->save.idtr;
+               vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+       }
+
        kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
-       svm_set_efer(&svm->vcpu, vmcb12->save.efer);
+
+       /*
+        * Force-set EFER_SVME even though it is checked earlier on the
+        * VMCB12, because the guest can flip the bit between the check
+        * and now.  Clearing EFER_SVME would call svm_free_nested.
+        */
+       svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME);
+
        svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
        svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
-       svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
+       svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
        kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
        kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
        kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
@@ -408,15 +469,41 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
        svm->vmcb->save.rax = vmcb12->save.rax;
        svm->vmcb->save.rsp = vmcb12->save.rsp;
        svm->vmcb->save.rip = vmcb12->save.rip;
-       svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
-       svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
-       svm->vmcb->save.cpl = vmcb12->save.cpl;
+
+       /* These bits will be set properly on the first execution when new_vmcb12 is true */
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+               svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
+               svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
+               vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+       }
 }
 
-static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 {
        const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
 
+       /*
+        * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+        * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+        */
+
+       /*
+        * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
+        * avic_physical_id.
+        */
+       WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK);
+
+       /* Copied from vmcb01.  msrpm_base can be overwritten later.  */
+       svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
+       svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
+       svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa;
+
+       /* Done at vmrun: asid.  */
+
+       /* Also overwritten later if necessary.  */
+       svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+       /* nested_cr3.  */
        if (nested_npt_enabled(svm))
                nested_svm_init_mmu_context(&svm->vcpu);
 
@@ -425,7 +512,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
 
        svm->vmcb->control.int_ctl             =
                (svm->nested.ctl.int_ctl & ~mask) |
-               (svm->nested.hsave->control.int_ctl & mask);
+               (svm->vmcb01.ptr->control.int_ctl & mask);
 
        svm->vmcb->control.virt_ext            = svm->nested.ctl.virt_ext;
        svm->vmcb->control.int_vector          = svm->nested.ctl.int_vector;
@@ -440,17 +527,28 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
        enter_guest_mode(&svm->vcpu);
 
        /*
-        * Merge guest and host intercepts - must be called  with vcpu in
-        * guest-mode to take affect here
+        * Merge guest and host intercepts - must be called with vcpu in
+        * guest-mode to take effect.
         */
        recalc_intercepts(svm);
+}
 
-       vmcb_mark_all_dirty(svm->vmcb);
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+       /*
+        * Some VMCB state is shared between L1 and L2 and thus has to be
+        * moved at the time of nested vmrun and vmexit.
+        *
+        * VMLOAD/VMSAVE state would also belong in this category, but KVM
+        * always performs VMLOAD and VMSAVE from the VMCB01.
+        */
+       to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
 }
 
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
                         struct vmcb *vmcb12)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret;
 
        trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
@@ -468,9 +566,14 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
 
 
        svm->nested.vmcb12_gpa = vmcb12_gpa;
-       load_nested_vmcb_control(svm, &vmcb12->control);
-       nested_prepare_vmcb_control(svm);
-       nested_prepare_vmcb_save(svm, vmcb12);
+
+       WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+       nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+       svm_switch_vmcb(svm, &svm->nested.vmcb02);
+       nested_vmcb02_prepare_control(svm);
+       nested_vmcb02_prepare_save(svm, vmcb12);
 
        ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
                                  nested_npt_enabled(svm));
@@ -478,44 +581,48 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
                return ret;
 
        if (!npt_enabled)
-               svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
+               vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
 
        svm_set_gif(svm, true);
 
        return 0;
 }
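For context, svm_switch_vmcb() used above is defined in svm.c and not shown in these hunks.  The self-contained sketch below (the *_sketch types are simplified stand-ins, not KVM's real layout) captures the idea of retargeting the vcpu between vmcb01 and vmcb02:

	struct vmcb;				/* opaque for this sketch */

	struct vmcb_info_sketch {
		struct vmcb *ptr;		/* kernel mapping of the VMCB page */
		unsigned long pa;		/* physical address handed to VMRUN */
	};

	struct vcpu_svm_sketch {
		struct vmcb *vmcb;		/* what "svm->vmcb" points at */
		unsigned long vmcb_pa;
		struct vmcb_info_sketch vmcb01;	/* L1's VMCB */
		struct vmcb_info_sketch vmcb02;	/* VMCB used while running L2 */
	};

	/* vmrun switches to vmcb02; vmexit switches back to vmcb01. */
	static void switch_vmcb_sketch(struct vcpu_svm_sketch *svm,
				       struct vmcb_info_sketch *target)
	{
		svm->vmcb = target->ptr;
		svm->vmcb_pa = target->pa;
	}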
 
-int nested_svm_vmrun(struct vcpu_svm *svm)
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret;
        struct vmcb *vmcb12;
-       struct vmcb *hsave = svm->nested.hsave;
-       struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
        u64 vmcb12_gpa;
 
-       if (is_smm(&svm->vcpu)) {
-               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       ++vcpu->stat.nested_run;
+
+       if (is_smm(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
 
        vmcb12_gpa = svm->vmcb->save.rax;
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+       ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
        if (ret == -EINVAL) {
-               kvm_inject_gp(&svm->vcpu, 0);
+               kvm_inject_gp(vcpu, 0);
                return 1;
        } else if (ret) {
-               return kvm_skip_emulated_instruction(&svm->vcpu);
+               return kvm_skip_emulated_instruction(vcpu);
        }
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+       ret = kvm_skip_emulated_instruction(vcpu);
 
        vmcb12 = map.hva;
 
        if (WARN_ON_ONCE(!svm->nested.initialized))
                return -EINVAL;
 
-       if (!nested_vmcb_checks(svm, vmcb12)) {
+       nested_load_control_from_vmcb12(svm, &vmcb12->control);
+
+       if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) ||
+           !nested_vmcb_check_controls(&svm->nested.ctl)) {
                vmcb12->control.exit_code    = SVM_EXIT_ERR;
                vmcb12->control.exit_code_hi = 0;
                vmcb12->control.exit_info_1  = 0;
@@ -525,36 +632,25 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 
 
        /* Clear internal status */
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 
        /*
-        * Save the old vmcb, so we don't need to pick what we save, but can
-        * restore everything when a VMEXIT occurs
+        * Since vmcb01 is not in use, we can use it to store some of the L1
+        * state.
         */
-       hsave->save.es     = vmcb->save.es;
-       hsave->save.cs     = vmcb->save.cs;
-       hsave->save.ss     = vmcb->save.ss;
-       hsave->save.ds     = vmcb->save.ds;
-       hsave->save.gdtr   = vmcb->save.gdtr;
-       hsave->save.idtr   = vmcb->save.idtr;
-       hsave->save.efer   = svm->vcpu.arch.efer;
-       hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       hsave->save.cr4    = svm->vcpu.arch.cr4;
-       hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
-       hsave->save.rip    = kvm_rip_read(&svm->vcpu);
-       hsave->save.rsp    = vmcb->save.rsp;
-       hsave->save.rax    = vmcb->save.rax;
-       if (npt_enabled)
-               hsave->save.cr3    = vmcb->save.cr3;
-       else
-               hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
-
-       copy_vmcb_control_area(&hsave->control, &vmcb->control);
+       svm->vmcb01.ptr->save.efer   = vcpu->arch.efer;
+       svm->vmcb01.ptr->save.cr0    = kvm_read_cr0(vcpu);
+       svm->vmcb01.ptr->save.cr4    = vcpu->arch.cr4;
+       svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu);
+       svm->vmcb01.ptr->save.rip    = kvm_rip_read(vcpu);
+
+       if (!npt_enabled)
+               svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu);
 
        svm->nested.nested_run_pending = 1;
 
-       if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
+       if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12))
                goto out_exit_err;
 
        if (nested_svm_vmrun_msrpm(svm))
@@ -571,7 +667,7 @@ out_exit_err:
        nested_svm_vmexit(svm);
 
 out:
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       kvm_vcpu_unmap(vcpu, &map, true);
 
        return ret;
 }
@@ -594,27 +690,30 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
 
 int nested_svm_vmexit(struct vcpu_svm *svm)
 {
-       int rc;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
        struct vmcb *vmcb12;
-       struct vmcb *hsave = svm->nested.hsave;
        struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
+       int rc;
+
+       /* Triple faults in L2 should never escape. */
+       WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
 
-       rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+       rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
        if (rc) {
                if (rc == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
+                       kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
        vmcb12 = map.hva;
 
        /* Exit Guest-Mode */
-       leave_guest_mode(&svm->vcpu);
+       leave_guest_mode(vcpu);
        svm->nested.vmcb12_gpa = 0;
        WARN_ON_ONCE(svm->nested.nested_run_pending);
 
-       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
        /* in case we halted in L2 */
        svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -628,14 +727,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->save.gdtr   = vmcb->save.gdtr;
        vmcb12->save.idtr   = vmcb->save.idtr;
        vmcb12->save.efer   = svm->vcpu.arch.efer;
-       vmcb12->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       vmcb12->save.cr3    = kvm_read_cr3(&svm->vcpu);
+       vmcb12->save.cr0    = kvm_read_cr0(vcpu);
+       vmcb12->save.cr3    = kvm_read_cr3(vcpu);
        vmcb12->save.cr2    = vmcb->save.cr2;
        vmcb12->save.cr4    = svm->vcpu.arch.cr4;
-       vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
-       vmcb12->save.rip    = kvm_rip_read(&svm->vcpu);
-       vmcb12->save.rsp    = kvm_rsp_read(&svm->vcpu);
-       vmcb12->save.rax    = kvm_rax_read(&svm->vcpu);
+       vmcb12->save.rflags = kvm_get_rflags(vcpu);
+       vmcb12->save.rip    = kvm_rip_read(vcpu);
+       vmcb12->save.rsp    = kvm_rsp_read(vcpu);
+       vmcb12->save.rax    = kvm_rax_read(vcpu);
        vmcb12->save.dr7    = vmcb->save.dr7;
        vmcb12->save.dr6    = svm->vcpu.arch.dr6;
        vmcb12->save.cpl    = vmcb->save.cpl;
@@ -647,7 +746,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->control.exit_info_2       = vmcb->control.exit_info_2;
 
        if (vmcb12->control.exit_code != SVM_EXIT_ERR)
-               nested_vmcb_save_pending_event(svm, vmcb12);
+               nested_save_pending_event_to_vmcb12(svm, vmcb12);
 
        if (svm->nrips_enabled)
                vmcb12->control.next_rip  = vmcb->control.next_rip;
@@ -662,37 +761,38 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->control.pause_filter_thresh =
                svm->vmcb->control.pause_filter_thresh;
 
-       /* Restore the original control entries */
-       copy_vmcb_control_area(&vmcb->control, &hsave->control);
+       nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+
+       svm_switch_vmcb(svm, &svm->vmcb01);
 
-       /* On vmexit the  GIF is set to false */
+       /*
+        * On vmexit the GIF is set to false and
+        * no event can be injected in L1.
+        */
        svm_set_gif(svm, false);
+       svm->vmcb->control.exit_int_info = 0;
 
-       svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
-               svm->vcpu.arch.l1_tsc_offset;
+       svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+       if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+               svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+               vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+       }
 
        svm->nested.ctl.nested_cr3 = 0;
 
-       /* Restore selected save entries */
-       svm->vmcb->save.es = hsave->save.es;
-       svm->vmcb->save.cs = hsave->save.cs;
-       svm->vmcb->save.ss = hsave->save.ss;
-       svm->vmcb->save.ds = hsave->save.ds;
-       svm->vmcb->save.gdtr = hsave->save.gdtr;
-       svm->vmcb->save.idtr = hsave->save.idtr;
-       kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
-       kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
-       svm_set_efer(&svm->vcpu, hsave->save.efer);
-       svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
-       svm_set_cr4(&svm->vcpu, hsave->save.cr4);
-       kvm_rax_write(&svm->vcpu, hsave->save.rax);
-       kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
-       kvm_rip_write(&svm->vcpu, hsave->save.rip);
-       svm->vmcb->save.dr7 = DR7_FIXED_1;
-       svm->vmcb->save.cpl = 0;
-       svm->vmcb->control.exit_int_info = 0;
+       /*
+        * Restore processor state that had been saved in vmcb01
+        */
+       kvm_set_rflags(vcpu, svm->vmcb->save.rflags);
+       svm_set_efer(vcpu, svm->vmcb->save.efer);
+       svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
+       svm_set_cr4(vcpu, svm->vmcb->save.cr4);
+       kvm_rax_write(vcpu, svm->vmcb->save.rax);
+       kvm_rsp_write(vcpu, svm->vmcb->save.rsp);
+       kvm_rip_write(vcpu, svm->vmcb->save.rip);
 
-       vmcb_mark_all_dirty(svm->vmcb);
+       svm->vcpu.arch.dr7 = DR7_FIXED_1;
+       kvm_update_dr7(&svm->vcpu);
 
        trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
                                       vmcb12->control.exit_info_1,
@@ -701,50 +801,53 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
                                       vmcb12->control.exit_int_info_err,
                                       KVM_ISA_SVM);
 
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       kvm_vcpu_unmap(vcpu, &map, true);
 
-       nested_svm_uninit_mmu_context(&svm->vcpu);
+       nested_svm_uninit_mmu_context(vcpu);
 
-       rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
+       rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false);
        if (rc)
                return 1;
 
-       if (npt_enabled)
-               svm->vmcb->save.cr3 = hsave->save.cr3;
-
        /*
         * Drop what we picked up for L2 via svm_complete_interrupts() so it
         * doesn't end up in L1.
         */
        svm->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 
        return 0;
 }
 
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+       nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
+}
+
 int svm_allocate_nested(struct vcpu_svm *svm)
 {
-       struct page *hsave_page;
+       struct page *vmcb02_page;
 
        if (svm->nested.initialized)
                return 0;
 
-       hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-       if (!hsave_page)
+       vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+       if (!vmcb02_page)
                return -ENOMEM;
-       svm->nested.hsave = page_address(hsave_page);
+       svm->nested.vmcb02.ptr = page_address(vmcb02_page);
+       svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
 
        svm->nested.msrpm = svm_vcpu_alloc_msrpm();
        if (!svm->nested.msrpm)
-               goto err_free_hsave;
+               goto err_free_vmcb02;
        svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
 
        svm->nested.initialized = true;
        return 0;
 
-err_free_hsave:
-       __free_page(hsave_page);
+err_free_vmcb02:
+       __free_page(vmcb02_page);
        return -ENOMEM;
 }
 
@@ -756,8 +859,8 @@ void svm_free_nested(struct vcpu_svm *svm)
        svm_vcpu_free_msrpm(svm->nested.msrpm);
        svm->nested.msrpm = NULL;
 
-       __free_page(virt_to_page(svm->nested.hsave));
-       svm->nested.hsave = NULL;
+       __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+       svm->nested.vmcb02.ptr = NULL;
 
        svm->nested.initialized = false;
 }
@@ -767,18 +870,19 @@ void svm_free_nested(struct vcpu_svm *svm)
  */
 void svm_leave_nested(struct vcpu_svm *svm)
 {
-       if (is_guest_mode(&svm->vcpu)) {
-               struct vmcb *hsave = svm->nested.hsave;
-               struct vmcb *vmcb = svm->vmcb;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
 
+       if (is_guest_mode(vcpu)) {
                svm->nested.nested_run_pending = 0;
-               leave_guest_mode(&svm->vcpu);
-               copy_vmcb_control_area(&vmcb->control, &hsave->control);
-               nested_svm_uninit_mmu_context(&svm->vcpu);
+               leave_guest_mode(vcpu);
+
+               svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+               nested_svm_uninit_mmu_context(vcpu);
                vmcb_mark_all_dirty(svm->vmcb);
        }
 
-       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 }
 
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -887,16 +991,15 @@ int nested_svm_exit_handled(struct vcpu_svm *svm)
        return vmexit;
 }
 
-int nested_svm_check_permissions(struct vcpu_svm *svm)
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
 {
-       if (!(svm->vcpu.arch.efer & EFER_SVME) ||
-           !is_paging(&svm->vcpu)) {
-               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
 
-       if (svm->vmcb->save.cpl) {
-               kvm_inject_gp(&svm->vcpu, 0);
+       if (to_svm(vcpu)->vmcb->save.cpl) {
+               kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
@@ -944,50 +1047,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
        nested_svm_vmexit(svm);
 }
 
-static void nested_svm_smi(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code = SVM_EXIT_SMI;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-static void nested_svm_nmi(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code = SVM_EXIT_NMI;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-static void nested_svm_intr(struct vcpu_svm *svm)
-{
-       trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-
-       svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
 static inline bool nested_exit_on_init(struct vcpu_svm *svm)
 {
        return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
 }
 
-static void nested_svm_init(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code   = SVM_EXIT_INIT;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-
 static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1001,7 +1065,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_init(svm))
                        return 0;
-               nested_svm_init(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
                return 0;
        }
 
@@ -1019,7 +1083,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_smi(svm))
                        return 0;
-               nested_svm_smi(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
                return 0;
        }
 
@@ -1028,7 +1092,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_nmi(svm))
                        return 0;
-               nested_svm_nmi(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
                return 0;
        }
 
@@ -1037,7 +1101,8 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_intr(svm))
                        return 0;
-               nested_svm_intr(svm);
+               trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
                return 0;
        }
 
@@ -1056,8 +1121,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
        case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
 
-               if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
-                               excp_bits)
+               if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+                   excp_bits)
                        return NESTED_EXIT_HOST;
                else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
                         svm->vcpu.arch.apf.host_apf_flags)
@@ -1121,10 +1186,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
        if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
                         sizeof(user_vmcb->control)))
                return -EFAULT;
-       if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
+       if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
                         sizeof(user_vmcb->save)))
                return -EFAULT;
-
 out:
        return kvm_state.size;
 }
@@ -1134,7 +1198,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                                struct kvm_nested_state *kvm_state)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *hsave = svm->nested.hsave;
        struct vmcb __user *user_vmcb = (struct vmcb __user *)
                &user_kvm_nested_state->data.svm[0];
        struct vmcb_control_area *ctl;
@@ -1196,7 +1259,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 
        /*
         * Processor state contains L2 state.  Check that it is
-        * valid for guest mode (see nested_vmcb_checks).
+        * valid for guest mode (see nested_vmcb_check_save).
         */
        cr0 = kvm_read_cr0(vcpu);
         if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
@@ -1205,27 +1268,48 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
        /*
         * Validate host state saved from before VMRUN (see
         * nested_svm_check_permissions).
-        * TODO: validate reserved bits for all saved state.
         */
-       if (!(save->cr0 & X86_CR0_PG))
+       if (!(save->cr0 & X86_CR0_PG) ||
+           !(save->cr0 & X86_CR0_PE) ||
+           (save->rflags & X86_EFLAGS_VM) ||
+           !nested_vmcb_valid_sregs(vcpu, save))
                goto out_free;
 
        /*
-        * All checks done, we can enter guest mode.  L1 control fields
-        * come from the nested save state.  Guest state is already
-        * in the registers, the save area of the nested state instead
-        * contains saved L1 state.
+        * All checks done, we can enter guest mode. Userspace provides
+        * vmcb12.control, which will be combined with L1 state and stored
+        * into vmcb02, and the L1 save state, which we store in vmcb01.
+        * If needed, the L2 registers are moved from the current VMCB to
+        * vmcb02.
         */
 
        svm->nested.nested_run_pending =
                !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
 
-       copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
-       hsave->save = *save;
-
        svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
-       load_nested_vmcb_control(svm, ctl);
-       nested_prepare_vmcb_control(svm);
+       if (svm->current_vmcb == &svm->vmcb01)
+               svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+       svm->vmcb01.ptr->save.es = save->es;
+       svm->vmcb01.ptr->save.cs = save->cs;
+       svm->vmcb01.ptr->save.ss = save->ss;
+       svm->vmcb01.ptr->save.ds = save->ds;
+       svm->vmcb01.ptr->save.gdtr = save->gdtr;
+       svm->vmcb01.ptr->save.idtr = save->idtr;
+       svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED;
+       svm->vmcb01.ptr->save.efer = save->efer;
+       svm->vmcb01.ptr->save.cr0 = save->cr0;
+       svm->vmcb01.ptr->save.cr3 = save->cr3;
+       svm->vmcb01.ptr->save.cr4 = save->cr4;
+       svm->vmcb01.ptr->save.rax = save->rax;
+       svm->vmcb01.ptr->save.rsp = save->rsp;
+       svm->vmcb01.ptr->save.rip = save->rip;
+       svm->vmcb01.ptr->save.cpl = 0;
+
+       nested_load_control_from_vmcb12(svm, ctl);
+
+       svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+       nested_vmcb02_prepare_control(svm);
 
        kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
        ret = 0;
@@ -1238,6 +1322,7 @@ out_free:
 
 struct kvm_x86_nested_ops svm_nested_ops = {
        .check_events = svm_check_nested_events,
+       .triple_fault = nested_svm_triple_fault,
        .get_nested_state_pages = svm_get_nested_state_pages,
        .get_state = svm_get_nested_state,
        .set_state = svm_set_nested_state,
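
The restore path above now applies the same sanity checks to the saved L1 state
that VMRUN itself requires: CR0.PG and CR0.PE set, EFLAGS.VM clear, plus the
segment checks done by nested_vmcb_valid_sregs(). As a stand-alone sketch of the
first three conditions (architectural bit values; the helper name is invented
for illustration and the sregs check is omitted):

#include <stdbool.h>
#include <stdint.h>

#define X86_CR0_PE      (1u << 0)
#define X86_CR0_PG      (1u << 31)
#define X86_EFLAGS_VM   (1u << 17)

/* Illustrative only: mirrors the "!(cr0 & PG) || !(cr0 & PE) || (rflags & VM)"
 * test added to svm_set_nested_state(). */
static inline bool l1_state_plausible(uint64_t cr0, uint64_t rflags)
{
        /* VMRUN is only legal from paged protected mode, outside virtual-8086. */
        return (cr0 & X86_CR0_PG) && (cr0 & X86_CR0_PE) &&
               !(rflags & X86_EFLAGS_VM);
}
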
index 874ea30..83e00e5 100644 (file)
@@ -1849,7 +1849,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
                vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
                vcpu->arch.regs[VCPU_REGS_RCX] = 0;
 
-               ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID);
+               ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
                if (!ret) {
                        ret = -EINVAL;
                        break;
@@ -1899,8 +1899,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
        return ret;
 }
 
-int sev_handle_vmgexit(struct vcpu_svm *svm)
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        u64 ghcb_gpa, exit_code;
        struct ghcb *ghcb;
@@ -1912,13 +1913,13 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                return sev_handle_vmgexit_msr_protocol(svm);
 
        if (!ghcb_gpa) {
-               vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n");
+               vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
                return -EINVAL;
        }
 
-       if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
+       if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
                /* Unable to map GHCB from guest */
-               vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+               vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
                            ghcb_gpa);
                return -EINVAL;
        }
@@ -1926,7 +1927,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
        svm->ghcb = svm->ghcb_map.hva;
        ghcb = svm->ghcb_map.hva;
 
-       trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb);
+       trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
 
        exit_code = ghcb_get_sw_exit_code(ghcb);
 
@@ -1944,7 +1945,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
                        break;
 
-               ret = kvm_sev_es_mmio_read(&svm->vcpu,
+               ret = kvm_sev_es_mmio_read(vcpu,
                                           control->exit_info_1,
                                           control->exit_info_2,
                                           svm->ghcb_sa);
@@ -1953,19 +1954,19 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
                        break;
 
-               ret = kvm_sev_es_mmio_write(&svm->vcpu,
+               ret = kvm_sev_es_mmio_write(vcpu,
                                            control->exit_info_1,
                                            control->exit_info_2,
                                            svm->ghcb_sa);
                break;
        case SVM_VMGEXIT_NMI_COMPLETE:
-               ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET);
+               ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
                break;
        case SVM_VMGEXIT_AP_HLT_LOOP:
-               ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+               ret = kvm_emulate_ap_reset_hold(vcpu);
                break;
        case SVM_VMGEXIT_AP_JUMP_TABLE: {
-               struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+               struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
 
                switch (control->exit_info_1) {
                case 0:
@@ -1990,12 +1991,12 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                break;
        }
        case SVM_VMGEXIT_UNSUPPORTED_EVENT:
-               vcpu_unimpl(&svm->vcpu,
+               vcpu_unimpl(vcpu,
                            "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
                            control->exit_info_1, control->exit_info_2);
                break;
        default:
-               ret = svm_invoke_exit_handler(svm, exit_code);
+               ret = svm_invoke_exit_handler(vcpu, exit_code);
        }
 
        return ret;
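
Most of the churn in this file (and in svm.c below) comes from switching the
exit handlers from struct vcpu_svm * to struct kvm_vcpu * parameters; the
vendor-specific structure is recovered with to_svm(), which is a container_of()
style downcast. A self-contained sketch of that pattern, with illustrative
struct names rather than the kernel's:

#include <stddef.h>

struct kvm_vcpu_model { int id; };

struct vcpu_svm_model {
        struct kvm_vcpu_model vcpu;     /* generic part, embedded member */
        int asid;
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Same idea as to_svm(): recover the containing structure from a pointer
 * to its embedded generic vcpu. */
static inline struct vcpu_svm_model *to_svm_model(struct kvm_vcpu_model *vcpu)
{
        return container_of(vcpu, struct vcpu_svm_model, vcpu);
}
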
index 58a45bb..2711964 100644 (file)
@@ -279,7 +279,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
                         * In this case we will return to the nested guest
                         * as soon as we leave SMM.
                         */
-                       if (!is_smm(&svm->vcpu))
+                       if (!is_smm(vcpu))
                                svm_free_nested(svm);
 
                } else {
@@ -363,10 +363,10 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
        bool has_error_code = vcpu->arch.exception.has_error_code;
        u32 error_code = vcpu->arch.exception.error_code;
 
-       kvm_deliver_exception_payload(&svm->vcpu);
+       kvm_deliver_exception_payload(vcpu);
 
        if (nr == BP_VECTOR && !nrips) {
-               unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+               unsigned long rip, old_rip = kvm_rip_read(vcpu);
 
                /*
                 * For guest debugging where we have to reinject #BP if some
@@ -375,8 +375,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
                 * raises a fault that is not intercepted. Still better than
                 * failing in all cases.
                 */
-               (void)skip_emulated_instruction(&svm->vcpu);
-               rip = kvm_rip_read(&svm->vcpu);
+               (void)skip_emulated_instruction(vcpu);
+               rip = kvm_rip_read(vcpu);
                svm->int3_rip = rip + svm->vmcb->save.cs.base;
                svm->int3_injected = rip - old_rip;
        }
@@ -881,7 +881,7 @@ static __init void svm_adjust_mmio_mask(void)
         */
        mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
 
-       kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK);
+       kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
 }
 
 static void svm_hardware_teardown(void)
@@ -1084,8 +1084,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        if (is_guest_mode(vcpu)) {
                /* Write L1's TSC offset.  */
                g_tsc_offset = svm->vmcb->control.tsc_offset -
-                              svm->nested.hsave->control.tsc_offset;
-               svm->nested.hsave->control.tsc_offset = offset;
+                              svm->vmcb01.ptr->control.tsc_offset;
+               svm->vmcb01.ptr->control.tsc_offset = offset;
        }
 
        trace_kvm_write_tsc_offset(vcpu->vcpu_id,
@@ -1113,12 +1113,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm)
        }
 }
 
-static void init_vmcb(struct vcpu_svm *svm)
+static void init_vmcb(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
 
-       svm->vcpu.arch.hflags = 0;
+       vcpu->arch.hflags = 0;
 
        svm_set_intercept(svm, INTERCEPT_CR0_READ);
        svm_set_intercept(svm, INTERCEPT_CR3_READ);
@@ -1126,7 +1127,7 @@ static void init_vmcb(struct vcpu_svm *svm)
        svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
-       if (!kvm_vcpu_apicv_active(&svm->vcpu))
+       if (!kvm_vcpu_apicv_active(vcpu))
                svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
 
        set_dr_intercepts(svm);
@@ -1170,12 +1171,12 @@ static void init_vmcb(struct vcpu_svm *svm)
        svm_set_intercept(svm, INTERCEPT_RDPRU);
        svm_set_intercept(svm, INTERCEPT_RSM);
 
-       if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
+       if (!kvm_mwait_in_guest(vcpu->kvm)) {
                svm_set_intercept(svm, INTERCEPT_MONITOR);
                svm_set_intercept(svm, INTERCEPT_MWAIT);
        }
 
-       if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+       if (!kvm_hlt_in_guest(vcpu->kvm))
                svm_set_intercept(svm, INTERCEPT_HLT);
 
        control->iopm_base_pa = __sme_set(iopm_base);
@@ -1201,19 +1202,19 @@ static void init_vmcb(struct vcpu_svm *svm)
        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
-       svm_set_cr4(&svm->vcpu, 0);
-       svm_set_efer(&svm->vcpu, 0);
+       svm_set_cr4(vcpu, 0);
+       svm_set_efer(vcpu, 0);
        save->dr6 = 0xffff0ff0;
-       kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED);
+       kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
        save->rip = 0x0000fff0;
-       svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
+       vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;
 
        /*
         * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
         * It also updates the guest-visible cr0 value.
         */
-       svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
-       kvm_mmu_reset_context(&svm->vcpu);
+       svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+       kvm_mmu_reset_context(vcpu);
 
        save->cr4 = X86_CR4_PAE;
        /* rdx = ?? */
@@ -1225,17 +1226,18 @@ static void init_vmcb(struct vcpu_svm *svm)
                clr_exception_intercept(svm, PF_VECTOR);
                svm_clr_intercept(svm, INTERCEPT_CR3_READ);
                svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
-               save->g_pat = svm->vcpu.arch.pat;
+               save->g_pat = vcpu->arch.pat;
                save->cr3 = 0;
                save->cr4 = 0;
        }
-       svm->asid_generation = 0;
+       svm->current_vmcb->asid_generation = 0;
        svm->asid = 0;
 
        svm->nested.vmcb12_gpa = 0;
-       svm->vcpu.arch.hflags = 0;
+       svm->nested.last_vmcb12_gpa = 0;
+       vcpu->arch.hflags = 0;
 
-       if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
+       if (!kvm_pause_in_guest(vcpu->kvm)) {
                control->pause_filter_count = pause_filter_count;
                if (pause_filter_thresh)
                        control->pause_filter_thresh = pause_filter_thresh;
@@ -1246,7 +1248,14 @@ static void init_vmcb(struct vcpu_svm *svm)
 
        svm_check_invpcid(svm);
 
-       if (kvm_vcpu_apicv_active(&svm->vcpu))
+       /*
+        * If the host supports V_SPEC_CTRL, disable the interception of
+        * MSR_IA32_SPEC_CTRL.
+        */
+       if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+
+       if (kvm_vcpu_apicv_active(vcpu))
                avic_init_vmcb(svm);
 
        /*
@@ -1265,11 +1274,11 @@ static void init_vmcb(struct vcpu_svm *svm)
                svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
        }
 
-       if (sev_guest(svm->vcpu.kvm)) {
+       if (sev_guest(vcpu->kvm)) {
                svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
                clr_exception_intercept(svm, UD_VECTOR);
 
-               if (sev_es_guest(svm->vcpu.kvm)) {
+               if (sev_es_guest(vcpu->kvm)) {
                        /* Perform SEV-ES specific VMCB updates */
                        sev_es_init_vmcb(svm);
                }
@@ -1291,12 +1300,12 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        svm->virt_spec_ctrl = 0;
 
        if (!init_event) {
-               svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
-                                          MSR_IA32_APICBASE_ENABLE;
-               if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
-                       svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+               vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+                                      MSR_IA32_APICBASE_ENABLE;
+               if (kvm_vcpu_is_reset_bsp(vcpu))
+                       vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
        }
-       init_vmcb(svm);
+       init_vmcb(vcpu);
 
        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
        kvm_rdx_write(vcpu, eax);
@@ -1305,10 +1314,25 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
 }
 
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
+{
+       svm->current_vmcb = target_vmcb;
+       svm->vmcb = target_vmcb->ptr;
+       svm->vmcb_pa = target_vmcb->pa;
+
+       /*
+        * Track the physical CPU the target_vmcb is running on
+        * in order to mark the VMCB dirty if the cpu changes at
+        * its next vmrun.
+        */
+
+       svm->current_vmcb->cpu = svm->vcpu.cpu;
+}
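
svm_switch_vmcb() records the physical CPU per VMCB rather than per vCPU, which
is why the ASID generation becomes a per-VMCB field and the all-dirty marking
disappears from svm_vcpu_load() below: each VMCB only needs to be flushed when
it, specifically, is about to run on a different CPU (the check happens on the
run path, not shown in this hunk). A minimal model of that bookkeeping, with
illustrative field names rather than the kernel's types:

/* Sketch: treat every cached VMCB field as dirty whenever this VMCB is
 * about to run on a different physical CPU than last time. */
struct vmcb_info_model {
        int cpu;                        /* CPU this VMCB last ran on */
        unsigned int clean_bits;        /* stand-in for vmcb->control.clean */
};

static void prepare_for_run(struct vmcb_info_model *vmcb, int this_cpu)
{
        if (vmcb->cpu != this_cpu) {
                vmcb->clean_bits = 0;   /* i.e. vmcb_mark_all_dirty() */
                vmcb->cpu = this_cpu;
        }
}
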
+
 static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm;
-       struct page *vmcb_page;
+       struct page *vmcb01_page;
        struct page *vmsa_page = NULL;
        int err;
 
@@ -1316,11 +1340,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
        svm = to_svm(vcpu);
 
        err = -ENOMEM;
-       vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-       if (!vmcb_page)
+       vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+       if (!vmcb01_page)
                goto out;
 
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                /*
                 * SEV-ES guests require a separate VMSA page used to contain
                 * the encrypted register state of the guest.
@@ -1356,20 +1380,21 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 
        svm_vcpu_init_msrpm(vcpu, svm->msrpm);
 
-       svm->vmcb = page_address(vmcb_page);
-       svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
+       svm->vmcb01.ptr = page_address(vmcb01_page);
+       svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
 
        if (vmsa_page)
                svm->vmsa = page_address(vmsa_page);
 
-       svm->asid_generation = 0;
        svm->guest_state_loaded = false;
-       init_vmcb(svm);
+
+       svm_switch_vmcb(svm, &svm->vmcb01);
+       init_vmcb(vcpu);
 
        svm_init_osvw(vcpu);
        vcpu->arch.microcode_version = 0x01000065;
 
-       if (sev_es_guest(svm->vcpu.kvm))
+       if (sev_es_guest(vcpu->kvm))
                /* Perform SEV-ES specific VMCB creation updates */
                sev_es_create_vcpu(svm);
 
@@ -1379,7 +1404,7 @@ error_free_vmsa_page:
        if (vmsa_page)
                __free_page(vmsa_page);
 error_free_vmcb_page:
-       __free_page(vmcb_page);
+       __free_page(vmcb01_page);
 out:
        return err;
 }
@@ -1407,7 +1432,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 
        sev_free_vcpu(vcpu);
 
-       __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
+       __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
        __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
 }
 
@@ -1432,7 +1457,7 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
         * Save additional host state that will be restored on VMEXIT (sev-es)
         * or subsequent vmload of host save area.
         */
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                sev_es_prepare_guest_switch(svm, vcpu->cpu);
        } else {
                vmsave(__sme_page_pa(sd->save_area));
@@ -1476,11 +1501,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 
-       if (unlikely(cpu != vcpu->cpu)) {
-               svm->asid_generation = 0;
-               vmcb_mark_all_dirty(svm->vmcb);
-       }
-
        if (sd->current_vmcb != svm->vmcb) {
                sd->current_vmcb = svm->vmcb;
                indirect_branch_prediction_barrier();
@@ -1564,7 +1584,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
        /* Drop int_ctl fields related to VINTR injection.  */
        svm->vmcb->control.int_ctl &= mask;
        if (is_guest_mode(&svm->vcpu)) {
-               svm->nested.hsave->control.int_ctl &= mask;
+               svm->vmcb01.ptr->control.int_ctl &= mask;
 
                WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
                        (svm->nested.ctl.int_ctl & V_TPR_MASK));
@@ -1577,16 +1597,17 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
 {
        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+       struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
 
        switch (seg) {
        case VCPU_SREG_CS: return &save->cs;
        case VCPU_SREG_DS: return &save->ds;
        case VCPU_SREG_ES: return &save->es;
-       case VCPU_SREG_FS: return &save->fs;
-       case VCPU_SREG_GS: return &save->gs;
+       case VCPU_SREG_FS: return &save01->fs;
+       case VCPU_SREG_GS: return &save01->gs;
        case VCPU_SREG_SS: return &save->ss;
-       case VCPU_SREG_TR: return &save->tr;
-       case VCPU_SREG_LDTR: return &save->ldtr;
+       case VCPU_SREG_TR: return &save01->tr;
+       case VCPU_SREG_LDTR: return &save01->ldtr;
        }
        BUG();
        return NULL;
@@ -1709,37 +1730,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
        vmcb_mark_dirty(svm->vmcb, VMCB_DT);
 }
 
-static void update_cr0_intercept(struct vcpu_svm *svm)
-{
-       ulong gcr0;
-       u64 *hcr0;
-
-       /*
-        * SEV-ES guests must always keep the CR intercepts cleared. CR
-        * tracking is done using the CR write traps.
-        */
-       if (sev_es_guest(svm->vcpu.kvm))
-               return;
-
-       gcr0 = svm->vcpu.arch.cr0;
-       hcr0 = &svm->vmcb->save.cr0;
-       *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
-               | (gcr0 & SVM_CR0_SELECTIVE_MASK);
-
-       vmcb_mark_dirty(svm->vmcb, VMCB_CR);
-
-       if (gcr0 == *hcr0) {
-               svm_clr_intercept(svm, INTERCEPT_CR0_READ);
-               svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
-       } else {
-               svm_set_intercept(svm, INTERCEPT_CR0_READ);
-               svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
-       }
-}
-
 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       u64 hcr0 = cr0;
 
 #ifdef CONFIG_X86_64
        if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
@@ -1757,7 +1751,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        vcpu->arch.cr0 = cr0;
 
        if (!npt_enabled)
-               cr0 |= X86_CR0_PG | X86_CR0_WP;
+               hcr0 |= X86_CR0_PG | X86_CR0_WP;
 
        /*
         * re-enable caching here because the QEMU bios
@@ -1765,10 +1759,26 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
         * reboot
         */
        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
-               cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
-       svm->vmcb->save.cr0 = cr0;
+               hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+
+       svm->vmcb->save.cr0 = hcr0;
        vmcb_mark_dirty(svm->vmcb, VMCB_CR);
-       update_cr0_intercept(svm);
+
+       /*
+        * SEV-ES guests must always keep the CR intercepts cleared. CR
+        * tracking is done using the CR write traps.
+        */
+       if (sev_es_guest(vcpu->kvm))
+               return;
+
+       if (hcr0 == cr0) {
+               /* Selective CR0 write remains on.  */
+               svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+               svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+       } else {
+               svm_set_intercept(svm, INTERCEPT_CR0_READ);
+               svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+       }
 }
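
The open-coded replacement for update_cr0_intercept() keeps the guest's CR0 and
the value the VMCB actually carries (hcr0) separate: without NPT, PG and WP are
forced on; with the CD/NW quirk, caching is re-enabled; and the full CR0
read/write intercepts are only needed when the two values differ. A
self-contained sketch of that derivation (architectural bit values, plain
booleans standing in for the kernel's globals):

#include <stdbool.h>
#include <stdint.h>

#define X86_CR0_WP (1u << 16)
#define X86_CR0_NW (1u << 29)
#define X86_CR0_CD (1u << 30)
#define X86_CR0_PG (1u << 31)

static uint64_t compute_hcr0(uint64_t gcr0, bool npt_enabled, bool cd_nw_quirk,
                             bool *need_cr0_intercept)
{
        uint64_t hcr0 = gcr0;

        if (!npt_enabled)               /* shadow paging needs PG/WP forced on */
                hcr0 |= X86_CR0_PG | X86_CR0_WP;
        if (cd_nw_quirk)                /* re-enable caching for the guest BIOS */
                hcr0 &= ~(uint64_t)(X86_CR0_CD | X86_CR0_NW);

        /* Full CR0 intercepts are only needed when the guest would otherwise
         * read back a value different from what the VMCB holds. */
        *need_cr0_intercept = (hcr0 != gcr0);
        return hcr0;
}
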
 
 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1847,7 +1857,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
                vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
        }
 
-       svm->asid_generation = sd->asid_generation;
+       svm->current_vmcb->asid_generation = sd->asid_generation;
        svm->asid = sd->next_asid++;
 }
 
@@ -1896,39 +1906,43 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
        vmcb_mark_dirty(svm->vmcb, VMCB_DR);
 }
 
-static int pf_interception(struct vcpu_svm *svm)
+static int pf_interception(struct kvm_vcpu *vcpu)
 {
-       u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       u64 fault_address = svm->vmcb->control.exit_info_2;
        u64 error_code = svm->vmcb->control.exit_info_1;
 
-       return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
+       return kvm_handle_page_fault(vcpu, error_code, fault_address,
                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
                        svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
 }
 
-static int npf_interception(struct vcpu_svm *svm)
+static int npf_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
+
        u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
        u64 error_code = svm->vmcb->control.exit_info_1;
 
        trace_kvm_page_fault(fault_address, error_code);
-       return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+       return kvm_mmu_page_fault(vcpu, fault_address, error_code,
                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
                        svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
 }
 
-static int db_interception(struct vcpu_svm *svm)
+static int db_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct kvm_run *kvm_run = vcpu->run;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (!(svm->vcpu.guest_debug &
+       if (!(vcpu->guest_debug &
              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
                !svm->nmi_singlestep) {
                u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
-               kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
+               kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
                return 1;
        }
 
@@ -1938,7 +1952,7 @@ static int db_interception(struct vcpu_svm *svm)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
 
-       if (svm->vcpu.guest_debug &
+       if (vcpu->guest_debug &
            (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
                kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
@@ -1952,9 +1966,10 @@ static int db_interception(struct vcpu_svm *svm)
        return 1;
 }
 
-static int bp_interception(struct vcpu_svm *svm)
+static int bp_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_run *kvm_run = vcpu->run;
 
        kvm_run->exit_reason = KVM_EXIT_DEBUG;
        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1962,14 +1977,14 @@ static int bp_interception(struct vcpu_svm *svm)
        return 0;
 }
 
-static int ud_interception(struct vcpu_svm *svm)
+static int ud_interception(struct kvm_vcpu *vcpu)
 {
-       return handle_ud(&svm->vcpu);
+       return handle_ud(vcpu);
 }
 
-static int ac_interception(struct vcpu_svm *svm)
+static int ac_interception(struct kvm_vcpu *vcpu)
 {
-       kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+       kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
        return 1;
 }
 
@@ -2012,7 +2027,7 @@ static bool is_erratum_383(void)
        return true;
 }
 
-static void svm_handle_mce(struct vcpu_svm *svm)
+static void svm_handle_mce(struct kvm_vcpu *vcpu)
 {
        if (is_erratum_383()) {
                /*
@@ -2021,7 +2036,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
                 */
                pr_err("KVM: Guest triggered AMD Erratum 383\n");
 
-               kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
+               kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 
                return;
        }
@@ -2033,20 +2048,21 @@ static void svm_handle_mce(struct vcpu_svm *svm)
        kvm_machine_check();
 }
 
-static int mc_interception(struct vcpu_svm *svm)
+static int mc_interception(struct kvm_vcpu *vcpu)
 {
        return 1;
 }
 
-static int shutdown_interception(struct vcpu_svm *svm)
+static int shutdown_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
+       struct kvm_run *kvm_run = vcpu->run;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
        /*
         * The VM save area has already been encrypted so it
         * cannot be reinitialized - just terminate.
         */
-       if (sev_es_guest(svm->vcpu.kvm))
+       if (sev_es_guest(vcpu->kvm))
                return -EINVAL;
 
        /*
@@ -2054,20 +2070,20 @@ static int shutdown_interception(struct vcpu_svm *svm)
         * so reinitialize it.
         */
        clear_page(svm->vmcb);
-       init_vmcb(svm);
+       init_vmcb(vcpu);
 
        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
 }
 
-static int io_interception(struct vcpu_svm *svm)
+static int io_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
        int size, in, string;
        unsigned port;
 
-       ++svm->vcpu.stat.io_exits;
+       ++vcpu->stat.io_exits;
        string = (io_info & SVM_IOIO_STR_MASK) != 0;
        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
        port = io_info >> 16;
@@ -2082,93 +2098,67 @@ static int io_interception(struct vcpu_svm *svm)
 
        svm->next_rip = svm->vmcb->control.exit_info_2;
 
-       return kvm_fast_pio(&svm->vcpu, size, port, in);
-}
-
-static int nmi_interception(struct vcpu_svm *svm)
-{
-       return 1;
+       return kvm_fast_pio(vcpu, size, port, in);
 }
 
-static int intr_interception(struct vcpu_svm *svm)
+static int nmi_interception(struct kvm_vcpu *vcpu)
 {
-       ++svm->vcpu.stat.irq_exits;
        return 1;
 }
 
-static int nop_on_interception(struct vcpu_svm *svm)
+static int intr_interception(struct kvm_vcpu *vcpu)
 {
+       ++vcpu->stat.irq_exits;
        return 1;
 }
 
-static int halt_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm)
+static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
 {
-       return kvm_emulate_hypercall(&svm->vcpu);
-}
-
-static int vmload_interception(struct vcpu_svm *svm)
-{
-       struct vmcb *nested_vmcb;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *vmcb12;
        struct kvm_host_map map;
        int ret;
 
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+       ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
        if (ret) {
                if (ret == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
+                       kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
-       nested_vmcb = map.hva;
+       vmcb12 = map.hva;
+
+       ret = kvm_skip_emulated_instruction(vcpu);
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+       if (vmload)
+               nested_svm_vmloadsave(vmcb12, svm->vmcb);
+       else
+               nested_svm_vmloadsave(svm->vmcb, vmcb12);
 
-       nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       kvm_vcpu_unmap(vcpu, &map, true);
 
        return ret;
 }
 
-static int vmsave_interception(struct vcpu_svm *svm)
+static int vmload_interception(struct kvm_vcpu *vcpu)
 {
-       struct vmcb *nested_vmcb;
-       struct kvm_host_map map;
-       int ret;
-
-       if (nested_svm_check_permissions(svm))
-               return 1;
-
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
-       if (ret) {
-               if (ret == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
-               return 1;
-       }
-
-       nested_vmcb = map.hva;
-
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
-       nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       return vmload_vmsave_interception(vcpu, true);
+}
 
-       return ret;
+static int vmsave_interception(struct kvm_vcpu *vcpu)
+{
+       return vmload_vmsave_interception(vcpu, false);
 }
 
-static int vmrun_interception(struct vcpu_svm *svm)
+static int vmrun_interception(struct kvm_vcpu *vcpu)
 {
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       return nested_svm_vmrun(svm);
+       return nested_svm_vmrun(vcpu);
 }
 
 enum {
@@ -2207,7 +2197,7 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
                [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
                [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
        };
-       int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
+       int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
                [SVM_INSTR_VMRUN] = vmrun_interception,
                [SVM_INSTR_VMLOAD] = vmload_interception,
                [SVM_INSTR_VMSAVE] = vmsave_interception,
@@ -2216,17 +2206,13 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
        int ret;
 
        if (is_guest_mode(vcpu)) {
-               svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
-               svm->vmcb->control.exit_info_1 = 0;
-               svm->vmcb->control.exit_info_2 = 0;
-
                /* Returns '1' or -errno on failure, '0' on success. */
-               ret = nested_svm_vmexit(svm);
+               ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
                if (ret)
                        return ret;
                return 1;
        }
-       return svm_instr_handlers[opcode](svm);
+       return svm_instr_handlers[opcode](vcpu);
 }
 
 /*
@@ -2237,9 +2223,9 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
  *      regions (e.g. SMM memory on host).
  *   2) VMware backdoor
  */
-static int gp_interception(struct vcpu_svm *svm)
+static int gp_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        u32 error_code = svm->vmcb->control.exit_info_1;
        int opcode;
 
@@ -2304,73 +2290,52 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
        }
 }
 
-static int stgi_interception(struct vcpu_svm *svm)
+static int stgi_interception(struct kvm_vcpu *vcpu)
 {
        int ret;
 
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-       svm_set_gif(svm, true);
+       ret = kvm_skip_emulated_instruction(vcpu);
+       svm_set_gif(to_svm(vcpu), true);
        return ret;
 }
 
-static int clgi_interception(struct vcpu_svm *svm)
+static int clgi_interception(struct kvm_vcpu *vcpu)
 {
        int ret;
 
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-       svm_set_gif(svm, false);
+       ret = kvm_skip_emulated_instruction(vcpu);
+       svm_set_gif(to_svm(vcpu), false);
        return ret;
 }
 
-static int invlpga_interception(struct vcpu_svm *svm)
+static int invlpga_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-
-       trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
-                         kvm_rax_read(&svm->vcpu));
+       trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, kvm_rcx_read(vcpu),
+                         kvm_rax_read(vcpu));
 
        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
-       kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
+       kvm_mmu_invlpg(vcpu, kvm_rax_read(vcpu));
 
-       return kvm_skip_emulated_instruction(&svm->vcpu);
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int skinit_interception(struct vcpu_svm *svm)
+static int skinit_interception(struct kvm_vcpu *vcpu)
 {
-       trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
+       trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
 
-       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
 }
 
-static int wbinvd_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_wbinvd(&svm->vcpu);
-}
-
-static int xsetbv_interception(struct vcpu_svm *svm)
-{
-       u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
-       u32 index = kvm_rcx_read(&svm->vcpu);
-
-       int err = kvm_set_xcr(&svm->vcpu, index, new_bv);
-       return kvm_complete_insn_gp(&svm->vcpu, err);
-}
-
-static int rdpru_interception(struct vcpu_svm *svm)
-{
-       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-       return 1;
-}
-
-static int task_switch_interception(struct vcpu_svm *svm)
+static int task_switch_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        u16 tss_selector;
        int reason;
        int int_type = svm->vmcb->control.exit_int_info &
@@ -2399,7 +2364,7 @@ static int task_switch_interception(struct vcpu_svm *svm)
        if (reason == TASK_SWITCH_GATE) {
                switch (type) {
                case SVM_EXITINTINFO_TYPE_NMI:
-                       svm->vcpu.arch.nmi_injected = false;
+                       vcpu->arch.nmi_injected = false;
                        break;
                case SVM_EXITINTINFO_TYPE_EXEPT:
                        if (svm->vmcb->control.exit_info_2 &
@@ -2408,10 +2373,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
                                error_code =
                                        (u32)svm->vmcb->control.exit_info_2;
                        }
-                       kvm_clear_exception_queue(&svm->vcpu);
+                       kvm_clear_exception_queue(vcpu);
                        break;
                case SVM_EXITINTINFO_TYPE_INTR:
-                       kvm_clear_interrupt_queue(&svm->vcpu);
+                       kvm_clear_interrupt_queue(vcpu);
                        break;
                default:
                        break;
@@ -2422,77 +2387,58 @@ static int task_switch_interception(struct vcpu_svm *svm)
            int_type == SVM_EXITINTINFO_TYPE_SOFT ||
            (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
             (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
-               if (!skip_emulated_instruction(&svm->vcpu))
+               if (!skip_emulated_instruction(vcpu))
                        return 0;
        }
 
        if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
                int_vec = -1;
 
-       return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+       return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
                               has_error_code, error_code);
 }
 
-static int cpuid_interception(struct vcpu_svm *svm)
+static int iret_interception(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_cpuid(&svm->vcpu);
-}
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-static int iret_interception(struct vcpu_svm *svm)
-{
-       ++svm->vcpu.stat.nmi_window_exits;
-       svm->vcpu.arch.hflags |= HF_IRET_MASK;
-       if (!sev_es_guest(svm->vcpu.kvm)) {
+       ++vcpu->stat.nmi_window_exits;
+       vcpu->arch.hflags |= HF_IRET_MASK;
+       if (!sev_es_guest(vcpu->kvm)) {
                svm_clr_intercept(svm, INTERCEPT_IRET);
-               svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+               svm->nmi_iret_rip = kvm_rip_read(vcpu);
        }
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
        return 1;
 }
 
-static int invd_interception(struct vcpu_svm *svm)
-{
-       /* Treat an INVD instruction as a NOP and just skip it. */
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
-
-static int invlpg_interception(struct vcpu_svm *svm)
+static int invlpg_interception(struct kvm_vcpu *vcpu)
 {
        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return kvm_emulate_instruction(&svm->vcpu, 0);
-
-       kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+               return kvm_emulate_instruction(vcpu, 0);
 
-static int emulate_on_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_instruction(&svm->vcpu, 0);
+       kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int rsm_interception(struct vcpu_svm *svm)
+static int emulate_on_interception(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
+       return kvm_emulate_instruction(vcpu, 0);
 }
 
-static int rdpmc_interception(struct vcpu_svm *svm)
+static int rsm_interception(struct kvm_vcpu *vcpu)
 {
-       int err;
-
-       if (!nrips)
-               return emulate_on_interception(svm);
-
-       err = kvm_rdpmc(&svm->vcpu);
-       return kvm_complete_insn_gp(&svm->vcpu, err);
+       return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
 }
 
-static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
                                            unsigned long val)
 {
-       unsigned long cr0 = svm->vcpu.arch.cr0;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       unsigned long cr0 = vcpu->arch.cr0;
        bool ret = false;
 
-       if (!is_guest_mode(&svm->vcpu) ||
+       if (!is_guest_mode(vcpu) ||
            (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
                return false;
 
@@ -2509,17 +2455,18 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
 
 #define CR_VALID (1ULL << 63)
 
-static int cr_interception(struct vcpu_svm *svm)
+static int cr_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int reg, cr;
        unsigned long val;
        int err;
 
        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return emulate_on_interception(svm);
+               return emulate_on_interception(vcpu);
 
        if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
-               return emulate_on_interception(svm);
+               return emulate_on_interception(vcpu);
 
        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
@@ -2530,61 +2477,61 @@ static int cr_interception(struct vcpu_svm *svm)
        err = 0;
        if (cr >= 16) { /* mov to cr */
                cr -= 16;
-               val = kvm_register_read(&svm->vcpu, reg);
+               val = kvm_register_read(vcpu, reg);
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
-                       if (!check_selective_cr0_intercepted(svm, val))
-                               err = kvm_set_cr0(&svm->vcpu, val);
+                       if (!check_selective_cr0_intercepted(vcpu, val))
+                               err = kvm_set_cr0(vcpu, val);
                        else
                                return 1;
 
                        break;
                case 3:
-                       err = kvm_set_cr3(&svm->vcpu, val);
+                       err = kvm_set_cr3(vcpu, val);
                        break;
                case 4:
-                       err = kvm_set_cr4(&svm->vcpu, val);
+                       err = kvm_set_cr4(vcpu, val);
                        break;
                case 8:
-                       err = kvm_set_cr8(&svm->vcpu, val);
+                       err = kvm_set_cr8(vcpu, val);
                        break;
                default:
                        WARN(1, "unhandled write to CR%d", cr);
-                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+                       kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }
        } else { /* mov from cr */
                switch (cr) {
                case 0:
-                       val = kvm_read_cr0(&svm->vcpu);
+                       val = kvm_read_cr0(vcpu);
                        break;
                case 2:
-                       val = svm->vcpu.arch.cr2;
+                       val = vcpu->arch.cr2;
                        break;
                case 3:
-                       val = kvm_read_cr3(&svm->vcpu);
+                       val = kvm_read_cr3(vcpu);
                        break;
                case 4:
-                       val = kvm_read_cr4(&svm->vcpu);
+                       val = kvm_read_cr4(vcpu);
                        break;
                case 8:
-                       val = kvm_get_cr8(&svm->vcpu);
+                       val = kvm_get_cr8(vcpu);
                        break;
                default:
                        WARN(1, "unhandled read from CR%d", cr);
-                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+                       kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }
-               kvm_register_write(&svm->vcpu, reg, val);
+               kvm_register_write(vcpu, reg, val);
                trace_kvm_cr_read(cr, val);
        }
-       return kvm_complete_insn_gp(&svm->vcpu, err);
+       return kvm_complete_insn_gp(vcpu, err);
 }
 
-static int cr_trap(struct vcpu_svm *svm)
+static int cr_trap(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long old_value, new_value;
        unsigned int cr;
        int ret = 0;
@@ -2606,7 +2553,7 @@ static int cr_trap(struct vcpu_svm *svm)
                kvm_post_set_cr4(vcpu, old_value, new_value);
                break;
        case 8:
-               ret = kvm_set_cr8(&svm->vcpu, new_value);
+               ret = kvm_set_cr8(vcpu, new_value);
                break;
        default:
                WARN(1, "unhandled CR%d write trap", cr);
@@ -2617,57 +2564,57 @@ static int cr_trap(struct vcpu_svm *svm)
        return kvm_complete_insn_gp(vcpu, ret);
 }
 
-static int dr_interception(struct vcpu_svm *svm)
+static int dr_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int reg, dr;
        unsigned long val;
        int err = 0;
 
-       if (svm->vcpu.guest_debug == 0) {
+       if (vcpu->guest_debug == 0) {
                /*
                 * No more DR vmexits; force a reload of the debug registers
                 * and reenter on this instruction.  The next vmexit will
                 * retrieve the full state of the debug registers.
                 */
                clr_dr_intercepts(svm);
-               svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
                return 1;
        }
 
        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return emulate_on_interception(svm);
+               return emulate_on_interception(vcpu);
 
        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
        if (dr >= 16) { /* mov to DRn  */
                dr -= 16;
-               val = kvm_register_read(&svm->vcpu, reg);
-               err = kvm_set_dr(&svm->vcpu, dr, val);
+               val = kvm_register_read(vcpu, reg);
+               err = kvm_set_dr(vcpu, dr, val);
        } else {
-               kvm_get_dr(&svm->vcpu, dr, &val);
-               kvm_register_write(&svm->vcpu, reg, val);
+               kvm_get_dr(vcpu, dr, &val);
+               kvm_register_write(vcpu, reg, val);
        }
 
-       return kvm_complete_insn_gp(&svm->vcpu, err);
+       return kvm_complete_insn_gp(vcpu, err);
 }
 
-static int cr8_write_interception(struct vcpu_svm *svm)
+static int cr8_write_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
        int r;
 
-       u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+       u8 cr8_prev = kvm_get_cr8(vcpu);
        /* instruction emulation calls kvm_set_cr8() */
-       r = cr_interception(svm);
-       if (lapic_in_kernel(&svm->vcpu))
+       r = cr_interception(vcpu);
+       if (lapic_in_kernel(vcpu))
                return r;
-       if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+       if (cr8_prev <= kvm_get_cr8(vcpu))
                return r;
-       kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+       vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
        return 0;
 }
 
-static int efer_trap(struct vcpu_svm *svm)
+static int efer_trap(struct kvm_vcpu *vcpu)
 {
        struct msr_data msr_info;
        int ret;
@@ -2680,10 +2627,10 @@ static int efer_trap(struct vcpu_svm *svm)
         */
        msr_info.host_initiated = false;
        msr_info.index = MSR_EFER;
-       msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME;
-       ret = kvm_set_msr_common(&svm->vcpu, &msr_info);
+       msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
+       ret = kvm_set_msr_common(vcpu, &msr_info);
 
-       return kvm_complete_insn_gp(&svm->vcpu, ret);
+       return kvm_complete_insn_gp(vcpu, ret);
 }
 
 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
@@ -2710,24 +2657,24 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
        switch (msr_info->index) {
        case MSR_STAR:
-               msr_info->data = svm->vmcb->save.star;
+               msr_info->data = svm->vmcb01.ptr->save.star;
                break;
 #ifdef CONFIG_X86_64
        case MSR_LSTAR:
-               msr_info->data = svm->vmcb->save.lstar;
+               msr_info->data = svm->vmcb01.ptr->save.lstar;
                break;
        case MSR_CSTAR:
-               msr_info->data = svm->vmcb->save.cstar;
+               msr_info->data = svm->vmcb01.ptr->save.cstar;
                break;
        case MSR_KERNEL_GS_BASE:
-               msr_info->data = svm->vmcb->save.kernel_gs_base;
+               msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
                break;
        case MSR_SYSCALL_MASK:
-               msr_info->data = svm->vmcb->save.sfmask;
+               msr_info->data = svm->vmcb01.ptr->save.sfmask;
                break;
 #endif
        case MSR_IA32_SYSENTER_CS:
-               msr_info->data = svm->vmcb->save.sysenter_cs;
+               msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
                break;
        case MSR_IA32_SYSENTER_EIP:
                msr_info->data = svm->sysenter_eip;
@@ -2771,7 +2718,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                    !guest_has_spec_ctrl_msr(vcpu))
                        return 1;
 
-               msr_info->data = svm->spec_ctrl;
+               if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+                       msr_info->data = svm->vmcb->save.spec_ctrl;
+               else
+                       msr_info->data = svm->spec_ctrl;
                break;
        case MSR_AMD64_VIRT_SPEC_CTRL:
                if (!msr_info->host_initiated &&
@@ -2809,8 +2759,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       if (!sev_es_guest(svm->vcpu.kvm) || !err)
-               return kvm_complete_insn_gp(&svm->vcpu, err);
+       if (!sev_es_guest(vcpu->kvm) || !err)
+               return kvm_complete_insn_gp(vcpu, err);
 
        ghcb_set_sw_exit_info_1(svm->ghcb, 1);
        ghcb_set_sw_exit_info_2(svm->ghcb,
@@ -2820,11 +2770,6 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
        return 1;
 }
 
-static int rdmsr_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_rdmsr(&svm->vcpu);
-}
-
 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -2861,7 +2806,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
                        return 1;
                vcpu->arch.pat = data;
-               svm->vmcb->save.g_pat = data;
+               svm->vmcb01.ptr->save.g_pat = data;
+               if (is_guest_mode(vcpu))
+                       nested_vmcb02_compute_g_pat(svm);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
                break;
        case MSR_IA32_SPEC_CTRL:
@@ -2872,7 +2819,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                if (kvm_spec_ctrl_test_value(data))
                        return 1;
 
-               svm->spec_ctrl = data;
+               if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+                       svm->vmcb->save.spec_ctrl = data;
+               else
+                       svm->spec_ctrl = data;
                if (!data)
                        break;
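
With V_SPEC_CTRL the hardware saves and restores IA32_SPEC_CTRL through the
VMCB save area on VMRUN/#VMEXIT, so the MSR is no longer intercepted (see the
init_vmcb() change earlier) and both the read and write paths pick the VMCB
field instead of the software copy. A sketch of that selection (an illustrative
helper, not something the patch adds):

#include <stdbool.h>
#include <stdint.h>

struct vmcb_save_model { uint64_t spec_ctrl; };

struct spec_ctrl_vcpu_model {
        struct vmcb_save_model save;    /* hardware-swapped with V_SPEC_CTRL */
        uint64_t spec_ctrl;             /* software copy, restored around VMRUN */
};

static uint64_t *guest_spec_ctrl_slot(struct spec_ctrl_vcpu_model *svm,
                                      bool has_v_spec_ctrl)
{
        return has_v_spec_ctrl ? &svm->save.spec_ctrl : &svm->spec_ctrl;
}
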
 
@@ -2915,32 +2865,32 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                svm->virt_spec_ctrl = data;
                break;
        case MSR_STAR:
-               svm->vmcb->save.star = data;
+               svm->vmcb01.ptr->save.star = data;
                break;
 #ifdef CONFIG_X86_64
        case MSR_LSTAR:
-               svm->vmcb->save.lstar = data;
+               svm->vmcb01.ptr->save.lstar = data;
                break;
        case MSR_CSTAR:
-               svm->vmcb->save.cstar = data;
+               svm->vmcb01.ptr->save.cstar = data;
                break;
        case MSR_KERNEL_GS_BASE:
-               svm->vmcb->save.kernel_gs_base = data;
+               svm->vmcb01.ptr->save.kernel_gs_base = data;
                break;
        case MSR_SYSCALL_MASK:
-               svm->vmcb->save.sfmask = data;
+               svm->vmcb01.ptr->save.sfmask = data;
                break;
 #endif
        case MSR_IA32_SYSENTER_CS:
-               svm->vmcb->save.sysenter_cs = data;
+               svm->vmcb01.ptr->save.sysenter_cs = data;
                break;
        case MSR_IA32_SYSENTER_EIP:
                svm->sysenter_eip = data;
-               svm->vmcb->save.sysenter_eip = data;
+               svm->vmcb01.ptr->save.sysenter_eip = data;
                break;
        case MSR_IA32_SYSENTER_ESP:
                svm->sysenter_esp = data;
-               svm->vmcb->save.sysenter_esp = data;
+               svm->vmcb01.ptr->save.sysenter_esp = data;
                break;
        case MSR_TSC_AUX:
                if (!boot_cpu_has(X86_FEATURE_RDTSCP))
@@ -3006,38 +2956,32 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
        return 0;
 }
 
-static int wrmsr_interception(struct vcpu_svm *svm)
+static int msr_interception(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_wrmsr(&svm->vcpu);
-}
-
-static int msr_interception(struct vcpu_svm *svm)
-{
-       if (svm->vmcb->control.exit_info_1)
-               return wrmsr_interception(svm);
+       if (to_svm(vcpu)->vmcb->control.exit_info_1)
+               return kvm_emulate_wrmsr(vcpu);
        else
-               return rdmsr_interception(svm);
+               return kvm_emulate_rdmsr(vcpu);
 }
 
-static int interrupt_window_interception(struct vcpu_svm *svm)
+static int interrupt_window_interception(struct kvm_vcpu *vcpu)
 {
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-       svm_clear_vintr(svm);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+       svm_clear_vintr(to_svm(vcpu));
 
        /*
         * For AVIC, the only reason to end up here is ExtINTs.
         * In this case AVIC was temporarily disabled for
         * requesting the IRQ window and we have to re-enable it.
         */
-       svm_toggle_avic_for_irq_window(&svm->vcpu, true);
+       svm_toggle_avic_for_irq_window(vcpu, true);
 
-       ++svm->vcpu.stat.irq_window_exits;
+       ++vcpu->stat.irq_window_exits;
        return 1;
 }
 
-static int pause_interception(struct vcpu_svm *svm)
+static int pause_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
        bool in_kernel;
 
        /*
@@ -3045,35 +2989,18 @@ static int pause_interception(struct vcpu_svm *svm)
         * vcpu->arch.preempted_in_kernel can never be true.  Just
         * set in_kernel to false as well.
         */
-       in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0;
+       in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
 
        if (!kvm_pause_in_guest(vcpu->kvm))
                grow_ple_window(vcpu);
 
        kvm_vcpu_on_spin(vcpu, in_kernel);
-       return 1;
-}
-
-static int nop_interception(struct vcpu_svm *svm)
-{
-       return kvm_skip_emulated_instruction(&(svm->vcpu));
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int monitor_interception(struct vcpu_svm *svm)
+static int invpcid_interception(struct kvm_vcpu *vcpu)
 {
-       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
-       return nop_interception(svm);
-}
-
-static int mwait_interception(struct vcpu_svm *svm)
-{
-       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
-       return nop_interception(svm);
-}
-
-static int invpcid_interception(struct vcpu_svm *svm)
-{
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long type;
        gva_t gva;
 
@@ -3098,7 +3025,7 @@ static int invpcid_interception(struct vcpu_svm *svm)
        return kvm_handle_invpcid(vcpu, type, gva);
 }
 
-static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
+static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [SVM_EXIT_READ_CR0]                     = cr_interception,
        [SVM_EXIT_READ_CR3]                     = cr_interception,
        [SVM_EXIT_READ_CR4]                     = cr_interception,
@@ -3133,15 +3060,15 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
        [SVM_EXIT_INTR]                         = intr_interception,
        [SVM_EXIT_NMI]                          = nmi_interception,
-       [SVM_EXIT_SMI]                          = nop_on_interception,
-       [SVM_EXIT_INIT]                         = nop_on_interception,
+       [SVM_EXIT_SMI]                          = kvm_emulate_as_nop,
+       [SVM_EXIT_INIT]                         = kvm_emulate_as_nop,
        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
-       [SVM_EXIT_RDPMC]                        = rdpmc_interception,
-       [SVM_EXIT_CPUID]                        = cpuid_interception,
+       [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
+       [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
        [SVM_EXIT_IRET]                         = iret_interception,
-       [SVM_EXIT_INVD]                         = invd_interception,
+       [SVM_EXIT_INVD]                         = kvm_emulate_invd,
        [SVM_EXIT_PAUSE]                        = pause_interception,
-       [SVM_EXIT_HLT]                          = halt_interception,
+       [SVM_EXIT_HLT]                          = kvm_emulate_halt,
        [SVM_EXIT_INVLPG]                       = invlpg_interception,
        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
        [SVM_EXIT_IOIO]                         = io_interception,
@@ -3149,17 +3076,17 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
        [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
        [SVM_EXIT_VMRUN]                        = vmrun_interception,
-       [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
+       [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
        [SVM_EXIT_VMLOAD]                       = vmload_interception,
        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
        [SVM_EXIT_STGI]                         = stgi_interception,
        [SVM_EXIT_CLGI]                         = clgi_interception,
        [SVM_EXIT_SKINIT]                       = skinit_interception,
-       [SVM_EXIT_WBINVD]                       = wbinvd_interception,
-       [SVM_EXIT_MONITOR]                      = monitor_interception,
-       [SVM_EXIT_MWAIT]                        = mwait_interception,
-       [SVM_EXIT_XSETBV]                       = xsetbv_interception,
-       [SVM_EXIT_RDPRU]                        = rdpru_interception,
+       [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
+       [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
+       [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
+       [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
+       [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
        [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
        [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
        [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
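
For reference, a minimal user-space sketch of the exit-handler table pattern above, where every handler shares the same vcpu-based signature so common emulation helpers can be installed directly in the table; the structs and exit codes below are illustrative stand-ins, not KVM definitions:

#include <stdio.h>

/* Illustrative stand-ins for struct kvm_vcpu and two exit codes. */
struct vcpu { int exit_code; unsigned long exit_info_1; };
enum { EXIT_HLT, EXIT_MSR, NR_EXITS };

static int emulate_halt(struct vcpu *v)  { (void)v; puts("halt");  return 1; }
static int emulate_rdmsr(struct vcpu *v) { (void)v; puts("rdmsr"); return 1; }
static int emulate_wrmsr(struct vcpu *v) { (void)v; puts("wrmsr"); return 1; }

/* MSR exits multiplex read vs. write on exit_info_1, as msr_interception() does. */
static int msr_exit(struct vcpu *v)
{
        return v->exit_info_1 ? emulate_wrmsr(v) : emulate_rdmsr(v);
}

/* One table; every entry takes the generic vcpu argument. */
static int (*const handlers[NR_EXITS])(struct vcpu *) = {
        [EXIT_HLT] = emulate_halt,
        [EXIT_MSR] = msr_exit,
};

int main(void)
{
        struct vcpu v = { .exit_code = EXIT_MSR, .exit_info_1 = 1 };

        return handlers[v.exit_code](&v) == 1 ? 0 : 1;
}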
@@ -3177,6 +3104,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
+       struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
 
        if (!dump_invalid_vmcb) {
                pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
@@ -3239,28 +3167,28 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
               save->ds.limit, save->ds.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "fs:",
-              save->fs.selector, save->fs.attrib,
-              save->fs.limit, save->fs.base);
+              save01->fs.selector, save01->fs.attrib,
+              save01->fs.limit, save01->fs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "gs:",
-              save->gs.selector, save->gs.attrib,
-              save->gs.limit, save->gs.base);
+              save01->gs.selector, save01->gs.attrib,
+              save01->gs.limit, save01->gs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "gdtr:",
               save->gdtr.selector, save->gdtr.attrib,
               save->gdtr.limit, save->gdtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "ldtr:",
-              save->ldtr.selector, save->ldtr.attrib,
-              save->ldtr.limit, save->ldtr.base);
+              save01->ldtr.selector, save01->ldtr.attrib,
+              save01->ldtr.limit, save01->ldtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "idtr:",
               save->idtr.selector, save->idtr.attrib,
               save->idtr.limit, save->idtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "tr:",
-              save->tr.selector, save->tr.attrib,
-              save->tr.limit, save->tr.base);
+              save01->tr.selector, save01->tr.attrib,
+              save01->tr.limit, save01->tr.base);
        pr_err("cpl:            %d                efer:         %016llx\n",
                save->cpl, save->efer);
        pr_err("%-15s %016llx %-13s %016llx\n",
@@ -3274,15 +3202,15 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
        pr_err("%-15s %016llx %-13s %016llx\n",
               "rsp:", save->rsp, "rax:", save->rax);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "star:", save->star, "lstar:", save->lstar);
+              "star:", save01->star, "lstar:", save01->lstar);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "cstar:", save->cstar, "sfmask:", save->sfmask);
+              "cstar:", save01->cstar, "sfmask:", save01->sfmask);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "kernel_gs_base:", save->kernel_gs_base,
-              "sysenter_cs:", save->sysenter_cs);
+              "kernel_gs_base:", save01->kernel_gs_base,
+              "sysenter_cs:", save01->sysenter_cs);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "sysenter_esp:", save->sysenter_esp,
-              "sysenter_eip:", save->sysenter_eip);
+              "sysenter_esp:", save01->sysenter_esp,
+              "sysenter_eip:", save01->sysenter_eip);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
        pr_err("%-15s %016llx %-13s %016llx\n",
@@ -3309,24 +3237,24 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
        return -EINVAL;
 }
 
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code)
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
 {
-       if (svm_handle_invalid_exit(&svm->vcpu, exit_code))
+       if (svm_handle_invalid_exit(vcpu, exit_code))
                return 0;
 
 #ifdef CONFIG_RETPOLINE
        if (exit_code == SVM_EXIT_MSR)
-               return msr_interception(svm);
+               return msr_interception(vcpu);
        else if (exit_code == SVM_EXIT_VINTR)
-               return interrupt_window_interception(svm);
+               return interrupt_window_interception(vcpu);
        else if (exit_code == SVM_EXIT_INTR)
-               return intr_interception(svm);
+               return intr_interception(vcpu);
        else if (exit_code == SVM_EXIT_HLT)
-               return halt_interception(svm);
+               return kvm_emulate_halt(vcpu);
        else if (exit_code == SVM_EXIT_NPF)
-               return npf_interception(svm);
+               return npf_interception(vcpu);
 #endif
-       return svm_exit_handlers[exit_code](svm);
+       return svm_exit_handlers[exit_code](vcpu);
 }
 
 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
@@ -3395,7 +3323,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        if (exit_fastpath != EXIT_FASTPATH_NONE)
                return 1;
 
-       return svm_invoke_exit_handler(svm, exit_code);
+       return svm_invoke_exit_handler(vcpu, exit_code);
 }
 
 static void reload_tss(struct kvm_vcpu *vcpu)
@@ -3406,15 +3334,28 @@ static void reload_tss(struct kvm_vcpu *vcpu)
        load_TR_desc();
 }
 
-static void pre_svm_run(struct vcpu_svm *svm)
+static void pre_svm_run(struct kvm_vcpu *vcpu)
 {
-       struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
+       struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       /*
+        * If the previous vmrun of the vmcb occurred on
+        * a different physical cpu, then we must mark the vmcb dirty
+        * and assign a new asid.
+        */
+
+       if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
+               svm->current_vmcb->asid_generation = 0;
+               vmcb_mark_all_dirty(svm->vmcb);
+               svm->current_vmcb->cpu = vcpu->cpu;
+       }
 
-       if (sev_guest(svm->vcpu.kvm))
-               return pre_sev_run(svm, svm->vcpu.cpu);
+       if (sev_guest(vcpu->kvm))
+               return pre_sev_run(svm, vcpu->cpu);
 
        /* FIXME: handle wraparound of asid_generation */
-       if (svm->asid_generation != sd->asid_generation)
+       if (svm->current_vmcb->asid_generation != sd->asid_generation)
                new_asid(svm, sd);
 }
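
A self-contained sketch of the per-VMCB bookkeeping added to pre_svm_run() above: each VMCB remembers the CPU it last ran on and its ASID generation, and is marked fully dirty when it migrates; the types below are simplified stand-ins for kvm_vmcb_info and svm_cpu_data, not kernel code:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for struct kvm_vmcb_info. */
struct vmcb_info {
        int cpu;                        /* CPU this VMCB last ran on */
        unsigned long asid_generation;
        bool fully_dirty;               /* analogue of vmcb_mark_all_dirty() */
};

/* Per-CPU bookkeeping; a stale generation forces a fresh ASID. */
struct cpu_data { unsigned long asid_generation; };

static void pre_run(struct vmcb_info *vmcb, struct cpu_data *sd, int this_cpu)
{
        /* Last run was on a different physical CPU: no cached state is valid. */
        if (vmcb->cpu != this_cpu) {
                vmcb->asid_generation = 0;
                vmcb->fully_dirty = true;
                vmcb->cpu = this_cpu;
        }

        /* Stale generation: reallocate the ASID (new_asid() analogue). */
        if (vmcb->asid_generation != sd->asid_generation)
                vmcb->asid_generation = sd->asid_generation;
}

int main(void)
{
        struct cpu_data cpu1 = { .asid_generation = 7 };
        struct vmcb_info vmcb = { .cpu = 0, .asid_generation = 7 };

        pre_run(&vmcb, &cpu1, 1);
        printf("dirty=%d gen=%lu cpu=%d\n", vmcb.fully_dirty,
               vmcb.asid_generation, vmcb.cpu);
        return 0;
}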
 
@@ -3424,7 +3365,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 
        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
        vcpu->arch.hflags |= HF_NMI_MASK;
-       if (!sev_es_guest(svm->vcpu.kvm))
+       if (!sev_es_guest(vcpu->kvm))
                svm_set_intercept(svm, INTERCEPT_IRET);
        ++vcpu->stat.nmi_injections;
 }
@@ -3478,7 +3419,7 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
                return false;
 
        ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
-             (svm->vcpu.arch.hflags & HF_NMI_MASK);
+             (vcpu->arch.hflags & HF_NMI_MASK);
 
        return ret;
 }
@@ -3498,9 +3439,7 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 
 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+       return !!(vcpu->arch.hflags & HF_NMI_MASK);
 }
 
 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3508,12 +3447,12 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        if (masked) {
-               svm->vcpu.arch.hflags |= HF_NMI_MASK;
-               if (!sev_es_guest(svm->vcpu.kvm))
+               vcpu->arch.hflags |= HF_NMI_MASK;
+               if (!sev_es_guest(vcpu->kvm))
                        svm_set_intercept(svm, INTERCEPT_IRET);
        } else {
-               svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
-               if (!sev_es_guest(svm->vcpu.kvm))
+               vcpu->arch.hflags &= ~HF_NMI_MASK;
+               if (!sev_es_guest(vcpu->kvm))
                        svm_clr_intercept(svm, INTERCEPT_IRET);
        }
 }
@@ -3526,7 +3465,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
        if (!gif_set(svm))
                return true;
 
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                /*
                 * SEV-ES guests do not expose RFLAGS. Use the VMCB interrupt mask
                 * bit to determine the state of the IF flag.
@@ -3536,7 +3475,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
        } else if (is_guest_mode(vcpu)) {
                /* As long as interrupts are being delivered...  */
                if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
-                   ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
+                   ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
                    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
                        return true;
 
@@ -3595,8 +3534,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
-           == HF_NMI_MASK)
+       if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
                return; /* IRET will cause a vm exit */
 
        if (!gif_set(svm)) {
@@ -3638,7 +3576,7 @@ void svm_flush_tlb(struct kvm_vcpu *vcpu)
        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
        else
-               svm->asid_generation--;
+               svm->current_vmcb->asid_generation--;
 }
 
 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3675,8 +3613,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
 }
 
-static void svm_complete_interrupts(struct vcpu_svm *svm)
+static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        u8 vector;
        int type;
        u32 exitintinfo = svm->vmcb->control.exit_int_info;
@@ -3688,28 +3627,28 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
         * If we've made progress since setting HF_IRET_MASK, we've
         * executed an IRET and can allow NMI injection.
         */
-       if ((svm->vcpu.arch.hflags & HF_IRET_MASK) &&
-           (sev_es_guest(svm->vcpu.kvm) ||
-            kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) {
-               svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
-               kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       if ((vcpu->arch.hflags & HF_IRET_MASK) &&
+           (sev_es_guest(vcpu->kvm) ||
+            kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
+               vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
 
-       svm->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       vcpu->arch.nmi_injected = false;
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 
        if (!(exitintinfo & SVM_EXITINTINFO_VALID))
                return;
 
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
 
        vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
        type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
 
        switch (type) {
        case SVM_EXITINTINFO_TYPE_NMI:
-               svm->vcpu.arch.nmi_injected = true;
+               vcpu->arch.nmi_injected = true;
                break;
        case SVM_EXITINTINFO_TYPE_EXEPT:
                /*
@@ -3725,21 +3664,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
                 */
                if (kvm_exception_is_soft(vector)) {
                        if (vector == BP_VECTOR && int3_injected &&
-                           kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
-                               kvm_rip_write(&svm->vcpu,
-                                             kvm_rip_read(&svm->vcpu) -
-                                             int3_injected);
+                           kvm_is_linear_rip(vcpu, svm->int3_rip))
+                               kvm_rip_write(vcpu,
+                                             kvm_rip_read(vcpu) - int3_injected);
                        break;
                }
                if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
                        u32 err = svm->vmcb->control.exit_int_info_err;
-                       kvm_requeue_exception_e(&svm->vcpu, vector, err);
+                       kvm_requeue_exception_e(vcpu, vector, err);
 
                } else
-                       kvm_requeue_exception(&svm->vcpu, vector);
+                       kvm_requeue_exception(vcpu, vector);
                break;
        case SVM_EXITINTINFO_TYPE_INTR:
-               kvm_queue_interrupt(&svm->vcpu, vector, false);
+               kvm_queue_interrupt(vcpu, vector, false);
                break;
        default:
                break;
@@ -3754,7 +3692,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
        control->exit_int_info = control->event_inj;
        control->exit_int_info_err = control->event_inj_err;
        control->event_inj = 0;
-       svm_complete_interrupts(svm);
+       svm_complete_interrupts(vcpu);
 }
 
 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
@@ -3766,9 +3704,10 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
        return EXIT_FASTPATH_NONE;
 }
 
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
-                                       struct vcpu_svm *svm)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
+
        /*
         * VMENTER enables interrupts (host state), but the kernel state is
         * interrupts disabled when this is invoked. Also tell RCU about
@@ -3789,12 +3728,14 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
        guest_enter_irqoff();
        lockdep_hardirqs_on(CALLER_ADDR0);
 
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                __svm_sev_es_vcpu_run(svm->vmcb_pa);
        } else {
                struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
 
-               __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+               vmload(svm->vmcb01.pa);
+               __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&vcpu->arch.regs);
+               vmsave(svm->vmcb01.pa);
 
                vmload(__sme_page_pa(sd->save_area));
        }
@@ -3845,7 +3786,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
                smp_send_reschedule(vcpu->cpu);
        }
 
-       pre_svm_run(svm);
+       pre_svm_run(vcpu);
 
        sync_lapic_to_cr8(vcpu);
 
@@ -3859,7 +3800,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         * Run with all-zero DR6 unless needed, so that we can get the exact cause
         * of a #DB.
         */
-       if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+       if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
                svm_set_dr6(svm, vcpu->arch.dr6);
        else
                svm_set_dr6(svm, DR6_ACTIVE_LOW);
@@ -3875,9 +3816,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         * is no need to worry about the conditional branch over the wrmsr
         * being speculatively taken.
         */
-       x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
 
-       svm_vcpu_enter_exit(vcpu, svm);
+       svm_vcpu_enter_exit(vcpu);
 
        /*
         * We do not use IBRS in the kernel. If this vCPU has used the
@@ -3894,15 +3836,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         * If the L02 MSR bitmap does not intercept the MSR, then we need to
         * save it.
         */
-       if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
+           unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
                svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-       if (!sev_es_guest(svm->vcpu.kvm))
+       if (!sev_es_guest(vcpu->kvm))
                reload_tss(vcpu);
 
-       x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
 
-       if (!sev_es_guest(svm->vcpu.kvm)) {
+       if (!sev_es_guest(vcpu->kvm)) {
                vcpu->arch.cr2 = svm->vmcb->save.cr2;
                vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
                vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
@@ -3910,7 +3854,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        }
 
        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
-               kvm_before_interrupt(&svm->vcpu);
+               kvm_before_interrupt(vcpu);
 
        kvm_load_host_xsave_state(vcpu);
        stgi();
@@ -3918,13 +3862,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        /* Any pending NMI will happen here */
 
        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
-               kvm_after_interrupt(&svm->vcpu);
+               kvm_after_interrupt(vcpu);
 
        sync_cr8_to_lapic(vcpu);
 
        svm->next_rip = 0;
-       if (is_guest_mode(&svm->vcpu)) {
-               sync_nested_vmcb_control(svm);
+       if (is_guest_mode(vcpu)) {
+               nested_sync_control_from_vmcb02(svm);
                svm->nested.nested_run_pending = 0;
        }
 
@@ -3933,7 +3877,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 
        /* if exit due to PF check for async PF */
        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
-               svm->vcpu.arch.apf.host_apf_flags =
+               vcpu->arch.apf.host_apf_flags =
                        kvm_read_and_reset_apf_flags();
 
        if (npt_enabled) {
@@ -3947,9 +3891,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         */
        if (unlikely(svm->vmcb->control.exit_code ==
                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
-               svm_handle_mce(svm);
+               svm_handle_mce(vcpu);
 
-       svm_complete_interrupts(svm);
+       svm_complete_interrupts(vcpu);
 
        if (is_guest_mode(vcpu))
                return EXIT_FASTPATH_NONE;
@@ -3957,21 +3901,26 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        return svm_exit_handlers_fastpath(vcpu);
 }
 
-static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
                             int root_level)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long cr3;
 
-       cr3 = __sme_set(root);
        if (npt_enabled) {
-               svm->vmcb->control.nested_cr3 = cr3;
+               svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
 
                /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
                if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
                        return;
                cr3 = vcpu->arch.cr3;
+       } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+               cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
+       } else {
+               /* PCID in the guest should be impossible with a 32-bit MMU. */
+               WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
+               cr3 = root_hpa;
        }
 
        svm->vmcb->save.cr3 = cr3;
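
A small sketch of the CR3 composition above for the non-NPT cases: with a 64-bit shadow root the root HPA is combined with the active PCID, while a 32-bit MMU must never see a PCID; the 12-bit PCID mask below is an assumption for illustration, not taken from this patch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t hpa_t;

/* Assumption for illustration: CR3 bits 11:0 carry the PCID when enabled. */
#define PCID_MASK 0xfffULL

static uint64_t build_cr3(hpa_t root_hpa, unsigned int active_pcid, int root_is_64bit)
{
        if (root_is_64bit)
                return root_hpa | (active_pcid & PCID_MASK);

        /* PCID in the guest should be impossible with a 32-bit MMU. */
        assert(active_pcid == 0);
        return root_hpa;
}

int main(void)
{
        printf("%#llx\n", (unsigned long long)build_cr3(0x1234000, 5, 1));
        return 0;
}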
@@ -4048,7 +3997,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 
        /* Update nrips enabled cache */
        svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
-                            guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
+                            guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
 
        /* Check again if INVPCID interception is required */
        svm_check_invpcid(svm);
@@ -4349,15 +4298,15 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
                        if (!(saved_efer & EFER_SVME))
                                return 1;
 
-                       if (kvm_vcpu_map(&svm->vcpu,
+                       if (kvm_vcpu_map(vcpu,
                                         gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
                                return 1;
 
                        if (svm_allocate_nested(svm))
                                return 1;
 
-                       ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
-                       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+                       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva);
+                       kvm_vcpu_unmap(vcpu, &map, true);
                }
        }
 
index 39e071f..8e276c4 100644 (file)
@@ -81,11 +81,19 @@ struct kvm_svm {
 
 struct kvm_vcpu;
 
+struct kvm_vmcb_info {
+       struct vmcb *ptr;
+       unsigned long pa;
+       int cpu;
+       uint64_t asid_generation;
+};
+
 struct svm_nested_state {
-       struct vmcb *hsave;
+       struct kvm_vmcb_info vmcb02;
        u64 hsave_msr;
        u64 vm_cr_msr;
        u64 vmcb12_gpa;
+       u64 last_vmcb12_gpa;
 
        /* These are the merged vectors */
        u32 *msrpm;
@@ -104,9 +112,10 @@ struct vcpu_svm {
        struct kvm_vcpu vcpu;
        struct vmcb *vmcb;
        unsigned long vmcb_pa;
+       struct kvm_vmcb_info vmcb01;
+       struct kvm_vmcb_info *current_vmcb;
        struct svm_cpu_data *svm_data;
        u32 asid;
-       uint64_t asid_generation;
        uint64_t sysenter_esp;
        uint64_t sysenter_eip;
        uint64_t tsc_aux;
@@ -239,17 +248,14 @@ static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
        vmcb->control.clean &= ~(1 << bit);
 }
 
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
 {
-       return container_of(vcpu, struct vcpu_svm, vcpu);
+       return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
 }
 
-static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(&svm->vcpu))
-               return svm->nested.hsave;
-       else
-               return svm->vmcb;
+       return container_of(vcpu, struct vcpu_svm, vcpu);
 }
 
 static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
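
Since every handler now receives a struct kvm_vcpu, nothing is lost: to_svm() recovers the containing vcpu_svm through container_of(). A minimal user-space sketch of that pattern, with demo structs standing in for the KVM ones:

#include <stddef.h>
#include <stdio.h>

/* container_of(): recover the outer struct from a pointer to an embedded member. */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct kvm_vcpu_demo { int id; };

struct vcpu_svm_demo {
        int asid;
        struct kvm_vcpu_demo vcpu;      /* embedded, as in struct vcpu_svm */
};

static struct vcpu_svm_demo *to_svm_demo(struct kvm_vcpu_demo *vcpu)
{
        return container_of(vcpu, struct vcpu_svm_demo, vcpu);
}

int main(void)
{
        struct vcpu_svm_demo svm = { .asid = 3, .vcpu = { .id = 1 } };

        printf("%d\n", to_svm_demo(&svm.vcpu)->asid);   /* prints 3 */
        return 0;
}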
@@ -272,7 +278,7 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
 
 static inline void set_dr_intercepts(struct vcpu_svm *svm)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        if (!sev_es_guest(svm->vcpu.kvm)) {
                vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
@@ -299,7 +305,7 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm)
 
 static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        vmcb->control.intercepts[INTERCEPT_DR] = 0;
 
@@ -314,7 +320,7 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 
 static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        WARN_ON_ONCE(bit >= 32);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
@@ -324,7 +330,7 @@ static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
 
 static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        WARN_ON_ONCE(bit >= 32);
        vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
@@ -334,7 +340,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
 
 static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        vmcb_set_intercept(&vmcb->control, bit);
 
@@ -343,7 +349,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
 
 static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        vmcb_clr_intercept(&vmcb->control, bit);
 
@@ -405,7 +411,7 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu);
 bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
 void svm_set_gif(struct vcpu_svm *svm, bool value);
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code);
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
                          int read, int write);
 
@@ -437,20 +443,30 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
        return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
 }
 
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-                        struct vmcb *nested_vmcb);
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12);
 void svm_leave_nested(struct vcpu_svm *svm);
 void svm_free_nested(struct vcpu_svm *svm);
 int svm_allocate_nested(struct vcpu_svm *svm);
-int nested_svm_vmrun(struct vcpu_svm *svm);
+int nested_svm_vmrun(struct kvm_vcpu *vcpu);
 void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
 int nested_svm_vmexit(struct vcpu_svm *svm);
+
+static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
+{
+       svm->vmcb->control.exit_code   = exit_code;
+       svm->vmcb->control.exit_info_1 = 0;
+       svm->vmcb->control.exit_info_2 = 0;
+       return nested_svm_vmexit(svm);
+}
+
 int nested_svm_exit_handled(struct vcpu_svm *svm);
-int nested_svm_check_permissions(struct vcpu_svm *svm);
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
 int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
                               bool has_error_code, u32 error_code);
 int nested_svm_exit_special(struct vcpu_svm *svm);
-void sync_nested_vmcb_control(struct vcpu_svm *svm);
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
 
 extern struct kvm_x86_nested_ops svm_nested_ops;
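
With intercepts now always programmed into vmcb01, the nested case needs vmcb02 to reflect both L0's and L1's wishes. One plausible way to derive that, shown here as an illustrative user-space sketch rather than the actual KVM code, is a bitwise union of the two intercept bitmaps:

#include <stdint.h>
#include <stdio.h>

#define NR_INTERCEPT_WORDS 5

/* Loosely modeled on the intercept words in vmcb_control_area. */
struct control { uint32_t intercepts[NR_INTERCEPT_WORDS]; };

/* vmcb02 intercepts = what L0 wants (vmcb01) OR what L1 asked for (vmcb12). */
static void merge_intercepts(struct control *c02, const struct control *c01,
                             const struct control *c12)
{
        for (int i = 0; i < NR_INTERCEPT_WORDS; i++)
                c02->intercepts[i] = c01->intercepts[i] | c12->intercepts[i];
}

int main(void)
{
        struct control c01 = { .intercepts = { 0x1 } };
        struct control c12 = { .intercepts = { 0x2, 0x4 } };
        struct control c02 = { 0 };

        merge_intercepts(&c02, &c01, &c12);
        printf("%#x %#x\n", c02.intercepts[0], c02.intercepts[1]);
        return 0;
}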
 
@@ -491,8 +507,8 @@ void avic_vm_destroy(struct kvm *kvm);
 int avic_vm_init(struct kvm *kvm);
 void avic_init_vmcb(struct vcpu_svm *svm);
 void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm);
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm);
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
 int avic_init_vcpu(struct vcpu_svm *svm);
 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void avic_vcpu_put(struct kvm_vcpu *vcpu);
@@ -565,7 +581,7 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu);
 void __init sev_hardware_setup(void);
 void sev_hardware_teardown(void);
 void sev_free_vcpu(struct kvm_vcpu *vcpu);
-int sev_handle_vmgexit(struct vcpu_svm *svm);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
 int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
 void sev_es_init_vmcb(struct vcpu_svm *svm);
 void sev_es_create_vcpu(struct vcpu_svm *svm);
index 6feb8c0..4fa17df 100644 (file)
@@ -79,28 +79,10 @@ SYM_FUNC_START(__svm_vcpu_run)
 
        /* Enter guest mode */
        sti
-1:     vmload %_ASM_AX
-       jmp 3f
-2:     cmpb $0, kvm_rebooting
-       jne 3f
-       ud2
-       _ASM_EXTABLE(1b, 2b)
 
-3:     vmrun %_ASM_AX
-       jmp 5f
-4:     cmpb $0, kvm_rebooting
-       jne 5f
-       ud2
-       _ASM_EXTABLE(3b, 4b)
+1:     vmrun %_ASM_AX
 
-5:     vmsave %_ASM_AX
-       jmp 7f
-6:     cmpb $0, kvm_rebooting
-       jne 7f
-       ud2
-       _ASM_EXTABLE(5b, 6b)
-7:
-       cli
+2:     cli
 
 #ifdef CONFIG_RETPOLINE
        /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
@@ -167,6 +149,13 @@ SYM_FUNC_START(__svm_vcpu_run)
 #endif
        pop %_ASM_BP
        ret
+
+3:     cmpb $0, kvm_rebooting
+       jne 2b
+       ud2
+
+       _ASM_EXTABLE(1b, 3b)
+
 SYM_FUNC_END(__svm_vcpu_run)
 
 /**
@@ -186,18 +175,15 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
 #endif
        push %_ASM_BX
 
-       /* Enter guest mode */
+       /* Move @vmcb to RAX. */
        mov %_ASM_ARG1, %_ASM_AX
+
+       /* Enter guest mode */
        sti
 
 1:     vmrun %_ASM_AX
-       jmp 3f
-2:     cmpb $0, kvm_rebooting
-       jne 3f
-       ud2
-       _ASM_EXTABLE(1b, 2b)
 
-3:     cli
+2:     cli
 
 #ifdef CONFIG_RETPOLINE
        /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
@@ -217,4 +203,11 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
 #endif
        pop %_ASM_BP
        ret
+
+3:     cmpb $0, kvm_rebooting
+       jne 2b
+       ud2
+
+       _ASM_EXTABLE(1b, 3b)
+
 SYM_FUNC_END(__svm_sev_es_vcpu_run)
index bcca0b8..fd334e4 100644 (file)
@@ -21,13 +21,7 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 static bool __read_mostly nested_early_check = 0;
 module_param(nested_early_check, bool, S_IRUGO);
 
-#define CC(consistency_check)                                          \
-({                                                                     \
-       bool failed = (consistency_check);                              \
-       if (failed)                                                     \
-               trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
-       failed;                                                         \
-})
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
 
 /*
  * Hyper-V requires all of these, so mark them as supported even though
@@ -3453,6 +3447,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
        enum nested_evmptrld_status evmptrld_status;
 
+       ++vcpu->stat.nested_run;
+
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
@@ -4422,6 +4418,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
+       /* Similarly, triple faults in L2 should never escape. */
+       WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+
        kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
        /* Service the TLB flush request for L2 before switching to L1. */
@@ -4558,6 +4557,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        vmx->fail = 0;
 }
 
+static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
+{
+       nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+}
+
 /*
  * Decode the memory-address operand of a vmx instruction, as recorded on an
  * exit caused by such an instruction (run by a guest hypervisor).
@@ -5479,16 +5483,11 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
                if (!nested_vmx_check_eptp(vcpu, new_eptp))
                        return 1;
 
-               kvm_mmu_unload(vcpu);
                mmu->ept_ad = accessed_dirty;
                mmu->mmu_role.base.ad_disabled = !accessed_dirty;
                vmcs12->ept_pointer = new_eptp;
-               /*
-                * TODO: Check what's the correct approach in case
-                * mmu reload fails. Currently, we just let the next
-                * reload potentially fail
-                */
-               kvm_mmu_reload(vcpu);
+
+               kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
        }
 
        return 0;
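
A minimal sketch of the deferred-work pattern the change above relies on: instead of reloading the MMU synchronously, a request bit is posted and serviced once at the next opportunity (compare kvm_make_request() and kvm_check_request() elsewhere in this series); the bit values and vcpu struct below are illustrative:

#include <stdbool.h>
#include <stdio.h>

#define REQ_MMU_RELOAD   (1u << 0)
#define REQ_TRIPLE_FAULT (1u << 1)

struct vcpu { unsigned int requests; };

static void make_request(struct vcpu *v, unsigned int req)
{
        v->requests |= req;
}

/* Returns true once per posted request, then clears it. */
static bool check_request(struct vcpu *v, unsigned int req)
{
        if (!(v->requests & req))
                return false;
        v->requests &= ~req;
        return true;
}

int main(void)
{
        struct vcpu v = { 0 };

        make_request(&v, REQ_MMU_RELOAD);       /* e.g. after an EPTP switch */

        /* Later, on the entry path, pending work is serviced exactly once. */
        if (check_request(&v, REQ_MMU_RELOAD))
                puts("reloading MMU");
        if (check_request(&v, REQ_MMU_RELOAD))
                puts("never printed");
        return 0;
}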
@@ -6599,6 +6598,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 struct kvm_x86_nested_ops vmx_nested_ops = {
        .check_events = vmx_check_nested_events,
        .hv_timer_pending = nested_vmx_preemption_timer_pending,
+       .triple_fault = nested_vmx_triple_fault,
        .get_state = vmx_get_nested_state,
        .set_state = vmx_set_nested_state,
        .get_nested_state_pages = vmx_get_nested_state_pages,
index 32cf828..c8a4a54 100644 (file)
@@ -472,26 +472,6 @@ static const u32 vmx_uret_msrs_list[] = {
 static bool __read_mostly enlightened_vmcs = true;
 module_param(enlightened_vmcs, bool, 0444);
 
-/* check_ept_pointer() should be under protection of ept_pointer_lock. */
-static void check_ept_pointer_match(struct kvm *kvm)
-{
-       struct kvm_vcpu *vcpu;
-       u64 tmp_eptp = INVALID_PAGE;
-       int i;
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (!VALID_PAGE(tmp_eptp)) {
-                       tmp_eptp = to_vmx(vcpu)->ept_pointer;
-               } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
-                       to_kvm_vmx(kvm)->ept_pointers_match
-                               = EPT_POINTERS_MISMATCH;
-                       return;
-               }
-       }
-
-       to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
-}
-
 static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
                void *data)
 {
@@ -501,47 +481,70 @@ static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush
                        range->pages);
 }
 
-static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
-               struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
+static inline int hv_remote_flush_root_ept(hpa_t root_ept,
+                                          struct kvm_tlb_range *range)
 {
-       u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
-
-       /*
-        * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
-        * of the base of EPT PML4 table, strip off EPT configuration
-        * information.
-        */
        if (range)
-               return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
+               return hyperv_flush_guest_mapping_range(root_ept,
                                kvm_fill_hv_flush_list_func, (void *)range);
        else
-               return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
+               return hyperv_flush_guest_mapping(root_ept);
 }
 
 static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
                struct kvm_tlb_range *range)
 {
+       struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
        struct kvm_vcpu *vcpu;
-       int ret = 0, i;
+       int ret = 0, i, nr_unique_valid_roots;
+       hpa_t root;
 
-       spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+       spin_lock(&kvm_vmx->hv_root_ept_lock);
 
-       if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
-               check_ept_pointer_match(kvm);
+       if (!VALID_PAGE(kvm_vmx->hv_root_ept)) {
+               nr_unique_valid_roots = 0;
 
-       if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+               /*
+                * Flush all valid roots, and see if all vCPUs have converged
+                * on a common root, in which case future flushes can skip the
+                * loop and flush the common root.
+                */
                kvm_for_each_vcpu(i, vcpu, kvm) {
-                       /* If ept_pointer is invalid pointer, bypass flush request. */
-                       if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
-                               ret |= __hv_remote_flush_tlb_with_range(
-                                       kvm, vcpu, range);
+                       root = to_vmx(vcpu)->hv_root_ept;
+                       if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept)
+                               continue;
+
+                       /*
+                        * Set the tracked root to the first valid root.  Keep
+                        * this root for the entirety of the loop even if more
+                        * roots are encountered as a low effort optimization
+                        * to avoid flushing the same (first) root again.
+                        */
+                       if (++nr_unique_valid_roots == 1)
+                               kvm_vmx->hv_root_ept = root;
+
+                       if (!ret)
+                               ret = hv_remote_flush_root_ept(root, range);
+
+                       /*
+                        * Stop processing roots if a failure occurred and
+                        * multiple valid roots have already been detected.
+                        */
+                       if (ret && nr_unique_valid_roots > 1)
+                               break;
                }
+
+               /*
+                * The optimized flush of a single root can't be used if there
+                * are multiple valid roots (obviously).
+                */
+               if (nr_unique_valid_roots > 1)
+                       kvm_vmx->hv_root_ept = INVALID_PAGE;
        } else {
-               ret = __hv_remote_flush_tlb_with_range(kvm,
-                               kvm_get_vcpu(kvm, 0), range);
+               ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range);
        }
 
-       spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+       spin_unlock(&kvm_vmx->hv_root_ept_lock);
        return ret;
 }
 static int hv_remote_flush_tlb(struct kvm *kvm)
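
A user-space sketch of the root-convergence tracking above: a single VM-wide cached root stands in for hv_root_ept, is invalidated as soon as two vCPUs disagree, and lets later remote flushes touch only one root; the flush function and root values below are illustrative stand-ins for the Hyper-V hypercalls:

#include <stdint.h>
#include <stdio.h>

#define INVALID_ROOT 0ULL
#define NR_VCPUS 3

static uint64_t vcpu_root[NR_VCPUS];           /* per-vCPU root (0 = not set) */
static uint64_t common_root = INVALID_ROOT;    /* analogue of hv_root_ept */

/* Stand-in for the per-root hypervisor flush hypercall. */
static void flush_root(uint64_t root)
{
        printf("flush %#llx\n", (unsigned long long)root);
}

static void remote_flush(void)
{
        if (common_root != INVALID_ROOT) {
                flush_root(common_root);       /* fast path: one shared root */
                return;
        }

        int nr_unique = 0;
        for (int i = 0; i < NR_VCPUS; i++) {
                uint64_t root = vcpu_root[i];

                if (root == INVALID_ROOT || root == common_root)
                        continue;
                if (++nr_unique == 1)
                        common_root = root;    /* tentatively track the first root */
                flush_root(root);
        }
        if (nr_unique > 1)
                common_root = INVALID_ROOT;    /* vCPUs have not converged */
}

int main(void)
{
        vcpu_root[0] = vcpu_root[1] = vcpu_root[2] = 0x5000;
        remote_flush();   /* slow path, then converges on the common root */
        remote_flush();   /* fast path: a single flush of the common root */
        return 0;
}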
@@ -576,6 +579,21 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
 
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
+static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept)
+{
+#if IS_ENABLED(CONFIG_HYPERV)
+       struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+
+       if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+               spin_lock(&kvm_vmx->hv_root_ept_lock);
+               to_vmx(vcpu)->hv_root_ept = root_ept;
+               if (root_ept != kvm_vmx->hv_root_ept)
+                       kvm_vmx->hv_root_ept = INVALID_PAGE;
+               spin_unlock(&kvm_vmx->hv_root_ept_lock);
+       }
+#endif
+}
+
 /*
  * Comment's format: document - errata name - stepping - processor name.
  * Refer from
@@ -3088,8 +3106,7 @@ static int vmx_get_max_tdp_level(void)
        return 4;
 }
 
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
-                  int root_level)
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
 {
        u64 eptp = VMX_EPTP_MT_WB;
 
@@ -3098,13 +3115,13 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
        if (enable_ept_ad_bits &&
            (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
                eptp |= VMX_EPTP_AD_ENABLE_BIT;
-       eptp |= (root_hpa & PAGE_MASK);
+       eptp |= root_hpa;
 
        return eptp;
 }
 
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
-                            int pgd_level)
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+                            int root_level)
 {
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
@@ -3112,16 +3129,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
        u64 eptp;
 
        if (enable_ept) {
-               eptp = construct_eptp(vcpu, pgd, pgd_level);
+               eptp = construct_eptp(vcpu, root_hpa, root_level);
                vmcs_write64(EPT_POINTER, eptp);
 
-               if (kvm_x86_ops.tlb_remote_flush) {
-                       spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-                       to_vmx(vcpu)->ept_pointer = eptp;
-                       to_kvm_vmx(kvm)->ept_pointers_match
-                               = EPT_POINTERS_CHECK;
-                       spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-               }
+               hv_track_root_ept(vcpu, root_hpa);
 
                if (!enable_unrestricted_guest && !is_paging(vcpu))
                        guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
@@ -3131,7 +3142,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
                        update_guest_cr3 = false;
                vmx_ept_load_pdptrs(vcpu);
        } else {
-               guest_cr3 = pgd;
+               guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
        }
 
        if (update_guest_cr3)
@@ -4314,15 +4325,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        vmx->secondary_exec_control = exec_control;
 }
 
-static void ept_set_mmio_spte_mask(void)
-{
-       /*
-        * EPT Misconfigurations can be generated if the value of bits 2:0
-        * of an EPT paging-structure entry is 110b (write/execute).
-        */
-       kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
-}
-
 #define VMX_XSS_EXIT_BITMAP 0
 
 /*
@@ -5184,17 +5186,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-static int handle_vmcall(struct kvm_vcpu *vcpu)
-{
-       return kvm_emulate_hypercall(vcpu);
-}
-
-static int handle_invd(struct kvm_vcpu *vcpu)
-{
-       /* Treat an INVD instruction as a NOP and just skip it. */
-       return kvm_skip_emulated_instruction(vcpu);
-}
-
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
@@ -5203,28 +5194,6 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int handle_rdpmc(struct kvm_vcpu *vcpu)
-{
-       int err;
-
-       err = kvm_rdpmc(vcpu);
-       return kvm_complete_insn_gp(vcpu, err);
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu)
-{
-       return kvm_emulate_wbinvd(vcpu);
-}
-
-static int handle_xsetbv(struct kvm_vcpu *vcpu)
-{
-       u64 new_bv = kvm_read_edx_eax(vcpu);
-       u32 index = kvm_rcx_read(vcpu);
-
-       int err = kvm_set_xcr(vcpu, index, new_bv);
-       return kvm_complete_insn_gp(vcpu, err);
-}
-
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
        if (likely(fasteoi)) {
@@ -5485,18 +5454,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
        }
 }
 
-static void vmx_enable_tdp(void)
-{
-       kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
-               enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
-               enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
-               0ull, VMX_EPT_EXECUTABLE_MASK,
-               cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
-               VMX_EPT_RWX_MASK, 0ull);
-
-       ept_set_mmio_spte_mask();
-}
-
 /*
  * Indicate a busy-waiting vcpu in spinlock. We do not enable PAUSE
  * exiting, so we only get here on a cpu with PAUSE-Loop-Exiting.
@@ -5516,34 +5473,11 @@ static int handle_pause(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int handle_nop(struct kvm_vcpu *vcpu)
-{
-       return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_mwait(struct kvm_vcpu *vcpu)
-{
-       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
-       return handle_nop(vcpu);
-}
-
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
-{
-       kvm_queue_exception(vcpu, UD_VECTOR);
-       return 1;
-}
-
 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
 {
        return 1;
 }
 
-static int handle_monitor(struct kvm_vcpu *vcpu)
-{
-       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
-       return handle_nop(vcpu);
-}
-
 static int handle_invpcid(struct kvm_vcpu *vcpu)
 {
        u32 vmx_instruction_info;
@@ -5668,10 +5602,10 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
        [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
        [EXIT_REASON_HLT]                     = kvm_emulate_halt,
-       [EXIT_REASON_INVD]                    = handle_invd,
+       [EXIT_REASON_INVD]                    = kvm_emulate_invd,
        [EXIT_REASON_INVLPG]                  = handle_invlpg,
-       [EXIT_REASON_RDPMC]                   = handle_rdpmc,
-       [EXIT_REASON_VMCALL]                  = handle_vmcall,
+       [EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
+       [EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
        [EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
        [EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
        [EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
@@ -5685,8 +5619,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
        [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
        [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
-       [EXIT_REASON_WBINVD]                  = handle_wbinvd,
-       [EXIT_REASON_XSETBV]                  = handle_xsetbv,
+       [EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
+       [EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
        [EXIT_REASON_GDTR_IDTR]               = handle_desc,
@@ -5694,13 +5628,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
-       [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_mwait,
+       [EXIT_REASON_MWAIT_INSTRUCTION]       = kvm_emulate_mwait,
        [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
-       [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
+       [EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
        [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
        [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
-       [EXIT_REASON_RDRAND]                  = handle_invalid_op,
-       [EXIT_REASON_RDSEED]                  = handle_invalid_op,
+       [EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
+       [EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
        [EXIT_REASON_INVPCID]                 = handle_invpcid,
        [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
@@ -6989,8 +6923,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        vmx->pi_desc.nv = POSTED_INTR_VECTOR;
        vmx->pi_desc.sn = 1;
 
-       vmx->ept_pointer = INVALID_PAGE;
-
+#if IS_ENABLED(CONFIG_HYPERV)
+       vmx->hv_root_ept = INVALID_PAGE;
+#endif
        return 0;
 
 free_vmcs:
@@ -7007,7 +6942,9 @@ free_vpid:
 
 static int vmx_vm_init(struct kvm *kvm)
 {
-       spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+#if IS_ENABLED(CONFIG_HYPERV)
+       spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock);
+#endif
 
        if (!ple_gap)
                kvm->arch.pause_in_guest = true;
@@ -7848,7 +7785,8 @@ static __init int hardware_setup(void)
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
        if (enable_ept)
-               vmx_enable_tdp();
+               kvm_mmu_set_ept_masks(enable_ept_ad_bits,
+                                     cpu_has_vmx_ept_execute_only());
 
        if (!enable_ept)
                ept_lpage_level = 0;
index 89da5e1..0fb3236 100644 (file)
@@ -325,7 +325,9 @@ struct vcpu_vmx {
         */
        u64 msr_ia32_feature_control;
        u64 msr_ia32_feature_control_valid_bits;
-       u64 ept_pointer;
+#if IS_ENABLED(CONFIG_HYPERV)
+       u64 hv_root_ept;
+#endif
 
        struct pt_desc pt_desc;
        struct lbr_desc lbr_desc;
@@ -338,12 +340,6 @@ struct vcpu_vmx {
        } shadow_msr_intercept;
 };
 
-enum ept_pointers_status {
-       EPT_POINTERS_CHECK = 0,
-       EPT_POINTERS_MATCH = 1,
-       EPT_POINTERS_MISMATCH = 2
-};
-
 struct kvm_vmx {
        struct kvm kvm;
 
@@ -351,8 +347,10 @@ struct kvm_vmx {
        bool ept_identity_pagetable_done;
        gpa_t ept_identity_map_addr;
 
-       enum ept_pointers_status ept_pointers_match;
-       spinlock_t ept_pointer_lock;
+#if IS_ENABLED(CONFIG_HYPERV)
+       hpa_t hv_root_ept;
+       spinlock_t hv_root_ept_lock;
+#endif
 };
 
 bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
@@ -376,8 +374,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
 void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
-                  int root_level);
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
 
 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
index 47e021b..a9d95f9 100644 (file)
@@ -245,6 +245,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("l1d_flush", l1d_flush),
        VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
        VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+       VCPU_STAT("nested_run", nested_run),
        VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
        VM_STAT("mmu_pte_write", mmu_pte_write),
        VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
@@ -544,8 +545,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
        if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
        queue:
-               if (has_error && !is_protmode(vcpu))
-                       has_error = false;
                if (reinject) {
                        /*
                         * On vmentry, vcpu->arch.exception.pending is only
@@ -984,14 +983,17 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
        return 0;
 }
 
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
 {
-       if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
-               return __kvm_set_xcr(vcpu, index, xcr);
+       if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
+           __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
 
-       return 1;
+       return kvm_skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_set_xcr);
+EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
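kvm_emulate_xsetbv() reads its operands straight from the guest registers: ECX selects the XCR and EDX:EAX supply the 64-bit value, per the XSETBV encoding. A small user-space sketch of that operand assembly, with read_edx_eax() as a hypothetical stand-in for kvm_read_edx_eax():

#include <stdint.h>
#include <stdio.h>

/* Combine the two 32-bit halves the way kvm_read_edx_eax() does. */
static uint64_t read_edx_eax(uint32_t edx, uint32_t eax)
{
        return ((uint64_t)edx << 32) | eax;
}

int main(void)
{
        uint32_t ecx = 0;                       /* XCR0 index */
        uint32_t eax = 0x7, edx = 0x0;          /* x87 | SSE | AVX state bits */

        printf("xcr%u <- 0x%llx\n", (unsigned)ecx,
               (unsigned long long)read_edx_eax(edx, eax));
        return 0;
}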
 
 bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
@@ -1192,20 +1194,21 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
 
-bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
 {
        u32 ecx = kvm_rcx_read(vcpu);
        u64 data;
-       int err;
 
-       err = kvm_pmu_rdpmc(vcpu, ecx, &data);
-       if (err)
-               return err;
+       if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
        kvm_rax_write(vcpu, (u32)data);
        kvm_rdx_write(vcpu, data >> 32);
-       return err;
+       return kvm_skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_rdpmc);
+EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
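kvm_emulate_rdpmc() also takes over the write-back: the 64-bit counter is split so the low half lands in RAX and the high half in RDX. A tiny sketch of that split (plain C, not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t data = 0x0123456789abcdefULL;  /* example counter value */
        uint32_t eax = (uint32_t)data;          /* low 32 bits  -> RAX */
        uint32_t edx = (uint32_t)(data >> 32);  /* high 32 bits -> RDX */

        printf("eax=0x%08x edx=0x%08x\n", (unsigned)eax, (unsigned)edx);
        return 0;
}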
 
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -1783,6 +1786,40 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
 
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
+{
+       return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
+
+int kvm_emulate_invd(struct kvm_vcpu *vcpu)
+{
+       /* Treat an INVD instruction as a NOP and just skip it. */
+       return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_invd);
+
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+       pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n");
+       return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
+
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+       pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n");
+       return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
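These helpers give vendor code one shared set of endpoints: INVD, MWAIT and MONITOR reduce to skipping the instruction, while kvm_handle_invalid_op() queues #UD for exits that should never occur. A hedged sketch of how an exit table might route to them; the vcpu type, exit indices and handler names below are illustrative stand-ins, not the real VMX or SVM tables:

#include <stdio.h>

struct vcpu;                                    /* opaque stand-in for struct kvm_vcpu */
typedef int (*exit_handler_t)(struct vcpu *vcpu);

/* Return 1, i.e. "resume the guest", mirroring KVM's handler convention. */
static int emulate_as_nop(struct vcpu *vcpu)    { (void)vcpu; return 1; }
static int handle_invalid_op(struct vcpu *vcpu) { (void)vcpu; return 1; }

enum { EXIT_INVD, EXIT_MWAIT, EXIT_MONITOR, NR_EXITS };

static const exit_handler_t handlers[NR_EXITS] = {
        [EXIT_INVD]    = emulate_as_nop,        /* cf. kvm_emulate_invd */
        [EXIT_MWAIT]   = handle_invalid_op,     /* cf. kvm_handle_invalid_op, for an
                                                   exit that is not supposed to happen */
        [EXIT_MONITOR] = emulate_as_nop,        /* cf. kvm_emulate_monitor */
};

int main(void)
{
        printf("%d exit handlers wired up\n", (int)NR_EXITS);
        return handlers[EXIT_INVD] ? 0 : 1;
}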
+
 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
 {
        xfer_to_guest_mode_prepare();
@@ -8004,9 +8041,6 @@ int kvm_arch_init(void *opaque)
        if (r)
                goto out_free_percpu;
 
-       kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-                       PT_DIRTY_MASK, PT64_NX_MASK, 0,
-                       PT_PRESENT_MASK, 0, sme_me_mask);
        kvm_timer_init();
 
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -8328,6 +8362,27 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
 }
 
+int kvm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+       if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+               return -EIO;
+
+       if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+               kvm_x86_ops.nested_ops->triple_fault(vcpu);
+               return 1;
+       }
+
+       return kvm_x86_ops.nested_ops->check_events(vcpu);
+}
+
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.exception.has_error_code && !is_protmode(vcpu))
+               vcpu->arch.exception.has_error_code = false;
+       static_call(kvm_x86_queue_exception)(vcpu);
+}
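kvm_check_nested_events() becomes the single entry point for nested event checks: it warns and returns -EIO when called outside guest mode, consumes a pending triple-fault request by synthesizing a nested exit, and only then calls the vendor check_events hook. A compact stand-alone model of that ordering, using hypothetical stand-ins for the request flag and the vendor hooks:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the state and hooks the wrapper touches. */
static bool guest_mode = true;
static bool triple_fault_requested = true;

static void nested_triple_fault(void)   { puts("synthesize nested exit to L1"); }
static int  nested_check_events(void)   { return 0; }   /* vendor hook stand-in */

static int check_nested_events(void)
{
        if (!guest_mode)
                return -5;                      /* -EIO: caller bug */

        if (triple_fault_requested) {           /* consume the request first */
                triple_fault_requested = false;
                nested_triple_fault();
                return 1;
        }

        return nested_check_events();           /* then the vendor hook */
}

int main(void)
{
        printf("first call: %d, second call: %d\n",
               check_nested_events(), check_nested_events());
        return 0;
}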
+
 static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 {
        int r;
@@ -8336,7 +8391,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        /* try to reinject previous events if any */
 
        if (vcpu->arch.exception.injected) {
-               static_call(kvm_x86_queue_exception)(vcpu);
+               kvm_inject_exception(vcpu);
                can_inject = false;
        }
        /*
@@ -8373,7 +8428,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
         * from L2 to L1.
         */
        if (is_guest_mode(vcpu)) {
-               r = kvm_x86_ops.nested_ops->check_events(vcpu);
+               r = kvm_check_nested_events(vcpu);
                if (r < 0)
                        goto busy;
        }
@@ -8399,7 +8454,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
                        }
                }
 
-               static_call(kvm_x86_queue_exception)(vcpu);
+               kvm_inject_exception(vcpu);
                can_inject = false;
        }
 
@@ -8936,10 +8991,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        goto out;
                }
                if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
-                       vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
-                       vcpu->mmio_needed = 0;
-                       r = 0;
-                       goto out;
+                       if (is_guest_mode(vcpu)) {
+                               kvm_x86_ops.nested_ops->triple_fault(vcpu);
+                       } else {
+                               vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+                               vcpu->mmio_needed = 0;
+                               r = 0;
+                               goto out;
+                       }
                }
                if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
                        /* Page is swapped out. Do synthetic halt */
@@ -9237,7 +9296,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
        if (is_guest_mode(vcpu))
-               kvm_x86_ops.nested_ops->check_events(vcpu);
+               kvm_check_nested_events(vcpu);
 
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
@@ -11503,7 +11562,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 
                fallthrough;
        case INVPCID_TYPE_ALL_INCL_GLOBAL:
-               kvm_mmu_unload(vcpu);
+               kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
                return kvm_skip_emulated_instruction(vcpu);
 
        default:
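The global INVPCID types no longer unload the MMU synchronously; the handler just raises KVM_REQ_MMU_RELOAD and the vcpu-entry path picks it up. A minimal sketch of that request-bit pattern, with make_request()/check_request() as user-space stand-ins for the kernel's kvm_make_request()/kvm_check_request() (which use atomic bitops):

#include <stdbool.h>
#include <stdio.h>

#define REQ_MMU_RELOAD (1u << 0)                /* hypothetical request bit */

static unsigned int requests;

static void make_request(unsigned int req)      { requests |= req; }

static bool check_request(unsigned int req)
{
        if (requests & req) {
                requests &= ~req;               /* consumed exactly once */
                return true;
        }
        return false;
}

int main(void)
{
        make_request(REQ_MMU_RELOAD);           /* e.g. from the INVPCID handler */

        if (check_request(REQ_MMU_RELOAD))      /* later, in the vcpu entry loop */
                printf("reloading MMU roots before guest entry\n");
        return 0;
}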
index 39eb048..daccf20 100644 (file)
@@ -8,6 +8,14 @@
 #include "kvm_cache_regs.h"
 #include "kvm_emulate.h"
 
+#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check)                \
+({                                                                     \
+       bool failed = (consistency_check);                              \
+       if (failed)                                                     \
+               trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
+       failed;                                                         \
+})
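KVM_NESTED_VMENTER_CONSISTENCY_CHECK is a GCC statement expression: it evaluates the check once, traces the stringified expression on failure, and yields the boolean so it can sit directly inside an if (). A user-space sketch of the same pattern (needs gcc or clang), with a hypothetical trace_failed() in place of trace_kvm_nested_vmenter_failed():

#include <stdbool.h>
#include <stdio.h>

static void trace_failed(const char *check)     /* hypothetical trace stand-in */
{
        fprintf(stderr, "consistency check failed: %s\n", check);
}

#define CONSISTENCY_CHECK(consistency_check)            \
({                                                      \
        bool failed = (consistency_check);              \
        if (failed)                                     \
                trace_failed(#consistency_check);       \
        failed;                                         \
})

int main(void)
{
        int reserved_bits = 1;

        if (CONSISTENCY_CHECK(reserved_bits != 0))      /* traces "reserved_bits != 0" */
                return 1;
        return 0;
}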
+
 #define KVM_DEFAULT_PLE_GAP            128
 #define KVM_VMX_DEFAULT_PLE_WINDOW     4096
 #define KVM_DEFAULT_PLE_WINDOW_GROW    2
@@ -48,6 +56,8 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
 
 #define MSR_IA32_CR_PAT_DEFAULT  0x0007040600070406ULL
 
+int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+
 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.exception.pending = false;