Merge branch 'kvm-tdp-mmu-atomicity-fix' into HEAD

author Paolo Bonzini <pbonzini@redhat.com>

Tue, 3 May 2022 11:23:08 +0000 (07:23 -0400)

committer Paolo Bonzini <pbonzini@redhat.com>

Tue, 3 May 2022 11:23:08 +0000 (07:23 -0400)
author Paolo Bonzini <pbonzini@redhat.com>
Tue, 3 May 2022 11:23:08 +0000 (07:23 -0400)
committer Paolo Bonzini <pbonzini@redhat.com>
Tue, 3 May 2022 11:23:08 +0000 (07:23 -0400)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c

index 64a2a7e..311e4e1 100644 (file)
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -473,30 +473,6 @@ retry:
  }
  #endif
  
-static bool spte_has_volatile_bits(u64 spte)
-{
-       if (!is_shadow_present_pte(spte))
-               return false;
-
-       /*
-        * Always atomically update spte if it can be updated
-        * out of mmu-lock, it can ensure dirty bit is not lost,
-        * also, it can help us to get a stable is_writable_pte()
-        * to ensure tlb flush is not missed.
-        */
-       if (spte_can_locklessly_be_made_writable(spte) ||
-           is_access_track_spte(spte))
-               return true;
-
-       if (spte_ad_enabled(spte)) {
-               if ((spte & shadow_accessed_mask) == 0 ||
-                   (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
-                       return true;
-       }
-
-       return false;
-}
-
  /* Rules for using mmu_spte_set:
   * Set the sptep from nonpresent to present.
   * Note: the sptep being assigned *must* be either not present
@@ -557,7 +533,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
          * we always atomically update it, see the comments in
          * spte_has_volatile_bits().
          */
-       if (spte_can_locklessly_be_made_writable(old_spte) &&
+       if (is_mmu_writable_spte(old_spte) &&
               !is_writable_pte(new_spte))
                 flush = true;
  
@@ -591,7 +567,8 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
         u64 old_spte = *sptep;
         int level = sptep_to_sp(sptep)->role.level;
  
-       if (!spte_has_volatile_bits(old_spte))
+       if (!is_shadow_present_pte(old_spte) ||
+           !spte_has_volatile_bits(old_spte))
                 __update_clear_spte_fast(sptep, 0ull);
         else
                 old_spte = __update_clear_spte_slow(sptep, 0ull);
@@ -1187,7 +1164,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
         u64 spte = *sptep;
  
         if (!is_writable_pte(spte) &&
-             !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
+           !(pt_protect && is_mmu_writable_spte(spte)))
                 return false;
  
         rmap_printk("spte %p %llx\n", sptep, *sptep);
@@ -3196,8 +3173,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
                  * be removed in the fast path only if the SPTE was
                  * write-protected for dirty-logging or access tracking.
                  */
-               if (fault->write &&
-                   spte_can_locklessly_be_made_writable(spte)) {
+               if (fault->write && is_mmu_writable_spte(spte)) {
                         new_spte |= PT_WRITABLE_MASK;
  
                         /*
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c

index 4739b53..e5c0b6d 100644 (file)
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -90,6 +90,34 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
                                      E820_TYPE_RAM);
  }
  
+/*
+ * Returns true if the SPTE has bits that may be set without holding mmu_lock.
+ * The caller is responsible for checking if the SPTE is shadow-present, and
+ * for determining whether or not the caller cares about non-leaf SPTEs.
+ */
+bool spte_has_volatile_bits(u64 spte)
+{
+       /*
+        * Always atomically update the SPTE if it can be updated
+        * outside of mmu_lock.  Doing so ensures the Dirty bit is not
+        * lost, and also yields a stable is_writable_pte() result so
+        * that a required TLB flush is not missed.
+        */
+       if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
+               return true;
+
+       if (is_access_track_spte(spte))
+               return true;
+
+       if (spte_ad_enabled(spte)) {
+               if (!(spte & shadow_accessed_mask) ||
+                   (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
+                       return true;
+       }
+
+       return false;
+}
+
  bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                const struct kvm_memory_slot *slot,
                unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h

index e4abeb5..80ab0f5 100644 (file)
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -390,7 +390,7 @@ static inline void check_spte_writable_invariants(u64 spte)
                           "kvm: Writable SPTE is not MMU-writable: %llx", spte);
  }
  
-static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+static inline bool is_mmu_writable_spte(u64 spte)
  {
         return spte & shadow_mmu_writable_mask;
  }
@@ -404,6 +404,8 @@ static inline u64 get_mmio_spte_generation(u64 spte)
         return gen;
  }
  
+bool spte_has_volatile_bits(u64 spte);
+
  bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                const struct kvm_memory_slot *slot,
                unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h

index b1eaf6e..f0af385 100644 (file)
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -6,6 +6,7 @@
  #include <linux/kvm_host.h>
  
  #include "mmu.h"
+#include "spte.h"
  
  /*
   * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
@@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
  {
         return READ_ONCE(*rcu_dereference(sptep));
  }
-static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val)
+
+static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
+{
+       return xchg(rcu_dereference(sptep), new_spte);
+}
+
+static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
+{
+       WRITE_ONCE(*rcu_dereference(sptep), new_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+                                        u64 new_spte, int level)
  {
-       WRITE_ONCE(*rcu_dereference(sptep), val);
+       /*
+        * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
+        * volatile bits, i.e. has bits that can be set outside of mmu_lock.
+        * The Writable bit can be set by KVM's fast page fault handler, and
+        * Accessed and Dirty bits can be set by the CPU.
+        *
+        * Note, non-leaf SPTEs do have Accessed bits and those bits are
+        * technically volatile, but KVM doesn't consume the Accessed bit of
+        * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit.  This
+        * logic needs to be reassessed if KVM were to use non-leaf Accessed
+        * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+        */
+       if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
+           spte_has_volatile_bits(old_spte))
+               return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
+
+       __kvm_tdp_mmu_write_spte(sptep, new_spte);
+       return old_spte;
  }
  
  /*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c

index edc6853..922b06b 100644 (file)
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
         tdp_mmu_unlink_sp(kvm, sp, shared);
  
         for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-               u64 *sptep = rcu_dereference(pt) + i;
+               tdp_ptep_t sptep = pt + i;
                 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
-               u64 old_child_spte;
+               u64 old_spte;
  
                 if (shared) {
                         /*
@@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
                          * value to the removed SPTE value.
                          */
                         for (;;) {
-                               old_child_spte = xchg(sptep, REMOVED_SPTE);
-                               if (!is_removed_spte(old_child_spte))
+                               old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
+                               if (!is_removed_spte(old_spte))
                                         break;
                                 cpu_relax();
                         }
@@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
                          * are guarded by the memslots generation, not by being
                          * unreachable.
                          */
-                       old_child_spte = READ_ONCE(*sptep);
-                       if (!is_shadow_present_pte(old_child_spte))
+                       old_spte = kvm_tdp_mmu_read_spte(sptep);
+                       if (!is_shadow_present_pte(old_spte))
                                 continue;
  
                         /*
-                        * Marking the SPTE as a removed SPTE is not
-                        * strictly necessary here as the MMU lock will
-                        * stop other threads from concurrently modifying
-                        * this SPTE. Using the removed SPTE value keeps
-                        * the two branches consistent and simplifies
-                        * the function.
+                        * Use the common helper instead of a raw WRITE_ONCE as
+                        * the SPTE needs to be updated atomically if it can be
+                        * modified by a different vCPU outside of mmu_lock.
+                        * Even though the parent SPTE is !PRESENT, the TLB
+                        * hasn't yet been flushed, and both Intel and AMD
+                        * document that A/D assists can use upper-level PxE
+                        * entries that are cached in the TLB, i.e. the CPU can
+                        * still access the page and mark it dirty.
+                        *
+                        * No retry is needed in the atomic update path as the
+                        * sole concern is dropping a Dirty bit, i.e. no other
+                        * task can zap/remove the SPTE as mmu_lock is held for
+                        * write.  Marking the SPTE as a removed SPTE is not
+                        * strictly necessary for the same reason, but using
+                        * the removed SPTE value keeps the shared/exclusive
+                        * paths consistent and allows the handle_changed_spte()
+                        * call below to hardcode the new value to REMOVED_SPTE.
+                        *
+                        * Note, even though dropping a Dirty bit is the only
+                        * scenario where a non-atomic update could result in a
+                        * functional bug, simply checking the Dirty bit isn't
+                        * sufficient as a fast page fault could read the upper
+                        * level SPTE before it is zapped, and then make this
+                        * target SPTE writable, resume the guest, and set the
+                        * Dirty bit between reading the SPTE above and writing
+                        * it here.
                          */
-                       WRITE_ONCE(*sptep, REMOVED_SPTE);
+                       old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
+                                                         REMOVED_SPTE, level);
                 }
                 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
-                                   old_child_spte, REMOVED_SPTE, level,
-                                   shared);
+                                   old_spte, REMOVED_SPTE, level, shared);
         }
  
         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
                                            KVM_PAGES_PER_HPAGE(iter->level));
  
         /*
-        * No other thread can overwrite the removed SPTE as they
-        * must either wait on the MMU lock or use
-        * tdp_mmu_set_spte_atomic which will not overwrite the
-        * special removed SPTE value. No bookkeeping is needed
-        * here since the SPTE is going from non-present
-        * to non-present.
+        * No other thread can overwrite the removed SPTE as they must either
+        * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
+        * overwrite the special removed SPTE value. No bookkeeping is needed
+        * here since the SPTE is going from non-present to non-present.  Use
+        * the raw write helper to avoid an unnecessary check on volatile bits.
          */
-       kvm_tdp_mmu_write_spte(iter->sptep, 0);
+       __kvm_tdp_mmu_write_spte(iter->sptep, 0);
  
         return 0;
  }
@@ -699,10 +718,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
   *                   unless performing certain dirty logging operations.
   *                   Leaving record_dirty_log unset in that case prevents page
   *                   writes from being double counted.
+ *
+ * Returns the old SPTE value, which _may_ be different than @old_spte if the
+ * SPTE had volatile bits.
   */
-static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
-                              u64 old_spte, u64 new_spte, gfn_t gfn, int level,
-                              bool record_acc_track, bool record_dirty_log)
+static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+                             u64 old_spte, u64 new_spte, gfn_t gfn, int level,
+                             bool record_acc_track, bool record_dirty_log)
  {
         lockdep_assert_held_write(&kvm->mmu_lock);
  
@@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
          */
         WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
  
-       kvm_tdp_mmu_write_spte(sptep, new_spte);
+       old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
  
         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
  
@@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
         if (record_dirty_log)
                 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
                                               new_spte, level);
+       return old_spte;
  }
  
  static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
@@ -732,9 +755,10 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
  {
         WARN_ON_ONCE(iter->yielded);
  
-       __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
-                          new_spte, iter->gfn, iter->level,
-                          record_acc_track, record_dirty_log);
+       iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+                                           iter->old_spte, new_spte,
+                                           iter->gfn, iter->level,
+                                           record_acc_track, record_dirty_log);
  }
  
  static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
author	Paolo Bonzini <pbonzini@redhat.com>
	Tue, 3 May 2022 11:23:08 +0000 (07:23 -0400)
committer	Paolo Bonzini <pbonzini@redhat.com>
	Tue, 3 May 2022 11:23:08 +0000 (07:23 -0400)
arch/x86/kvm/mmu/mmu.c		patch \| blob \| history
arch/x86/kvm/mmu/spte.c		patch \| blob \| history
arch/x86/kvm/mmu/spte.h		patch \| blob \| history
arch/x86/kvm/mmu/tdp_iter.h		patch \| blob \| history
arch/x86/kvm/mmu/tdp_mmu.c		patch \| blob \| history