Merge branch 'kvm-tdp-fix-rcu' into HEAD

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c926c6b..fd50008 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -86,7 +86,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
 
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield);
+                         gfn_t start, gfn_t end, bool can_yield, bool flush);
 
 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
 {
@@ -99,7 +99,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
 
        list_del(&root->link);
 
-       zap_gfn_range(kvm, root, 0, max_gfn, false);
+       zap_gfn_range(kvm, root, 0, max_gfn, false, false);
 
        free_page((unsigned long)root->spt);
        kmem_cache_free(mmu_page_header_cache, root);
@@ -137,22 +137,21 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
        return sp;
 }
 
-static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 {
        union kvm_mmu_page_role role;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;
 
-       role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
+       lockdep_assert_held_write(&kvm->mmu_lock);
 
-       write_lock(&kvm->mmu_lock);
+       role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
 
        /* Check for an existing root before allocating a new one. */
        for_each_tdp_mmu_root(kvm, root) {
                if (root->role.word == role.word) {
                        kvm_mmu_get_root(kvm, root);
-                       write_unlock(&kvm->mmu_lock);
-                       return root;
+                       goto out;
                }
        }
 
@@ -161,19 +160,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
 
        list_add(&root->link, &kvm->arch.tdp_mmu_roots);
 
-       write_unlock(&kvm->mmu_lock);
-
-       return root;
-}
-
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu_page *root;
-
-       root = get_tdp_mmu_vcpu_root(vcpu);
-       if (!root)
-               return INVALID_PAGE;
-
+out:
        return __pa(root->spt);
 }
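
With get_tdp_mmu_vcpu_root() folded into kvm_tdp_mmu_get_vcpu_root_hpa(), taking mmu_lock moves to the caller (hence the new lockdep assertion). A minimal sketch of the expected calling pattern, using an illustrative function name rather than the real mmu.c call site:

        /*
         * Sketch only: the caller, not the helper, now takes mmu_lock for
         * write around root lookup/allocation.
         */
        static hpa_t example_load_tdp_root(struct kvm_vcpu *vcpu)
        {
                hpa_t root_hpa;

                write_lock(&vcpu->kvm->mmu_lock);
                root_hpa = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
                write_unlock(&vcpu->kvm->mmu_lock);

                return root_hpa;
        }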
 
@@ -203,20 +190,14 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level,
                                bool shared);
 
-static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
-{
-       return sp->role.smm ? 1 : 0;
-}
-
 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
 {
-       bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
        if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
                return;
 
        if (is_accessed_spte(old_spte) &&
-           (!is_accessed_spte(new_spte) || pfn_changed))
+           (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
+            spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
                kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 }
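
The accessed-tracking condition now also fires when new_spte is not shadow-present. Restated as a hypothetical predicate for clarity (it covers only the final test, after the early return for old SPTEs that are not present leaf SPTEs):

        /* Hypothetical helper restating the condition above. */
        static bool example_accessed_info_lost(u64 old_spte, u64 new_spte)
        {
                return is_accessed_spte(old_spte) &&
                       (!is_shadow_present_pte(new_spte) ||
                        !is_accessed_spte(new_spte) ||
                        spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
        }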
 
@@ -301,11 +282,16 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
  *
  * Given a page table that has been removed from the TDP paging structure,
  * iterates through the page table to clear SPTEs and free child page tables.
+ *
+ * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
+ * protection. Since this thread removed it from the paging structure,
+ * this thread will be responsible for ensuring the page is freed. Hence the
+ * early rcu_dereferences in the function.
  */
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
                                        bool shared)
 {
-       struct kvm_mmu_page *sp = sptep_to_sp(pt);
+       struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
        int level = sp->role.level;
        gfn_t base_gfn = sp->gfn;
        u64 old_child_spte;
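
As the new comment says, pt needs no RCU protection for lifetime purposes; rcu_dereference() is used early only to strip the __rcu annotation (this assumes tdp_ptep_t is the __rcu-annotated SPTE pointer type). A sketch of the same pattern with a hypothetical helper, which, like handle_removed_tdp_mmu_page(), would still run inside the caller's rcu_read_lock():

        /*
         * Sketch: the thread that removed this page table owns it, so the
         * annotation can be stripped once and the page walked via a plain
         * pointer.
         */
        static int example_count_present(tdp_ptep_t pt)
        {
                u64 *raw = rcu_dereference(pt);
                int i, present = 0;

                for (i = 0; i < PT64_ENT_PER_PAGE; i++)
                        present += is_shadow_present_pte(READ_ONCE(raw[i]));

                return present;
        }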
@@ -318,7 +304,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
        tdp_mmu_unlink_page(kvm, sp, shared);
 
        for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-               sptep = pt + i;
+               sptep = rcu_dereference(pt) + i;
                gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
 
                if (shared) {
@@ -337,7 +323,18 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
                                cpu_relax();
                        }
                } else {
+                       /*
+                        * If the SPTE is not MMU-present, there is no backing
+                        * page associated with the SPTE and so no side effects
+                        * that need to be recorded, and exclusive ownership of
+                        * mmu_lock ensures the SPTE can't be made present.
+                        * Note, zapping MMIO SPTEs is also unnecessary as they
+                        * are guarded by the memslots generation, not by being
+                        * unreachable.
+                        */
                        old_child_spte = READ_ONCE(*sptep);
+                       if (!is_shadow_present_pte(old_child_spte))
+                               continue;
 
                        /*
                         * Marking the SPTE as a removed SPTE is not
@@ -444,7 +441,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 
 
        if (was_leaf && is_dirty_spte(old_spte) &&
-           (!is_dirty_spte(new_spte) || pfn_changed))
+           (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 
        /*
@@ -481,25 +478,21 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
                                           struct tdp_iter *iter,
                                           u64 new_spte)
 {
-       u64 *root_pt = tdp_iter_root_pt(iter);
-       struct kvm_mmu_page *root = sptep_to_sp(root_pt);
-       int as_id = kvm_mmu_page_as_id(root);
-
        lockdep_assert_held_read(&kvm->mmu_lock);
 
        /*
         * Do not change removed SPTEs. Only the thread that froze the SPTE
         * may modify it.
         */
-       if (iter->old_spte == REMOVED_SPTE)
+       if (is_removed_spte(iter->old_spte))
                return false;
 
        if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
                      new_spte) != iter->old_spte)
                return false;
 
-       handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
-                           iter->level, true);
+       handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+                           new_spte, iter->level, true);
 
        return true;
 }
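
A false return from tdp_mmu_set_spte_atomic() means either the SPTE is frozen as a removed SPTE or the cmpxchg lost a race, so a caller holding mmu_lock for read must re-read the SPTE before trying again (or give up and let the operation be retried). A sketch of that pattern with a hypothetical bounded-retry helper:

        /*
         * Hypothetical helper: retry a lock-free SPTE update a few times,
         * refreshing iter->old_spte after each failed attempt.  Requires
         * mmu_lock held for read, like tdp_mmu_set_spte_atomic() itself.
         */
        static bool example_set_spte_retry(struct kvm *kvm, struct tdp_iter *iter,
                                           u64 new_spte, int max_tries)
        {
                while (max_tries--) {
                        if (tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
                                return true;

                        iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
                }

                return false;
        }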
@@ -527,7 +520,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
         * here since the SPTE is going from non-present
         * to non-present.
         */
-       WRITE_ONCE(*iter->sptep, 0);
+       WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
 
        return true;
 }
@@ -553,10 +546,6 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                      u64 new_spte, bool record_acc_track,
                                      bool record_dirty_log)
 {
-       tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
-       struct kvm_mmu_page *root = sptep_to_sp(root_pt);
-       int as_id = kvm_mmu_page_as_id(root);
-
        lockdep_assert_held_write(&kvm->mmu_lock);
 
        /*
@@ -566,17 +555,17 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
         * should be used. If operating under the MMU lock in write mode, the
         * use of the removed SPTE should not be necessary.
         */
-       WARN_ON(iter->old_spte == REMOVED_SPTE);
+       WARN_ON(is_removed_spte(iter->old_spte));
 
        WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
-       __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
-                             iter->level, false);
+       __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+                             new_spte, iter->level, false);
        if (record_acc_track)
                handle_changed_spte_acc_track(iter->old_spte, new_spte,
                                              iter->level);
        if (record_dirty_log)
-               handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
+               handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
                                              iter->old_spte, new_spte,
                                              iter->level);
 }
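
Both paths now take the address-space id from iter->as_id instead of re-deriving it from the root via sptep_to_sp(), which is why kvm_mmu_page_as_id() could be deleted above. A sketch of what the iterator side presumably does, assuming struct tdp_iter gained an as_id field set when the walk starts (the actual change is in tdp_iter.c/tdp_iter.h):

        /* Sketch only; mirrors the deleted kvm_mmu_page_as_id() logic. */
        static void example_tdp_iter_set_as_id(struct tdp_iter *iter, u64 *root_pt)
        {
                struct kvm_mmu_page *root = sptep_to_sp(root_pt);

                iter->as_id = root->role.smm ? 1 : 0;
        }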
@@ -648,9 +637,7 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
 
                WARN_ON(iter->gfn > iter->next_last_level_gfn);
 
-               tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
-                              iter->root_level, iter->min_level,
-                              iter->next_last_level_gfn);
+               tdp_iter_restart(iter);
 
                return true;
        }
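
tdp_iter_restart() replaces the open-coded tdp_iter_start() call. Judging from the deleted lines, it is roughly equivalent to the following (the real helper lives in tdp_iter.c and may cache more state):

        /* Rough equivalent of the deleted lines, for reference. */
        static void example_tdp_iter_restart(struct tdp_iter *iter)
        {
                tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
                               iter->root_level, iter->min_level,
                               iter->next_last_level_gfn);
        }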
@@ -667,20 +654,21 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
  * scheduler needs the CPU or there is contention on the MMU lock. If this
  * function cannot yield, it will not release the MMU lock or reschedule and
  * the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup.
+ * operation can cause a soft lockup.  Note, in some use cases a flush may be
+ * required by prior actions.  Ensure the pending flush is performed prior to
+ * yielding.
  */
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield)
+                         gfn_t start, gfn_t end, bool can_yield, bool flush)
 {
        struct tdp_iter iter;
-       bool flush_needed = false;
 
        rcu_read_lock();
 
        tdp_root_for_each_pte(iter, root, start, end) {
                if (can_yield &&
-                   tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
-                       flush_needed = false;
+                   tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+                       flush = false;
                        continue;
                }
 
@@ -698,11 +686,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                        continue;
 
                tdp_mmu_set_spte(kvm, &iter, 0);
-               flush_needed = true;
+               flush = true;
        }
 
        rcu_read_unlock();
-       return flush_needed;
+       return flush;
 }
 
 /*
@@ -711,13 +699,14 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
  */
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
+                                bool can_yield)
 {
        struct kvm_mmu_page *root;
        bool flush = false;
 
        for_each_tdp_mmu_root_yield_safe(kvm, root)
-               flush |= zap_gfn_range(kvm, root, start, end, true);
+               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
 
        return flush;
 }
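
Threading flush through zap_gfn_range() ensures a pending flush inherited from an earlier root is performed before the walk yields. The new double-underscore name suggests a thin wrapper preserves the old external API; presumably something like this lives in tdp_mmu.h:

        /* Presumed wrapper keeping the old single-range API. */
        static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start,
                                                     gfn_t end)
        {
                return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true);
        }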
@@ -774,12 +763,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
                trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
                                     new_spte);
                ret = RET_PF_EMULATE;
-       } else
+       } else {
                trace_kvm_mmu_set_spte(iter->level, iter->gfn,
                                       rcu_dereference(iter->sptep));
+       }
 
-       trace_kvm_mmu_set_spte(iter->level, iter->gfn,
-                              rcu_dereference(iter->sptep));
        if (!prefault)
                vcpu->stat.pf_fixed++;
 
@@ -879,17 +867,15 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        return ret;
 }
 
-static __always_inline int
-kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
-                            unsigned long start,
-                            unsigned long end,
-                            unsigned long data,
-                            int (*handler)(struct kvm *kvm,
-                                           struct kvm_memory_slot *slot,
-                                           struct kvm_mmu_page *root,
-                                           gfn_t start,
-                                           gfn_t end,
-                                           unsigned long data))
+typedef int (*tdp_handler_t)(struct kvm *kvm, struct kvm_memory_slot *slot,
+                            struct kvm_mmu_page *root, gfn_t start, gfn_t end,
+                            unsigned long data);
+
+static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
+                                                       unsigned long start,
+                                                       unsigned long end,
+                                                       unsigned long data,
+                                                       tdp_handler_t handler)
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
@@ -924,12 +910,20 @@ kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
        return ret;
 }
 
+static __always_inline int kvm_tdp_mmu_handle_hva(struct kvm *kvm,
+                                                 unsigned long addr,
+                                                 unsigned long data,
+                                                 tdp_handler_t handler)
+{
+       return kvm_tdp_mmu_handle_hva_range(kvm, addr, addr + 1, data, handler);
+}
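
With the handler signature factored out into tdp_handler_t, single-address notifiers reduce to one-line calls through kvm_tdp_mmu_handle_hva(). A purely illustrative handler with the expected signature (the real handlers are zap_gfn_range_hva_wrapper below and test_age_gfn/set_tdp_spte further down); it would be invoked as kvm_tdp_mmu_handle_hva(kvm, hva, 0, example_has_leaf_spte):

        /* Illustrative handler: reports whether any leaf SPTE maps the range. */
        static int example_has_leaf_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
                                         struct kvm_mmu_page *root, gfn_t start,
                                         gfn_t end, unsigned long unused)
        {
                struct tdp_iter iter;

                tdp_root_for_each_leaf_pte(iter, root, start, end)
                        return 1;

                return 0;
        }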
+
 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
                                     struct kvm_memory_slot *slot,
                                     struct kvm_mmu_page *root, gfn_t start,
                                     gfn_t end, unsigned long unused)
 {
-       return zap_gfn_range(kvm, root, start, end, false);
+       return zap_gfn_range(kvm, root, start, end, false, false);
 }
 
 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
@@ -997,12 +991,12 @@ int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
 }
 
 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
-                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
-                       unsigned long unused2)
+                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
+                       unsigned long unused)
 {
        struct tdp_iter iter;
 
-       tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
+       tdp_root_for_each_leaf_pte(iter, root, gfn, end)
                if (is_accessed_spte(iter.old_spte))
                        return 1;
 
@@ -1011,8 +1005,7 @@ static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
-                                           test_age_gfn);
+       return kvm_tdp_mmu_handle_hva(kvm, hva, 0, test_age_gfn);
 }
 
 /*
@@ -1022,7 +1015,7 @@ int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
  * Returns non-zero if a flush is needed before releasing the MMU lock.
  */
 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
-                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
+                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
                        unsigned long data)
 {
        struct tdp_iter iter;
@@ -1033,7 +1026,7 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
 
        rcu_read_lock();
 
-       WARN_ON(pte_huge(*ptep));
+       WARN_ON(pte_huge(*ptep) || (gfn + 1) != end);
 
        new_pfn = pte_pfn(*ptep);
 
@@ -1044,10 +1037,14 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
                if (!is_shadow_present_pte(iter.old_spte))
                        break;
 
+               /*
+                * Note, when changing a read-only SPTE, it's not strictly
+                * necessary to zero the SPTE before setting the new PFN, but
+                * doing so preserves the invariant that the PFN of a present
+                * leaf SPTE can never change.  See __handle_changed_spte().
+                */
                tdp_mmu_set_spte(kvm, &iter, 0);
 
-               kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
-
                if (!pte_write(*ptep)) {
                        new_spte = kvm_mmu_changed_pte_notifier_make_spte(
                                        iter.old_spte, new_pfn);
@@ -1069,9 +1066,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
                             pte_t *host_ptep)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
-                                           (unsigned long)host_ptep,
-                                           set_tdp_spte);
+       return kvm_tdp_mmu_handle_hva(kvm, address, (unsigned long)host_ptep,
+                                     set_tdp_spte);
 }
 
 /*
@@ -1331,7 +1327,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
 
 /*
  * Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
  * Returns true if an SPTE was set and a TLB flush is needed.
  */
 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
@@ -1348,7 +1344,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
                        break;
 
                new_spte = iter.old_spte &
-                       ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+                       ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
 
                tdp_mmu_set_spte(kvm, &iter, new_spte);
                spte_set = true;
@@ -1361,7 +1357,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
 
 /*
  * Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
  * Returns true if an SPTE was set and a TLB flush is needed.
  */
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,