arch/x86/kvm/mmu/tdp_mmu.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19         if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20                 return;
21
22         /* This should not be changed for the lifetime of the VM. */
23         kvm->arch.tdp_mmu_enabled = true;
24
25         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26         spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 }
29
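/*
 * Assert that the MMU lock is held for read if @shared is true, or held for
 * write if @shared is false.
 */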
30 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
31                                                              bool shared)
32 {
33         if (shared)
34                 lockdep_assert_held_read(&kvm->mmu_lock);
35         else
36                 lockdep_assert_held_write(&kvm->mmu_lock);
37 }
38
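/*
 * Tears down TDP MMU state when the VM is destroyed, if the TDP MMU was in
 * use for this VM.
 */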
39 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
40 {
41         if (!kvm->arch.tdp_mmu_enabled)
42                 return;
43
44         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
45
46         /*
47          * Ensure that all the outstanding RCU callbacks to free shadow pages
48          * can run before the VM is torn down.
49          */
50         rcu_barrier();
51 }
52
53 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
54                           gfn_t start, gfn_t end, bool can_yield, bool flush,
55                           bool shared);
56
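/* Free the page table page and the struct kvm_mmu_page tracking it. */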
57 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
58 {
59         free_page((unsigned long)sp->spt);
60         kmem_cache_free(mmu_page_header_cache, sp);
61 }
62
63 /*
64  * This is called through call_rcu in order to free TDP page table memory
65  * safely with respect to other kernel threads that may be operating on
66  * the memory.
67  * By only accessing TDP MMU page table memory in an RCU read-side critical
68  * section, and freeing it only after an RCU grace period has elapsed,
69  * lockless walkers are guaranteed never to use the memory after it is freed.
70  */
71 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
72 {
73         struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
74                                                rcu_head);
75
76         tdp_mmu_free_sp(sp);
77 }
78
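/*
 * Drop a reference to a TDP MMU root. When the last reference is put, the
 * root is removed from the list of roots, its paging structure is zapped,
 * and the root page is freed after an RCU grace period.
 */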
79 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
80                           bool shared)
81 {
82         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
83
84         kvm_lockdep_assert_mmu_lock_held(kvm, shared);
85
86         if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
87                 return;
88
89         WARN_ON(!root->tdp_mmu_page);
90
91         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
92         list_del_rcu(&root->link);
93         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
94
95         zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
96
97         call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
98 }
99
100 /*
101  * Finds the next valid root after @prev_root (or the first valid root if
102  * @prev_root is NULL), takes a reference on it, and returns that next root.
103  * If @prev_root is not NULL, this thread should have already taken a
104  * reference on it, and that reference will be dropped. If no valid root is
105  * found, this function will return NULL.
106  */
107 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
108                                               struct kvm_mmu_page *prev_root,
109                                               bool shared)
110 {
111         struct kvm_mmu_page *next_root;
112
113         rcu_read_lock();
114
115         if (prev_root)
116                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
117                                                   &prev_root->link,
118                                                   typeof(*prev_root), link);
119         else
120                 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
121                                                    typeof(*next_root), link);
122
123         while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
124                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
125                                 &next_root->link, typeof(*next_root), link);
126
127         rcu_read_unlock();
128
129         if (prev_root)
130                 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
131
132         return next_root;
133 }
134
135 /*
136  * Note: this iterator gets and puts references to the roots it iterates over.
137  * This makes it safe to release the MMU lock and yield within the loop, but
138  * if exiting the loop early, the caller must drop the reference to the most
139  * recent root. (Unless keeping a live reference is desirable.)
140  *
141  * If shared is set, this function is operating under the MMU lock in read
142  * mode. In the unlikely event that this thread must free a root, the lock
143  * will be temporarily dropped and reacquired in write mode.
144  */
145 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)  \
146         for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);            \
147              _root;                                                     \
148              _root = tdp_mmu_next_root(_kvm, _root, _shared))           \
149                 if (kvm_mmu_page_as_id(_root) != _as_id) {              \
150                 } else
151
152 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)                              \
153         list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,         \
154                                 lockdep_is_held_type(&_kvm->mmu_lock, 0) ||     \
155                                 lockdep_is_held(&_kvm->arch.tdp_mmu_pages_lock)) \
156                 if (kvm_mmu_page_as_id(_root) != _as_id) {              \
157                 } else
158
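/* Compute the page role for a direct-mapped TDP MMU page table at @level. */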
159 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
160                                                    int level)
161 {
162         union kvm_mmu_page_role role;
163
164         role = vcpu->arch.mmu->mmu_role.base;
165         role.level = level;
166         role.direct = true;
167         role.gpte_is_8_bytes = true;
168         role.access = ACC_ALL;
169
170         return role;
171 }
172
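/*
 * Allocate and initialize a new TDP MMU page table page, backed by the
 * vCPU's memory caches, mapping @gfn at @level.
 */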
173 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
174                                                int level)
175 {
176         struct kvm_mmu_page *sp;
177
178         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
179         sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
180         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
181
182         sp->role.word = page_role_for_level(vcpu, level).word;
183         sp->gfn = gfn;
184         sp->tdp_mmu_page = true;
185
186         trace_kvm_mmu_get_page(sp, true);
187
188         return sp;
189 }
190
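/*
 * Return the physical address of the vCPU's TDP MMU root, reusing an
 * existing root with a matching role if one can be found, otherwise
 * allocating and registering a new one.
 */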
191 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
192 {
193         union kvm_mmu_page_role role;
194         struct kvm *kvm = vcpu->kvm;
195         struct kvm_mmu_page *root;
196
197         lockdep_assert_held_write(&kvm->mmu_lock);
198
199         role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
200
201         /* Check for an existing root before allocating a new one. */
202         for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
203                 if (root->role.word == role.word &&
204                     kvm_tdp_mmu_get_root(kvm, root))
205                         goto out;
206         }
207
208         root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
209         refcount_set(&root->tdp_mmu_root_count, 1);
210
211         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
212         list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
213         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
214
215 out:
216         return __pa(root->spt);
217 }
218
219 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
220                                 u64 old_spte, u64 new_spte, int level,
221                                 bool shared);
222
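/*
 * Propagate the loss of the accessed state of a leaf SPTE to the primary MM
 * by marking the old PFN accessed.
 */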
223 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
224 {
225         if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
226                 return;
227
228         if (is_accessed_spte(old_spte) &&
229             (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
230              spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
231                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
232 }
233
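/*
 * Mark the GFN dirty in its memslot's dirty bitmap when a 4K SPTE becomes
 * writable, or is changed to map a different, writable PFN.
 */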
234 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
235                                           u64 old_spte, u64 new_spte, int level)
236 {
237         bool pfn_changed;
238         struct kvm_memory_slot *slot;
239
240         if (level > PG_LEVEL_4K)
241                 return;
242
243         pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
244
245         if ((!is_writable_pte(old_spte) || pfn_changed) &&
246             is_writable_pte(new_spte)) {
247                 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
248                 mark_page_dirty_in_slot(kvm, slot, gfn);
249         }
250 }
251
252 /**
253  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
254  *
255  * @kvm: kvm instance
256  * @sp: the new page
257  * @shared: This operation may not be running under the exclusive use of
258  *          the MMU lock and the operation must synchronize with other
259  *          threads that might be adding or removing pages.
260  * @account_nx: This page replaces a NX large page and should be marked for
261  *              eventual reclaim.
262  */
263 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
264                               bool shared, bool account_nx)
265 {
266         if (shared)
267                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
268         else
269                 lockdep_assert_held_write(&kvm->mmu_lock);
270
271         list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
272         if (account_nx)
273                 account_huge_nx_page(kvm, sp);
274
275         if (shared)
276                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
277 }
278
279 /**
280  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
281  *
282  * @kvm: kvm instance
283  * @sp: the page to be removed
284  * @shared: This operation may not be running under the exclusive use of
285  *          the MMU lock and the operation must synchronize with other
286  *          threads that might be adding or removing pages.
287  */
288 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
289                                 bool shared)
290 {
291         if (shared)
292                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
293         else
294                 lockdep_assert_held_write(&kvm->mmu_lock);
295
296         list_del(&sp->link);
297         if (sp->lpage_disallowed)
298                 unaccount_huge_nx_page(kvm, sp);
299
300         if (shared)
301                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
302 }
303
304 /**
305  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
306  *
307  * @kvm: kvm instance
308  * @pt: the page removed from the paging structure
309  * @shared: This operation may not be running under the exclusive use
310  *          of the MMU lock and the operation must synchronize with other
311  *          threads that might be modifying SPTEs.
312  *
313  * Given a page table that has been removed from the TDP paging structure,
314  * iterates through the page table to clear SPTEs and free child page tables.
315  *
316  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
317  * protection. Since this thread removed it from the paging structure,
318  * this thread will be responsible for ensuring the page is freed. Hence the
319  * early rcu_dereferences in the function.
320  */
321 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
322                                         bool shared)
323 {
324         struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
325         int level = sp->role.level;
326         gfn_t base_gfn = sp->gfn;
327         u64 old_child_spte;
328         u64 *sptep;
329         gfn_t gfn;
330         int i;
331
332         trace_kvm_mmu_prepare_zap_page(sp);
333
334         tdp_mmu_unlink_page(kvm, sp, shared);
335
336         for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
337                 sptep = rcu_dereference(pt) + i;
338                 gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
339
340                 if (shared) {
341                         /*
342                          * Set the SPTE to a nonpresent value that other
343                          * threads will not overwrite. If the SPTE was
344                          * already marked as removed, another thread handling
345                          * a page fault could overwrite it, so keep retrying
346                          * the exchange until the value read back is something
347                          * other than the removed SPTE value.
348                          */
349                         for (;;) {
350                                 old_child_spte = xchg(sptep, REMOVED_SPTE);
351                                 if (!is_removed_spte(old_child_spte))
352                                         break;
353                                 cpu_relax();
354                         }
355                 } else {
356                         /*
357                          * If the SPTE is not MMU-present, there is no backing
358                          * page associated with the SPTE and so no side effects
359                          * that need to be recorded, and exclusive ownership of
360                          * mmu_lock ensures the SPTE can't be made present.
361                          * Note, zapping MMIO SPTEs is also unnecessary as they
362                          * are guarded by the memslots generation, not by being
363                          * unreachable.
364                          */
365                         old_child_spte = READ_ONCE(*sptep);
366                         if (!is_shadow_present_pte(old_child_spte))
367                                 continue;
368
369                         /*
370                          * Marking the SPTE as a removed SPTE is not
371                          * strictly necessary here as the MMU lock will
372                          * stop other threads from concurrently modifying
373                          * this SPTE. Using the removed SPTE value keeps
374                          * the two branches consistent and simplifies
375                          * the function.
376                          */
377                         WRITE_ONCE(*sptep, REMOVED_SPTE);
378                 }
379                 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
380                                     old_child_spte, REMOVED_SPTE, level - 1,
381                                     shared);
382         }
383
384         kvm_flush_remote_tlbs_with_address(kvm, gfn,
385                                            KVM_PAGES_PER_HPAGE(level));
386
387         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
388 }
389
390 /**
391  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
392  * @kvm: kvm instance
393  * @as_id: the address space of the paging structure the SPTE was a part of
394  * @gfn: the base GFN that was mapped by the SPTE
395  * @old_spte: The value of the SPTE before the change
396  * @new_spte: The value of the SPTE after the change
397  * @level: the level of the PT the SPTE is part of in the paging structure
398  * @shared: This operation may not be running under the exclusive use of
399  *          the MMU lock and the operation must synchronize with other
400  *          threads that might be modifying SPTEs.
401  *
402  * Handle bookkeeping that might result from the modification of a SPTE.
403  * This function must be called for all TDP SPTE modifications.
404  */
405 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
406                                   u64 old_spte, u64 new_spte, int level,
407                                   bool shared)
408 {
409         bool was_present = is_shadow_present_pte(old_spte);
410         bool is_present = is_shadow_present_pte(new_spte);
411         bool was_leaf = was_present && is_last_spte(old_spte, level);
412         bool is_leaf = is_present && is_last_spte(new_spte, level);
413         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
414
415         WARN_ON(level > PT64_ROOT_MAX_LEVEL);
416         WARN_ON(level < PG_LEVEL_4K);
417         WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
418
419         /*
420          * If this warning were to trigger it would indicate that there was a
421          * missing MMU notifier or a race with some notifier handler.
422          * A present, leaf SPTE should never be directly replaced with another
423          * present leaf SPTE pointing to a different PFN. A notifier handler
424          * should be zapping the SPTE before the main MM's page table is
425          * changed, or the SPTE should be zeroed, and the TLBs flushed by the
426          * thread before replacement.
427          */
428         if (was_leaf && is_leaf && pfn_changed) {
429                 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
430                        "SPTE with another present leaf SPTE mapping a\n"
431                        "different PFN!\n"
432                        "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
433                        as_id, gfn, old_spte, new_spte, level);
434
435                 /*
436                  * Crash the host to prevent error propagation and guest data
437                  * corruption.
438                  */
439                 BUG();
440         }
441
442         if (old_spte == new_spte)
443                 return;
444
445         trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
446
447         if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
448                 if (is_large_pte(old_spte))
449                         atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
450                 else
451                         atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
452         }
453
454         /*
455          * The only time a SPTE should change from one non-present state to
456          * another is when an MMIO SPTE is installed, modified, or removed.
457          * In that case, there is nothing to do here.
458          */
459         if (!was_present && !is_present) {
460                 /*
461                  * If this change does not involve a MMIO SPTE or removed SPTE,
462                  * it is unexpected. Log the change, though it should not
463                  * impact the guest since both the former and current SPTEs
464                  * are nonpresent.
465                  */
466                 if (WARN_ON(!is_mmio_spte(old_spte) &&
467                             !is_mmio_spte(new_spte) &&
468                             !is_removed_spte(new_spte)))
469                         pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
470                                "should not be replaced with another,\n"
471                                "different nonpresent SPTE, unless one or both\n"
472                                "are MMIO SPTEs, or the new SPTE is\n"
473                                "a temporary removed SPTE.\n"
474                                "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
475                                as_id, gfn, old_spte, new_spte, level);
476                 return;
477         }
478
479
480         if (was_leaf && is_dirty_spte(old_spte) &&
481             (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
482                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
483
484         /*
485          * Recursively handle child PTs if the change removed a subtree from
486          * the paging structure.
487          */
488         if (was_present && !was_leaf && (pfn_changed || !is_present))
489                 handle_removed_tdp_mmu_page(kvm,
490                                 spte_to_child_pt(old_spte, level), shared);
491 }
492
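/*
 * Wrapper around __handle_changed_spte that also handles the accessed and
 * dirty logging side effects of the change.
 */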
493 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
494                                 u64 old_spte, u64 new_spte, int level,
495                                 bool shared)
496 {
497         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
498                               shared);
499         handle_changed_spte_acc_track(old_spte, new_spte, level);
500         handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
501                                       new_spte, level);
502 }
503
504 /*
505  * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
506  * and handle the associated bookkeeping, but do not mark the page dirty
507  * in KVM's dirty bitmaps.
508  *
509  * @kvm: kvm instance
510  * @iter: a tdp_iter instance currently on the SPTE that should be set
511  * @new_spte: The value the SPTE should be set to
512  * Returns: true if the SPTE was set, false if it was not. If false is returned,
513  *          this function will have no side-effects.
514  */
515 static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
516                                                         struct tdp_iter *iter,
517                                                         u64 new_spte)
518 {
519         lockdep_assert_held_read(&kvm->mmu_lock);
520
521         /*
522          * Do not change removed SPTEs. Only the thread that froze the SPTE
523          * may modify it.
524          */
525         if (is_removed_spte(iter->old_spte))
526                 return false;
527
528         if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
529                       new_spte) != iter->old_spte)
530                 return false;
531
532         __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
533                               new_spte, iter->level, true);
534         handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
535
536         return true;
537 }
538
539 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
540                                            struct tdp_iter *iter,
541                                            u64 new_spte)
542 {
543         if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
544                 return false;
545
546         handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
547                                       iter->old_spte, new_spte, iter->level);
548         return true;
549 }
550
551 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
552                                            struct tdp_iter *iter)
553 {
554         /*
555          * Freeze the SPTE by setting it to a special,
556          * non-present value. This will stop other threads from
557          * immediately installing a present entry in its place
558          * before the TLBs are flushed.
559          */
560         if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
561                 return false;
562
563         kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
564                                            KVM_PAGES_PER_HPAGE(iter->level));
565
566         /*
567          * No other thread can overwrite the removed SPTE as they
568          * must either wait on the MMU lock or use
569          * tdp_mmu_set_spte_atomic which will not overwrite the
570          * special removed SPTE value. No bookkeeping is needed
571          * here since the SPTE is going from non-present
572          * to non-present.
573          */
574         WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
575
576         return true;
577 }
578
579
580 /*
581  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
582  * @kvm: kvm instance
583  * @iter: a tdp_iter instance currently on the SPTE that should be set
584  * @new_spte: The value the SPTE should be set to
585  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
586  *                    of the page. Should be set unless handling an MMU
587  *                    notifier for access tracking. Leaving record_acc_track
588  *                    unset in that case prevents page accesses from being
589  *                    double counted.
590  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
591  *                    appropriate for the change being made. Should be set
592  *                    unless performing certain dirty logging operations.
593  *                    Leaving record_dirty_log unset in that case prevents page
594  *                    writes from being double counted.
595  */
596 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
597                                       u64 new_spte, bool record_acc_track,
598                                       bool record_dirty_log)
599 {
600         lockdep_assert_held_write(&kvm->mmu_lock);
601
602         /*
603          * No thread should be using this function to set SPTEs to the
604          * temporary removed SPTE value.
605          * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
606          * should be used. If operating under the MMU lock in write mode, the
607          * use of the removed SPTE should not be necessary.
608          */
609         WARN_ON(is_removed_spte(iter->old_spte));
610
611         WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
612
613         __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
614                               new_spte, iter->level, false);
615         if (record_acc_track)
616                 handle_changed_spte_acc_track(iter->old_spte, new_spte,
617                                               iter->level);
618         if (record_dirty_log)
619                 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
620                                               iter->old_spte, new_spte,
621                                               iter->level);
622 }
623
624 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
625                                     u64 new_spte)
626 {
627         __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
628 }
629
630 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
631                                                  struct tdp_iter *iter,
632                                                  u64 new_spte)
633 {
634         __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
635 }
636
637 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
638                                                  struct tdp_iter *iter,
639                                                  u64 new_spte)
640 {
641         __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
642 }
643
644 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
645         for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
646
647 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)  \
648         tdp_root_for_each_pte(_iter, _root, _start, _end)               \
649                 if (!is_shadow_present_pte(_iter.old_spte) ||           \
650                     !is_last_spte(_iter.old_spte, _iter.level))         \
651                         continue;                                       \
652                 else
653
654 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
655         for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
656                          _mmu->shadow_root_level, _start, _end)
657
658 /*
659  * Yield if the MMU lock is contended or this thread needs to return control
660  * to the scheduler.
661  *
662  * If this function should yield and flush is set, it will perform a remote
663  * TLB flush before yielding.
664  *
665  * If this function yields, it will also reset the tdp_iter's walk over the
666  * paging structure and the calling function should skip to the next
667  * iteration to allow the iterator to continue its traversal from the
668  * paging structure root.
669  *
670  * Return true if this function yielded and the iterator's traversal was reset.
671  * Return false if a yield was not needed.
672  */
673 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
674                                              struct tdp_iter *iter, bool flush,
675                                              bool shared)
676 {
677         /* Ensure forward progress has been made before yielding. */
678         if (iter->next_last_level_gfn == iter->yielded_gfn)
679                 return false;
680
681         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
682                 rcu_read_unlock();
683
684                 if (flush)
685                         kvm_flush_remote_tlbs(kvm);
686
687                 if (shared)
688                         cond_resched_rwlock_read(&kvm->mmu_lock);
689                 else
690                         cond_resched_rwlock_write(&kvm->mmu_lock);
691
692                 rcu_read_lock();
693
694                 WARN_ON(iter->gfn > iter->next_last_level_gfn);
695
696                 tdp_iter_restart(iter);
697
698                 return true;
699         }
700
701         return false;
702 }
703
704 /*
705  * Tears down the mappings for the range of gfns, [start, end), and frees the
706  * non-root pages mapping GFNs strictly within that range. Returns true if
707  * SPTEs have been cleared and a TLB flush is needed before releasing the
708  * MMU lock.
709  *
710  * If can_yield is true, will release the MMU lock and reschedule if the
711  * scheduler needs the CPU or there is contention on the MMU lock. If this
712  * function cannot yield, it will not release the MMU lock or reschedule and
713  * the caller must ensure it does not supply too large a GFN range, or the
714  * operation can cause a soft lockup.
715  *
716  * If shared is true, this thread holds the MMU lock in read mode and must
717  * account for the possibility that other threads are modifying the paging
718  * structures concurrently. If shared is false, this thread should hold the
719  * MMU lock in write mode.
720  */
721 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
722                           gfn_t start, gfn_t end, bool can_yield, bool flush,
723                           bool shared)
724 {
725         struct tdp_iter iter;
726
727         kvm_lockdep_assert_mmu_lock_held(kvm, shared);
728
729         rcu_read_lock();
730
731         tdp_root_for_each_pte(iter, root, start, end) {
732 retry:
733                 if (can_yield &&
734                     tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
735                         flush = false;
736                         continue;
737                 }
738
739                 if (!is_shadow_present_pte(iter.old_spte))
740                         continue;
741
742                 /*
743                  * If this is a non-last-level SPTE that covers a larger range
744                  * than should be zapped, continue, and zap the mappings at a
745                  * lower level.
746                  */
747                 if ((iter.gfn < start ||
748                      iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
749                     !is_last_spte(iter.old_spte, iter.level))
750                         continue;
751
752                 if (!shared) {
753                         tdp_mmu_set_spte(kvm, &iter, 0);
754                         flush = true;
755                 } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
756                         /*
757                          * The iter must explicitly re-read the SPTE because
758                          * the atomic cmpxchg failed.
759                          */
760                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
761                         goto retry;
762                 }
763         }
764
765         rcu_read_unlock();
766         return flush;
767 }
768
769 /*
770  * Tears down the mappings for the range of gfns, [start, end), and frees the
771  * non-root pages mapping GFNs strictly within that range. Returns true if
772  * SPTEs have been cleared and a TLB flush is needed before releasing the
773  * MMU lock.
774  *
775  * If shared is true, this thread holds the MMU lock in read mode and must
776  * account for the possibility that other threads are modifying the paging
777  * structures concurrently. If shared is false, this thread should hold the
778  * MMU in write mode.
779  */
780 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
781                                  gfn_t end, bool can_yield, bool flush,
782                                  bool shared)
783 {
784         struct kvm_mmu_page *root;
785
786         for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
787                 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
788                                       shared);
789
790         return flush;
791 }
792
793 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
794 {
795         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
796         bool flush = false;
797         int i;
798
799         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
800                 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
801                                                   flush, false);
802
803         if (flush)
804                 kvm_flush_remote_tlbs(kvm);
805 }
806
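/*
 * Find the next invalidated root after @prev_root (or the first one if
 * @prev_root is NULL) that still has a non-zero reference count and thus
 * still needs its paging structure torn down.
 */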
807 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
808                                                   struct kvm_mmu_page *prev_root)
809 {
810         struct kvm_mmu_page *next_root;
811
812         if (prev_root)
813                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
814                                                   &prev_root->link,
815                                                   typeof(*prev_root), link);
816         else
817                 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
818                                                    typeof(*next_root), link);
819
820         while (next_root && !(next_root->role.invalid &&
821                               refcount_read(&next_root->tdp_mmu_root_count)))
822                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
823                                                   &next_root->link,
824                                                   typeof(*next_root), link);
825
826         return next_root;
827 }
828
829 /*
830  * Since kvm_tdp_mmu_invalidate_all_roots has acquired a reference to each
831  * invalidated root, they will not be freed until this function drops the
832  * reference. Before dropping that reference, tear down the paging
833  * structure so that whichever thread does drop the last reference
834  * only has to do a trivial amount of work. Since the roots are invalid,
835  * no new SPTEs should be created under them.
836  */
837 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
838 {
839         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
840         struct kvm_mmu_page *next_root;
841         struct kvm_mmu_page *root;
842         bool flush = false;
843
844         lockdep_assert_held_read(&kvm->mmu_lock);
845
846         rcu_read_lock();
847
848         root = next_invalidated_root(kvm, NULL);
849
850         while (root) {
851                 next_root = next_invalidated_root(kvm, root);
852
853                 rcu_read_unlock();
854
855                 flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
856                                       true);
857
858                 /*
859                  * Put the reference acquired in
860                  * kvm_tdp_mmu_invalidate_all_roots.
861                  */
862                 kvm_tdp_mmu_put_root(kvm, root, true);
863
864                 root = next_root;
865
866                 rcu_read_lock();
867         }
868
869         rcu_read_unlock();
870
871         if (flush)
872                 kvm_flush_remote_tlbs(kvm);
873 }
874
875 /*
876  * Mark each TDP MMU root as invalid so that other threads
877  * will drop their references and allow the root count to
878  * go to 0.
879  *
880  * Also take a reference on all roots so that this thread
881  * can do the bulk of the work required to free the roots
882  * once they are invalidated. Without this reference, a
883  * vCPU thread might drop the last reference to a root and
884  * get stuck with tearing down the entire paging structure.
885  *
886  * Roots which have a zero refcount should be skipped as
887  * they're already being torn down.
888  * Already invalid roots should be referenced again so that
889  * they aren't freed before kvm_tdp_mmu_zap_invalidated_roots
890  * is done with them.
891  *
892  * This has essentially the same effect for the TDP MMU
893  * as updating mmu_valid_gen does for the shadow MMU.
894  */
895 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
896 {
897         struct kvm_mmu_page *root;
898
899         lockdep_assert_held_write(&kvm->mmu_lock);
900         list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
901                 if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
902                         root->role.invalid = true;
903 }
904
905 /*
906  * Installs a last-level SPTE to handle a TDP page fault.
907  * (NPT/EPT violation/misconfiguration)
908  */
909 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
910                                           int map_writable,
911                                           struct tdp_iter *iter,
912                                           kvm_pfn_t pfn, bool prefault)
913 {
914         u64 new_spte;
915         int ret = 0;
916         int make_spte_ret = 0;
917
918         if (unlikely(is_noslot_pfn(pfn)))
919                 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
920         else
921                 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
922                                          pfn, iter->old_spte, prefault, true,
923                                          map_writable, !shadow_accessed_mask,
924                                          &new_spte);
925
926         if (new_spte == iter->old_spte)
927                 ret = RET_PF_SPURIOUS;
928         else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
929                 return RET_PF_RETRY;
930
931         /*
932          * If the page fault was caused by a write but the page is write
933          * protected, emulation is needed. If the emulation was skipped,
934          * the vCPU would have the same fault again.
935          */
936         if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
937                 if (write)
938                         ret = RET_PF_EMULATE;
939                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
940         }
941
942         /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
943         if (unlikely(is_mmio_spte(new_spte))) {
944                 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
945                                      new_spte);
946                 ret = RET_PF_EMULATE;
947         } else {
948                 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
949                                        rcu_dereference(iter->sptep));
950         }
951
952         if (!prefault)
953                 vcpu->stat.pf_fixed++;
954
955         return ret;
956 }
957
958 /*
959  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
960  * page tables and SPTEs to translate the faulting guest physical address.
961  */
962 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
963                     int map_writable, int max_level, kvm_pfn_t pfn,
964                     bool prefault)
965 {
966         bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
967         bool write = error_code & PFERR_WRITE_MASK;
968         bool exec = error_code & PFERR_FETCH_MASK;
969         bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
970         struct kvm_mmu *mmu = vcpu->arch.mmu;
971         struct tdp_iter iter;
972         struct kvm_mmu_page *sp;
973         u64 *child_pt;
974         u64 new_spte;
975         int ret;
976         gfn_t gfn = gpa >> PAGE_SHIFT;
977         int level;
978         int req_level;
979
980         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
981                 return RET_PF_RETRY;
982         if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
983                 return RET_PF_RETRY;
984
985         level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
986                                         huge_page_disallowed, &req_level);
987
988         trace_kvm_mmu_spte_requested(gpa, level, pfn);
989
990         rcu_read_lock();
991
992         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
993                 if (nx_huge_page_workaround_enabled)
994                         disallowed_hugepage_adjust(iter.old_spte, gfn,
995                                                    iter.level, &pfn, &level);
996
997                 if (iter.level == level)
998                         break;
999
1000                 /*
1001                  * If there is an SPTE mapping a large page at a higher level
1002                  * than the target, that SPTE must be cleared and replaced
1003                  * with a non-leaf SPTE.
1004                  */
1005                 if (is_shadow_present_pte(iter.old_spte) &&
1006                     is_large_pte(iter.old_spte)) {
1007                         if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1008                                 break;
1009
1010                         /*
1011                          * The iter must explicitly re-read the spte here
1012                          * because the new value informs the !present
1013                          * path below.
1014                          */
1015                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1016                 }
1017
1018                 if (!is_shadow_present_pte(iter.old_spte)) {
1019                         /*
1020                          * If the SPTE has been frozen by another thread, just
1021                          * give up and retry, avoiding unnecessary page table
1022                          * allocation and freeing.
1023                          */
1024                         if (is_removed_spte(iter.old_spte))
1025                                 break;
1026
1027                         sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
1028                         child_pt = sp->spt;
1029
1030                         new_spte = make_nonleaf_spte(child_pt,
1031                                                      !shadow_accessed_mask);
1032
1033                         if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
1034                                                     new_spte)) {
1035                                 tdp_mmu_link_page(vcpu->kvm, sp, true,
1036                                                   huge_page_disallowed &&
1037                                                   req_level >= iter.level);
1038
1039                                 trace_kvm_mmu_get_page(sp, true);
1040                         } else {
1041                                 tdp_mmu_free_sp(sp);
1042                                 break;
1043                         }
1044                 }
1045         }
1046
1047         if (iter.level != level) {
1048                 rcu_read_unlock();
1049                 return RET_PF_RETRY;
1050         }
1051
1052         ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
1053                                               pfn, prefault);
1054         rcu_read_unlock();
1055
1056         return ret;
1057 }
1058
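/*
 * Zap the GFN range of an MMU notifier unmap event across all TDP MMU roots
 * in the range's address space. Returns true if a TLB flush is needed.
 */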
1059 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1060                                  bool flush)
1061 {
1062         struct kvm_mmu_page *root;
1063
1064         for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
1065                 flush |= zap_gfn_range(kvm, root, range->start, range->end,
1066                                        range->may_block, flush, false);
1067
1068         return flush;
1069 }
1070
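/*
 * Handler invoked by kvm_tdp_mmu_handle_gfn on each leaf SPTE mapping a GFN
 * in an MMU notifier range; the handlers' return values are OR'd together
 * across all roots and SPTEs.
 */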
1071 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1072                               struct kvm_gfn_range *range);
1073
1074 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1075                                                    struct kvm_gfn_range *range,
1076                                                    tdp_handler_t handler)
1077 {
1078         struct kvm_mmu_page *root;
1079         struct tdp_iter iter;
1080         bool ret = false;
1081
1082         rcu_read_lock();
1083
1084         /*
1085          * Don't support rescheduling, none of the MMU notifiers that funnel
1086          * into this helper allow blocking; it'd be dead, wasteful code.
1087          */
1088         for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1089                 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1090                         ret |= handler(kvm, &iter, range);
1091         }
1092
1093         rcu_read_unlock();
1094
1095         return ret;
1096 }
1097
1098 /*
1099  * Mark the SPTEs mapping GFNs in the range [start, end) as unaccessed and
1100  * return true if any of the GFNs in the range had been accessed.
1101  */
1102 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1103                           struct kvm_gfn_range *range)
1104 {
1105         u64 new_spte = 0;
1106
1107         /* If we have a non-accessed entry we don't need to change the pte. */
1108         if (!is_accessed_spte(iter->old_spte))
1109                 return false;
1110
1111         new_spte = iter->old_spte;
1112
1113         if (spte_ad_enabled(new_spte)) {
1114                 new_spte &= ~shadow_accessed_mask;
1115         } else {
1116                 /*
1117                  * Capture the dirty status of the page, so that it doesn't get
1118                  * lost when the SPTE is marked for access tracking.
1119                  */
1120                 if (is_writable_pte(new_spte))
1121                         kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1122
1123                 new_spte = mark_spte_for_access_track(new_spte);
1124         }
1125
1126         tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1127
1128         return true;
1129 }
1130
1131 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1132 {
1133         return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1134 }
1135
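/* Return true if the SPTE mapping this GFN is marked as accessed. */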
1136 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1137                          struct kvm_gfn_range *range)
1138 {
1139         return is_accessed_spte(iter->old_spte);
1140 }
1141
1142 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1143 {
1144         return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1145 }
1146
1147 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1148                          struct kvm_gfn_range *range)
1149 {
1150         u64 new_spte;
1151
1152         /* Huge pages aren't expected to be modified without first being zapped. */
1153         WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1154
1155         if (iter->level != PG_LEVEL_4K ||
1156             !is_shadow_present_pte(iter->old_spte))
1157                 return false;
1158
1159         /*
1160          * Note, when changing a read-only SPTE, it's not strictly necessary to
1161          * zero the SPTE before setting the new PFN, but doing so preserves the
1162          * invariant that the PFN of a present leaf SPTE can never change.
1163          * See __handle_changed_spte().
1164          */
1165         tdp_mmu_set_spte(kvm, iter, 0);
1166
1167         if (!pte_write(range->pte)) {
1168                 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1169                                                                   pte_pfn(range->pte));
1170
1171                 tdp_mmu_set_spte(kvm, iter, new_spte);
1172         }
1173
1174         return true;
1175 }
1176
1177 /*
1178  * Handle the changed_pte MMU notifier for the TDP MMU.
1179  * range->pte holds the new host PTE mapping the HVA covered by the MMU
1180  * notifier event.
1181  * Returns true if a flush is needed before releasing the MMU lock.
1182  */
1183 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1184 {
1185         bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1186
1187         /* FIXME: return 'flush' instead of flushing here. */
1188         if (flush)
1189                 kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1190
1191         return false;
1192 }
1193
1194 /*
1195  * Remove write access from all SPTEs at or above min_level that map GFNs
1196  * [start, end).
1197  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1198  */
1199 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1200                              gfn_t start, gfn_t end, int min_level)
1201 {
1202         struct tdp_iter iter;
1203         u64 new_spte;
1204         bool spte_set = false;
1205
1206         rcu_read_lock();
1207
1208         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1209
1210         for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1211                                    min_level, start, end) {
1212 retry:
1213                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1214                         continue;
1215
1216                 if (!is_shadow_present_pte(iter.old_spte) ||
1217                     !is_last_spte(iter.old_spte, iter.level) ||
1218                     !(iter.old_spte & PT_WRITABLE_MASK))
1219                         continue;
1220
1221                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1222
1223                 if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1224                                                           new_spte)) {
1225                         /*
1226                          * The iter must explicitly re-read the SPTE because
1227                          * the atomic cmpxchg failed.
1228                          */
1229                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1230                         goto retry;
1231                 }
1232                 spte_set = true;
1233         }
1234
1235         rcu_read_unlock();
1236         return spte_set;
1237 }
1238
1239 /*
1240  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1241  * only affect leaf SPTEs down to min_level.
1242  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1243  */
1244 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1245                              int min_level)
1246 {
1247         struct kvm_mmu_page *root;
1248         bool spte_set = false;
1249
1250         lockdep_assert_held_read(&kvm->mmu_lock);
1251
1252         for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1253                 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1254                              slot->base_gfn + slot->npages, min_level);
1255
1256         return spte_set;
1257 }
1258
1259 /*
1260  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1261  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1262  * If AD bits are not enabled, this will require clearing the writable bit on
1263  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1264  * be flushed.
1265  */
1266 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1267                            gfn_t start, gfn_t end)
1268 {
1269         struct tdp_iter iter;
1270         u64 new_spte;
1271         bool spte_set = false;
1272
1273         rcu_read_lock();
1274
1275         tdp_root_for_each_leaf_pte(iter, root, start, end) {
1276 retry:
1277                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1278                         continue;
1279
1280                 if (spte_ad_need_write_protect(iter.old_spte)) {
1281                         if (is_writable_pte(iter.old_spte))
1282                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1283                         else
1284                                 continue;
1285                 } else {
1286                         if (iter.old_spte & shadow_dirty_mask)
1287                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1288                         else
1289                                 continue;
1290                 }
1291
1292                 if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1293                                                           new_spte)) {
1294                         /*
1295                          * The iter must explicitly re-read the SPTE because
1296                          * the atomic cmpxchg failed.
1297                          */
1298                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1299                         goto retry;
1300                 }
1301                 spte_set = true;
1302         }
1303
1304         rcu_read_unlock();
1305         return spte_set;
1306 }
1307
1308 /*
1309  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1310  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1311  * If AD bits are not enabled, this will require clearing the writable bit on
1312  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1313  * be flushed.
1314  */
1315 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1316 {
1317         struct kvm_mmu_page *root;
1318         bool spte_set = false;
1319
1320         lockdep_assert_held_read(&kvm->mmu_lock);
1321
1322         for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1323                 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1324                                 slot->base_gfn + slot->npages);
1325
1326         return spte_set;
1327 }
1328
1329 /*
1330  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1331  * set in mask, starting at gfn. The given memslot is expected to contain all
1332  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1333  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1334  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1335  */
1336 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1337                                   gfn_t gfn, unsigned long mask, bool wrprot)
1338 {
1339         struct tdp_iter iter;
1340         u64 new_spte;
1341
1342         rcu_read_lock();
1343
1344         tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1345                                     gfn + BITS_PER_LONG) {
1346                 if (!mask)
1347                         break;
1348
1349                 if (iter.level > PG_LEVEL_4K ||
1350                     !(mask & (1UL << (iter.gfn - gfn))))
1351                         continue;
1352
1353                 mask &= ~(1UL << (iter.gfn - gfn));
1354
1355                 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1356                         if (is_writable_pte(iter.old_spte))
1357                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1358                         else
1359                                 continue;
1360                 } else {
1361                         if (iter.old_spte & shadow_dirty_mask)
1362                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1363                         else
1364                                 continue;
1365                 }
1366
1367                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1368         }
1369
1370         rcu_read_unlock();
1371 }
1372
1373 /*
1374  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1375  * set in mask, starting at gfn. The given memslot is expected to contain all
1376  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1377  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1378  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1379  */
1380 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1381                                        struct kvm_memory_slot *slot,
1382                                        gfn_t gfn, unsigned long mask,
1383                                        bool wrprot)
1384 {
1385         struct kvm_mmu_page *root;
1386
1387         lockdep_assert_held_write(&kvm->mmu_lock);
1388         for_each_tdp_mmu_root(kvm, root, slot->as_id)
1389                 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1390 }
1391
1392 /*
1393  * Clear leaf entries which could be replaced by large mappings, for
1394  * GFNs within the slot.
1395  */
1396 static bool zap_collapsible_spte_range(struct kvm *kvm,
1397                                        struct kvm_mmu_page *root,
1398                                        const struct kvm_memory_slot *slot,
1399                                        bool flush)
1400 {
1401         gfn_t start = slot->base_gfn;
1402         gfn_t end = start + slot->npages;
1403         struct tdp_iter iter;
1404         kvm_pfn_t pfn;
1405
1406         rcu_read_lock();
1407
1408         tdp_root_for_each_pte(iter, root, start, end) {
1409 retry:
1410                 if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1411                         flush = false;
1412                         continue;
1413                 }
1414
1415                 if (!is_shadow_present_pte(iter.old_spte) ||
1416                     !is_last_spte(iter.old_spte, iter.level))
1417                         continue;
1418
1419                 pfn = spte_to_pfn(iter.old_spte);
1420                 if (kvm_is_reserved_pfn(pfn) ||
1421                     iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1422                                                             pfn, PG_LEVEL_NUM))
1423                         continue;
1424
1425                 if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
1426                         /*
1427                          * The iter must explicitly re-read the SPTE because
1428                          * the atomic cmpxchg failed.
1429                          */
1430                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1431                         goto retry;
1432                 }
1433                 flush = true;
1434         }
1435
1436         rcu_read_unlock();
1437
1438         return flush;
1439 }
1440
1441 /*
1442  * Clear non-leaf entries (and free associated page tables) which could
1443  * be replaced by large mappings, for GFNs within the slot.
1444  */
1445 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1446                                        const struct kvm_memory_slot *slot,
1447                                        bool flush)
1448 {
1449         struct kvm_mmu_page *root;
1450
1451         lockdep_assert_held_read(&kvm->mmu_lock);
1452
1453         for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1454                 flush = zap_collapsible_spte_range(kvm, root, slot, flush);
1455
1456         return flush;
1457 }
1458
1459 /*
1460  * Removes write access on the last level SPTE mapping this GFN and unsets the
1461  * MMU-writable bit to ensure future writes continue to be intercepted.
1462  * Returns true if an SPTE was set and a TLB flush is needed.
1463  */
1464 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1465                               gfn_t gfn)
1466 {
1467         struct tdp_iter iter;
1468         u64 new_spte;
1469         bool spte_set = false;
1470
1471         rcu_read_lock();
1472
1473         tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1474                 if (!is_writable_pte(iter.old_spte))
1475                         break;
1476
1477                 new_spte = iter.old_spte &
1478                         ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1479
1480                 tdp_mmu_set_spte(kvm, &iter, new_spte);
1481                 spte_set = true;
1482         }
1483
1484         rcu_read_unlock();
1485
1486         return spte_set;
1487 }
1488
1489 /*
1490  * Removes write access on the last level SPTE mapping this GFN and unsets the
1491  * MMU-writable bit to ensure future writes continue to be intercepted.
1492  * Returns true if an SPTE was set and a TLB flush is needed.
1493  */
1494 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1495                                    struct kvm_memory_slot *slot, gfn_t gfn)
1496 {
1497         struct kvm_mmu_page *root;
1498         bool spte_set = false;
1499
1500         lockdep_assert_held_write(&kvm->mmu_lock);
1501         for_each_tdp_mmu_root(kvm, root, slot->as_id)
1502                 spte_set |= write_protect_gfn(kvm, root, gfn);
1503
1504         return spte_set;
1505 }
1506
1507 /*
1508  * Return the level of the lowest level SPTE added to sptes.
1509  * That SPTE may be non-present.
1510  */
1511 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1512                          int *root_level)
1513 {
1514         struct tdp_iter iter;
1515         struct kvm_mmu *mmu = vcpu->arch.mmu;
1516         gfn_t gfn = addr >> PAGE_SHIFT;
1517         int leaf = -1;
1518
1519         *root_level = vcpu->arch.mmu->shadow_root_level;
1520
1521         rcu_read_lock();
1522
1523         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1524                 leaf = iter.level;
1525                 sptes[leaf] = iter.old_spte;
1526         }
1527
1528         rcu_read_unlock();
1529
1530         return leaf;
1531 }