1 // SPDX-License-Identifier: GPL-2.0
4 #include "mmu_internal.h"
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
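/*
 * Note: module_param_named() with mode 0644 exposes this as a writable
 * module parameter (e.g. /sys/module/kvm/parameters/tdp_mmu, assuming this
 * file is built into kvm.ko). kvm_mmu_init_tdp_mmu() below samples the
 * value once per VM, so toggling it at runtime only affects VMs created
 * afterwards.
 */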
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
19 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 /* This should not be changed for the lifetime of the VM. */
23 kvm->arch.tdp_mmu_enabled = true;
25 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
32 if (!kvm->arch.tdp_mmu_enabled)
35 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
38 * Ensure that all the outstanding RCU callbacks to free shadow pages
39 * can run before the VM is torn down.
44 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
45 gfn_t start, gfn_t end, bool can_yield, bool flush);
47 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
49 free_page((unsigned long)sp->spt);
50 kmem_cache_free(mmu_page_header_cache, sp);
53 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
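/*
 * shadow_phys_bits is the number of physical address bits KVM can map, so
 * this is one past the largest GFN a root can cover.
 */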
55 gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
57 lockdep_assert_held_write(&kvm->mmu_lock);
59 if (--root->root_count)
62 WARN_ON(!root->tdp_mmu_page);
64 list_del(&root->link);
66 zap_gfn_range(kvm, root, 0, max_gfn, false, false);
68 tdp_mmu_free_sp(root);
72 * Finds the next valid root after root (or the first valid root if root
73 * is NULL), takes a reference on it, and returns that next root. If root
74 * is not NULL, this thread should have already taken a reference on it, and
75 * that reference will be dropped. If no valid root is found, this
76 * function will return NULL.
78 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
79 struct kvm_mmu_page *prev_root)
81 struct kvm_mmu_page *next_root;
83 lockdep_assert_held_write(&kvm->mmu_lock);
86 next_root = list_next_entry(prev_root, link);
88 next_root = list_first_entry(&kvm->arch.tdp_mmu_roots,
89 typeof(*next_root), link);
91 if (list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link))
94 kvm_tdp_mmu_get_root(kvm, next_root);
97 kvm_tdp_mmu_put_root(kvm, prev_root);
103 * Note: this iterator gets and puts references to the roots it iterates over.
104 * This makes it safe to release the MMU lock and yield within the loop, but
105 * if exiting the loop early, the caller must drop the reference to the most
106 * recent root. (Unless keeping a live reference is desirable.)
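/*
 * For illustration, a caller that bails out of the walk early would look
 * roughly like this (the hypothetical want_to_stop() stands in for any
 * early-exit condition):
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) {
 *		if (want_to_stop(root)) {
 *			kvm_tdp_mmu_put_root(kvm, root);
 *			break;
 *		}
 *	}
 */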
108 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
109 for (_root = tdp_mmu_next_root(_kvm, NULL); \
111 _root = tdp_mmu_next_root(_kvm, _root)) \
112 if (kvm_mmu_page_as_id(_root) != _as_id) { \
115 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
116 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
117 if (kvm_mmu_page_as_id(_root) != _as_id) { \
120 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
123 union kvm_mmu_page_role role;
125 role = vcpu->arch.mmu->mmu_role.base;
128 role.gpte_is_8_bytes = true;
129 role.access = ACC_ALL;
134 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
137 struct kvm_mmu_page *sp;
139 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
140 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
141 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
143 sp->role.word = page_role_for_level(vcpu, level).word;
145 sp->tdp_mmu_page = true;
147 trace_kvm_mmu_get_page(sp, true);
152 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
154 union kvm_mmu_page_role role;
155 struct kvm *kvm = vcpu->kvm;
156 struct kvm_mmu_page *root;
158 lockdep_assert_held_write(&kvm->mmu_lock);
160 role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
162 /* Check for an existing root before allocating a new one. */
163 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
164 if (root->role.word == role.word) {
165 kvm_tdp_mmu_get_root(kvm, root);
170 root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
171 root->root_count = 1;
173 list_add(&root->link, &kvm->arch.tdp_mmu_roots);
176 return __pa(root->spt);
180 * This is called through call_rcu in order to free TDP page table memory
* safely with respect to other kernel threads that may be operating on
* the memory.
183 * By only accessing TDP MMU page table memory in an RCU read critical
184 * section, and freeing it after a grace period, lockless access to that
185 * memory won't use it after it is freed.
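/*
 * The reader side of that scheme is, roughly:
 *
 *	rcu_read_lock();
 *	... walk SPTEs via rcu_dereference() ...
 *	rcu_read_unlock();
 *
 * Any page table page observed inside the read-side critical section stays
 * allocated at least until the section ends, because the actual free below
 * is deferred through call_rcu().
 */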
187 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
189 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
195 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
196 u64 old_spte, u64 new_spte, int level,
199 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
201 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
204 if (is_accessed_spte(old_spte) &&
205 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
206 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
207 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
210 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
211 u64 old_spte, u64 new_spte, int level)
214 struct kvm_memory_slot *slot;
216 if (level > PG_LEVEL_4K)
219 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
221 if ((!is_writable_pte(old_spte) || pfn_changed) &&
222 is_writable_pte(new_spte)) {
223 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
224 mark_page_dirty_in_slot(kvm, slot, gfn);
229 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
233 * @shared: This operation may not be running under the exclusive use of
234 * the MMU lock and the operation must synchronize with other
235 * threads that might be adding or removing pages.
* @account_nx: This page replaces an NX large page and should be marked for
*		eventual reclaim.
239 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
240 bool shared, bool account_nx)
243 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
245 lockdep_assert_held_write(&kvm->mmu_lock);
247 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
249 account_huge_nx_page(kvm, sp);
252 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
256 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
259 * @sp: the page to be removed
260 * @shared: This operation may not be running under the exclusive use of
261 * the MMU lock and the operation must synchronize with other
262 * threads that might be adding or removing pages.
264 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
268 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
270 lockdep_assert_held_write(&kvm->mmu_lock);
273 if (sp->lpage_disallowed)
274 unaccount_huge_nx_page(kvm, sp);
277 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
281 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
284 * @pt: the page removed from the paging structure
285 * @shared: This operation may not be running under the exclusive use
286 * of the MMU lock and the operation must synchronize with other
287 * threads that might be modifying SPTEs.
289 * Given a page table that has been removed from the TDP paging structure,
290 * iterates through the page table to clear SPTEs and free child page tables.
292 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
293 * protection. Since this thread removed it from the paging structure,
294 * this thread will be responsible for ensuring the page is freed. Hence the
295 * early rcu_dereferences in the function.
297 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
300 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
301 int level = sp->role.level;
302 gfn_t base_gfn = sp->gfn;
308 trace_kvm_mmu_prepare_zap_page(sp);
310 tdp_mmu_unlink_page(kvm, sp, shared);
312 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
313 sptep = rcu_dereference(pt) + i;
314 gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
* Set the SPTE to a nonpresent value that other
* threads will not overwrite. If the SPTE was
* already marked as removed then another thread
* handling a page fault could overwrite it, so
* retry the exchange until the SPTE is observed
* changing from some other value to the removed
* SPTE value.
326 old_child_spte = xchg(sptep, REMOVED_SPTE);
327 if (!is_removed_spte(old_child_spte))
333 * If the SPTE is not MMU-present, there is no backing
334 * page associated with the SPTE and so no side effects
335 * that need to be recorded, and exclusive ownership of
336 * mmu_lock ensures the SPTE can't be made present.
337 * Note, zapping MMIO SPTEs is also unnecessary as they
* are guarded by the memslots generation, not by being
* unreachable.
341 old_child_spte = READ_ONCE(*sptep);
342 if (!is_shadow_present_pte(old_child_spte))
346 * Marking the SPTE as a removed SPTE is not
347 * strictly necessary here as the MMU lock will
348 * stop other threads from concurrently modifying
349 * this SPTE. Using the removed SPTE value keeps
350 * the two branches consistent and simplifies
353 WRITE_ONCE(*sptep, REMOVED_SPTE);
355 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
356 old_child_spte, REMOVED_SPTE, level - 1,
360 kvm_flush_remote_tlbs_with_address(kvm, gfn,
361 KVM_PAGES_PER_HPAGE(level));
363 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
367 * handle_changed_spte - handle bookkeeping associated with an SPTE change
369 * @as_id: the address space of the paging structure the SPTE was a part of
370 * @gfn: the base GFN that was mapped by the SPTE
371 * @old_spte: The value of the SPTE before the change
372 * @new_spte: The value of the SPTE after the change
373 * @level: the level of the PT the SPTE is part of in the paging structure
374 * @shared: This operation may not be running under the exclusive use of
375 * the MMU lock and the operation must synchronize with other
376 * threads that might be modifying SPTEs.
378 * Handle bookkeeping that might result from the modification of a SPTE.
379 * This function must be called for all TDP SPTE modifications.
381 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
382 u64 old_spte, u64 new_spte, int level,
385 bool was_present = is_shadow_present_pte(old_spte);
386 bool is_present = is_shadow_present_pte(new_spte);
387 bool was_leaf = was_present && is_last_spte(old_spte, level);
388 bool is_leaf = is_present && is_last_spte(new_spte, level);
389 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
391 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
392 WARN_ON(level < PG_LEVEL_4K);
393 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
396 * If this warning were to trigger it would indicate that there was a
397 * missing MMU notifier or a race with some notifier handler.
398 * A present, leaf SPTE should never be directly replaced with another
* present leaf SPTE pointing to a different PFN. A notifier handler
400 * should be zapping the SPTE before the main MM's page table is
401 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
402 * thread before replacement.
404 if (was_leaf && is_leaf && pfn_changed) {
405 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
406 "SPTE with another present leaf SPTE mapping a\n"
408 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
409 as_id, gfn, old_spte, new_spte, level);
* Crash the host to prevent error propagation and guest data
* corruption.
418 if (old_spte == new_spte)
421 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
424 * The only times a SPTE should be changed from a non-present to
425 * non-present state is when an MMIO entry is installed/modified/
426 * removed. In that case, there is nothing to do here.
428 if (!was_present && !is_present) {
430 * If this change does not involve a MMIO SPTE or removed SPTE,
431 * it is unexpected. Log the change, though it should not
432 * impact the guest since both the former and current SPTEs
435 if (WARN_ON(!is_mmio_spte(old_spte) &&
436 !is_mmio_spte(new_spte) &&
437 !is_removed_spte(new_spte)))
438 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
439 "should not be replaced with another,\n"
440 "different nonpresent SPTE, unless one or both\n"
441 "are MMIO SPTEs, or the new SPTE is\n"
442 "a temporary removed SPTE.\n"
443 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
444 as_id, gfn, old_spte, new_spte, level);
449 if (was_leaf && is_dirty_spte(old_spte) &&
450 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
451 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
454 * Recursively handle child PTs if the change removed a subtree from
455 * the paging structure.
457 if (was_present && !was_leaf && (pfn_changed || !is_present))
458 handle_removed_tdp_mmu_page(kvm,
459 spte_to_child_pt(old_spte, level), shared);
462 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
463 u64 old_spte, u64 new_spte, int level,
466 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
468 handle_changed_spte_acc_track(old_spte, new_spte, level);
469 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
474 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
475 * associated bookkeeping
478 * @iter: a tdp_iter instance currently on the SPTE that should be set
479 * @new_spte: The value the SPTE should be set to
480 * Returns: true if the SPTE was set, false if it was not. If false is returned,
481 * this function will have no side-effects.
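/*
 * A false return means another thread raced and changed the SPTE first; the
 * callers in this file simply give up and let the operation be retried,
 * e.g. (sketch of the page fault path below):
 *
 *	if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
 *		return RET_PF_RETRY;
 */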
483 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
484 struct tdp_iter *iter,
487 lockdep_assert_held_read(&kvm->mmu_lock);
490 * Do not change removed SPTEs. Only the thread that froze the SPTE
493 if (is_removed_spte(iter->old_spte))
496 if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
497 new_spte) != iter->old_spte)
500 handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
501 new_spte, iter->level, true);
506 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
507 struct tdp_iter *iter)
510 * Freeze the SPTE by setting it to a special,
511 * non-present value. This will stop other threads from
512 * immediately installing a present entry in its place
513 * before the TLBs are flushed.
515 if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
518 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
519 KVM_PAGES_PER_HPAGE(iter->level));
522 * No other thread can overwrite the removed SPTE as they
523 * must either wait on the MMU lock or use
* tdp_mmu_set_spte_atomic which will not overwrite the
* special removed SPTE value. No bookkeeping is needed
* here since the SPTE is going from non-present
* to non-present.
529 WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
536 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
538 * @iter: a tdp_iter instance currently on the SPTE that should be set
539 * @new_spte: The value the SPTE should be set to
540 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
541 * of the page. Should be set unless handling an MMU
542 * notifier for access tracking. Leaving record_acc_track
* unset in that case prevents page accesses from being
* double counted.
545 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
546 * appropriate for the change being made. Should be set
547 * unless performing certain dirty logging operations.
548 * Leaving record_dirty_log unset in that case prevents page
549 * writes from being double counted.
551 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
552 u64 new_spte, bool record_acc_track,
553 bool record_dirty_log)
555 lockdep_assert_held_write(&kvm->mmu_lock);
558 * No thread should be using this function to set SPTEs to the
559 * temporary removed SPTE value.
560 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
561 * should be used. If operating under the MMU lock in write mode, the
562 * use of the removed SPTE should not be necessary.
564 WARN_ON(is_removed_spte(iter->old_spte));
566 WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
568 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
569 new_spte, iter->level, false);
570 if (record_acc_track)
571 handle_changed_spte_acc_track(iter->old_spte, new_spte,
573 if (record_dirty_log)
574 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
575 iter->old_spte, new_spte,
579 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
582 __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
585 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
586 struct tdp_iter *iter,
589 __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
592 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
593 struct tdp_iter *iter,
596 __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
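/*
 * Iteration helpers: the tdp_root_for_each_*() macros below walk the paging
 * structure hanging off a specific root kvm_mmu_page, while
 * tdp_mmu_for_each_pte() starts from the vCPU's current root (mmu->root_hpa).
 * Each yields a tdp_iter positioned on SPTEs covering the GFN range
 * [_start, _end).
 */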
599 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
600 for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
602 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
603 tdp_root_for_each_pte(_iter, _root, _start, _end) \
604 if (!is_shadow_present_pte(_iter.old_spte) || \
605 !is_last_spte(_iter.old_spte, _iter.level)) \
609 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
610 for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \
611 _mmu->shadow_root_level, _start, _end)
* Yield if the MMU lock is contended or this thread needs to return control
* to the scheduler.
617 * If this function should yield and flush is set, it will perform a remote
618 * TLB flush before yielding.
620 * If this function yields, it will also reset the tdp_iter's walk over the
621 * paging structure and the calling function should skip to the next
622 * iteration to allow the iterator to continue its traversal from the
623 * paging structure root.
625 * Return true if this function yielded and the iterator's traversal was reset.
626 * Return false if a yield was not needed.
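/*
 * The expected calling pattern, as used by zap_gfn_range() and friends
 * below, is roughly:
 *
 *	tdp_root_for_each_pte(iter, root, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
 *			flush = false;
 *			continue;
 *		}
 *		...
 *	}
 */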
628 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
629 struct tdp_iter *iter, bool flush)
631 /* Ensure forward progress has been made before yielding. */
632 if (iter->next_last_level_gfn == iter->yielded_gfn)
635 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
639 kvm_flush_remote_tlbs(kvm);
641 cond_resched_rwlock_write(&kvm->mmu_lock);
644 WARN_ON(iter->gfn > iter->next_last_level_gfn);
646 tdp_iter_restart(iter);
655 * Tears down the mappings for the range of gfns, [start, end), and frees the
656 * non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
659 * If can_yield is true, will release the MMU lock and reschedule if the
660 * scheduler needs the CPU or there is contention on the MMU lock. If this
661 * function cannot yield, it will not release the MMU lock or reschedule and
662 * the caller must ensure it does not supply too large a GFN range, or the
663 * operation can cause a soft lockup. Note, in some use cases a flush may be
* required by prior actions. Ensure the pending flush is performed prior to
* yielding.
667 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
668 gfn_t start, gfn_t end, bool can_yield, bool flush)
670 struct tdp_iter iter;
674 tdp_root_for_each_pte(iter, root, start, end) {
676 tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
681 if (!is_shadow_present_pte(iter.old_spte))
685 * If this is a non-last-level SPTE that covers a larger range
* than should be zapped, continue, and zap the mappings at a
* lower level, if applicable.
689 if ((iter.gfn < start ||
690 iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
691 !is_last_spte(iter.old_spte, iter.level))
694 tdp_mmu_set_spte(kvm, &iter, 0);
703 * Tears down the mappings for the range of gfns, [start, end), and frees the
704 * non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
708 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
709 gfn_t end, bool can_yield, bool flush)
711 struct kvm_mmu_page *root;
713 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
714 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
719 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
721 gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
725 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
726 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, flush);
729 kvm_flush_remote_tlbs(kvm);
733 * Installs a last-level SPTE to handle a TDP page fault.
734 * (NPT/EPT violation/misconfiguration)
736 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
738 struct tdp_iter *iter,
739 kvm_pfn_t pfn, bool prefault)
743 int make_spte_ret = 0;
745 if (unlikely(is_noslot_pfn(pfn)))
746 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
748 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
749 pfn, iter->old_spte, prefault, true,
750 map_writable, !shadow_accessed_mask,
753 if (new_spte == iter->old_spte)
754 ret = RET_PF_SPURIOUS;
755 else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
759 * If the page fault was caused by a write but the page is write
760 * protected, emulation is needed. If the emulation was skipped,
761 * the vCPU would have the same fault again.
763 if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
765 ret = RET_PF_EMULATE;
766 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
769 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
770 if (unlikely(is_mmio_spte(new_spte))) {
771 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
773 ret = RET_PF_EMULATE;
775 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
776 rcu_dereference(iter->sptep));
780 vcpu->stat.pf_fixed++;
786 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
787 * page tables and SPTEs to translate the faulting guest physical address.
789 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
790 int map_writable, int max_level, kvm_pfn_t pfn,
793 bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
794 bool write = error_code & PFERR_WRITE_MASK;
795 bool exec = error_code & PFERR_FETCH_MASK;
796 bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
797 struct kvm_mmu *mmu = vcpu->arch.mmu;
798 struct tdp_iter iter;
799 struct kvm_mmu_page *sp;
803 gfn_t gfn = gpa >> PAGE_SHIFT;
807 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
809 if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
812 level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
813 huge_page_disallowed, &req_level);
815 trace_kvm_mmu_spte_requested(gpa, level, pfn);
819 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
820 if (nx_huge_page_workaround_enabled)
821 disallowed_hugepage_adjust(iter.old_spte, gfn,
822 iter.level, &pfn, &level);
824 if (iter.level == level)
828 * If there is an SPTE mapping a large page at a higher level
829 * than the target, that SPTE must be cleared and replaced
830 * with a non-leaf SPTE.
832 if (is_shadow_present_pte(iter.old_spte) &&
833 is_large_pte(iter.old_spte)) {
834 if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
838 * The iter must explicitly re-read the spte here
* because the new value informs the !present
* path below.
842 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
845 if (!is_shadow_present_pte(iter.old_spte)) {
846 sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
849 new_spte = make_nonleaf_spte(child_pt,
850 !shadow_accessed_mask);
852 if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
854 tdp_mmu_link_page(vcpu->kvm, sp, true,
855 huge_page_disallowed &&
856 req_level >= iter.level);
858 trace_kvm_mmu_get_page(sp, true);
866 if (iter.level != level) {
871 ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
878 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
881 struct kvm_mmu_page *root;
883 for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
884 flush |= zap_gfn_range(kvm, root, range->start, range->end,
885 range->may_block, flush);
890 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
891 struct kvm_gfn_range *range);
893 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
894 struct kvm_gfn_range *range,
895 tdp_handler_t handler)
897 struct kvm_mmu_page *root;
898 struct tdp_iter iter;
904 * Don't support rescheduling, none of the MMU notifiers that funnel
905 * into this helper allow blocking; it'd be dead, wasteful code.
907 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
908 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
909 ret |= handler(kvm, &iter, range);
* Mark the SPTEs in the range of GFNs [start, end) as unaccessed and return
* true if any of the GFNs in the range have been accessed.
921 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
922 struct kvm_gfn_range *range)
926 /* If we have a non-accessed entry we don't need to change the pte. */
927 if (!is_accessed_spte(iter->old_spte))
930 new_spte = iter->old_spte;
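/*
 * With A/D bits enabled, aging only needs to clear the hardware accessed
 * bit and the SPTE stays present. Without A/D bits, the SPTE is converted
 * to an access-tracking SPTE below, which is non-present to hardware.
 */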
932 if (spte_ad_enabled(new_spte)) {
933 new_spte &= ~shadow_accessed_mask;
936 * Capture the dirty status of the page, so that it doesn't get
937 * lost when the SPTE is marked for access tracking.
939 if (is_writable_pte(new_spte))
940 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
942 new_spte = mark_spte_for_access_track(new_spte);
945 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
950 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
952 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
955 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
956 struct kvm_gfn_range *range)
958 return is_accessed_spte(iter->old_spte);
961 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
963 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
966 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
967 struct kvm_gfn_range *range)
971 /* Huge pages aren't expected to be modified without first being zapped. */
972 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
974 if (iter->level != PG_LEVEL_4K ||
975 !is_shadow_present_pte(iter->old_spte))
979 * Note, when changing a read-only SPTE, it's not strictly necessary to
980 * zero the SPTE before setting the new PFN, but doing so preserves the
* invariant that the PFN of a present leaf SPTE can never change.
982 * See __handle_changed_spte().
984 tdp_mmu_set_spte(kvm, iter, 0);
986 if (!pte_write(range->pte)) {
987 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
988 pte_pfn(range->pte));
990 tdp_mmu_set_spte(kvm, iter, new_spte);
* Handle the changed_pte MMU notifier for the TDP MMU.
* range->pte holds the new PTE mapping the HVA covered by the notifier.
* Returns true if a flush is needed before releasing the MMU lock.
1002 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1004 bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1006 /* FIXME: return 'flush' instead of flushing here. */
1008 kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
* Remove write access from all the SPTEs mapping GFNs [start, end). Only
* leaf SPTEs at levels at or above min_level are write-protected, so 4k
* mappings are left alone when min_level is above PG_LEVEL_4K.
1016 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1018 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1019 gfn_t start, gfn_t end, int min_level)
1021 struct tdp_iter iter;
1023 bool spte_set = false;
1027 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1029 for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1030 min_level, start, end) {
1031 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1034 if (!is_shadow_present_pte(iter.old_spte) ||
1035 !is_last_spte(iter.old_spte, iter.level) ||
1036 !(iter.old_spte & PT_WRITABLE_MASK))
1039 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1041 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1050 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1051 * only affect leaf SPTEs down to min_level.
1052 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1054 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1057 struct kvm_mmu_page *root;
1058 bool spte_set = false;
1060 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1061 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1062 slot->base_gfn + slot->npages, min_level);
1068 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1069 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1070 * If AD bits are not enabled, this will require clearing the writable bit on
* each SPTE. Returns true if an SPTE has been changed and the TLBs need to
* be flushed.
1074 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1075 gfn_t start, gfn_t end)
1077 struct tdp_iter iter;
1079 bool spte_set = false;
1083 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1084 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1087 if (spte_ad_need_write_protect(iter.old_spte)) {
1088 if (is_writable_pte(iter.old_spte))
1089 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1093 if (iter.old_spte & shadow_dirty_mask)
1094 new_spte = iter.old_spte & ~shadow_dirty_mask;
1099 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1108 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1109 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1110 * If AD bits are not enabled, this will require clearing the writable bit on
* each SPTE. Returns true if an SPTE has been changed and the TLBs need to
* be flushed.
1114 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1116 struct kvm_mmu_page *root;
1117 bool spte_set = false;
1119 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1120 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1121 slot->base_gfn + slot->npages);
1127 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1128 * set in mask, starting at gfn. The given memslot is expected to contain all
1129 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1130 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1131 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1133 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1134 gfn_t gfn, unsigned long mask, bool wrprot)
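/*
 * Each set bit i in @mask denotes GFN @gfn + i, so the walk below only has
 * to cover [gfn + __ffs(mask), gfn + BITS_PER_LONG). For example, a mask of
 * 0x5 clears the dirty state for gfn and gfn + 2 only.
 */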
1136 struct tdp_iter iter;
1141 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1142 gfn + BITS_PER_LONG) {
1146 if (iter.level > PG_LEVEL_4K ||
1147 !(mask & (1UL << (iter.gfn - gfn))))
1150 mask &= ~(1UL << (iter.gfn - gfn));
1152 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1153 if (is_writable_pte(iter.old_spte))
1154 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1158 if (iter.old_spte & shadow_dirty_mask)
1159 new_spte = iter.old_spte & ~shadow_dirty_mask;
1164 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1171 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1172 * set in mask, starting at gfn. The given memslot is expected to contain all
1173 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1174 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1175 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1177 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1178 struct kvm_memory_slot *slot,
1179 gfn_t gfn, unsigned long mask,
1182 struct kvm_mmu_page *root;
1184 lockdep_assert_held_write(&kvm->mmu_lock);
1185 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1186 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1190 * Clear leaf entries which could be replaced by large mappings, for
1191 * GFNs within the slot.
1193 static bool zap_collapsible_spte_range(struct kvm *kvm,
1194 struct kvm_mmu_page *root,
1195 const struct kvm_memory_slot *slot,
1198 gfn_t start = slot->base_gfn;
1199 gfn_t end = start + slot->npages;
1200 struct tdp_iter iter;
1205 tdp_root_for_each_pte(iter, root, start, end) {
1206 if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
1211 if (!is_shadow_present_pte(iter.old_spte) ||
1212 !is_last_spte(iter.old_spte, iter.level))
1215 pfn = spte_to_pfn(iter.old_spte);
1216 if (kvm_is_reserved_pfn(pfn) ||
1217 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1221 tdp_mmu_set_spte(kvm, &iter, 0);
1232 * Clear non-leaf entries (and free associated page tables) which could
1233 * be replaced by large mappings, for GFNs within the slot.
1235 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1236 const struct kvm_memory_slot *slot,
1239 struct kvm_mmu_page *root;
1241 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1242 flush = zap_collapsible_spte_range(kvm, root, slot, flush);
1248 * Removes write access on the last level SPTE mapping this GFN and unsets the
1249 * MMU-writable bit to ensure future writes continue to be intercepted.
1250 * Returns true if an SPTE was set and a TLB flush is needed.
1252 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1255 struct tdp_iter iter;
1257 bool spte_set = false;
1261 tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1262 if (!is_writable_pte(iter.old_spte))
1265 new_spte = iter.old_spte &
1266 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1268 tdp_mmu_set_spte(kvm, &iter, new_spte);
1278 * Removes write access on the last level SPTE mapping this GFN and unsets the
1279 * MMU-writable bit to ensure future writes continue to be intercepted.
1280 * Returns true if an SPTE was set and a TLB flush is needed.
1282 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1283 struct kvm_memory_slot *slot, gfn_t gfn)
1285 struct kvm_mmu_page *root;
1286 bool spte_set = false;
1288 lockdep_assert_held_write(&kvm->mmu_lock);
1289 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1290 spte_set |= write_protect_gfn(kvm, root, gfn);
1296 * Return the level of the lowest level SPTE added to sptes.
1297 * That SPTE may be non-present.
1299 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1302 struct tdp_iter iter;
1303 struct kvm_mmu *mmu = vcpu->arch.mmu;
1304 gfn_t gfn = addr >> PAGE_SHIFT;
1307 *root_level = vcpu->arch.mmu->shadow_root_level;
1311 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1313 sptes[leaf] = iter.old_spte;