arch/x86/kvm/mmu/tdp_mmu.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
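
/*
 * Note: tdp_mmu_enabled is only consulted when a VM is created (see
 * kvm_mmu_init_tdp_mmu() below), and kvm->arch.tdp_mmu_enabled is then
 * fixed for the lifetime of the VM, so flipping the module parameter at
 * runtime only affects VMs created afterwards.
 */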
15
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19         if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20                 return;
21
22         /* This should not be changed for the lifetime of the VM. */
23         kvm->arch.tdp_mmu_enabled = true;
24
25         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26         spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 }
29
30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
31 {
32         if (!kvm->arch.tdp_mmu_enabled)
33                 return;
34
35         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
36
37         /*
38          * Ensure that all the outstanding RCU callbacks to free shadow pages
39          * can run before the VM is torn down.
40          */
41         rcu_barrier();
42 }
43
44 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
45 {
46         if (kvm_mmu_put_root(kvm, root))
47                 kvm_tdp_mmu_free_root(kvm, root);
48 }
49
50 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
51                                            struct kvm_mmu_page *root)
52 {
53         lockdep_assert_held_write(&kvm->mmu_lock);
54
55         if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
56                 return false;
57
58         kvm_mmu_get_root(kvm, root);
59         return true;
60
61 }
62
63 static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
64                                                      struct kvm_mmu_page *root)
65 {
66         struct kvm_mmu_page *next_root;
67
68         next_root = list_next_entry(root, link);
69         tdp_mmu_put_root(kvm, root);
70         return next_root;
71 }
72
73 /*
74  * Note: this iterator gets and puts references to the roots it iterates over.
75  * This makes it safe to release the MMU lock and yield within the loop, but
76  * if exiting the loop early, the caller must drop the reference to the most
77  * recent root. (Unless keeping a live reference is desirable.)
78  */
79 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                           \
80         for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,        \
81                                       typeof(*_root), link);            \
82              tdp_mmu_next_root_valid(_kvm, _root);                      \
83              _root = tdp_mmu_next_root(_kvm, _root))
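
/*
 * Typical use of the yield-safe iterator, as in kvm_tdp_mmu_zap_gfn_range()
 * below; the per-root reference taken by the iterator is what makes dropping
 * the MMU lock inside the loop body safe:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root)
 *		flush |= zap_gfn_range(kvm, root, start, end, true);
 */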
84
85 #define for_each_tdp_mmu_root(_kvm, _root)                              \
86         list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
87
88 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
89                           gfn_t start, gfn_t end, bool can_yield);
90
91 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
92 {
93         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
94
95         lockdep_assert_held_write(&kvm->mmu_lock);
96
97         WARN_ON(root->root_count);
98         WARN_ON(!root->tdp_mmu_page);
99
100         list_del(&root->link);
101
102         zap_gfn_range(kvm, root, 0, max_gfn, false);
103
104         free_page((unsigned long)root->spt);
105         kmem_cache_free(mmu_page_header_cache, root);
106 }
107
108 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
109                                                    int level)
110 {
111         union kvm_mmu_page_role role;
112
113         role = vcpu->arch.mmu->mmu_role.base;
114         role.level = level;
115         role.direct = true;
116         role.gpte_is_8_bytes = true;
117         role.access = ACC_ALL;
118
119         return role;
120 }
121
122 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
123                                                int level)
124 {
125         struct kvm_mmu_page *sp;
126
127         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
128         sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
129         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
130
131         sp->role.word = page_role_for_level(vcpu, level).word;
132         sp->gfn = gfn;
133         sp->tdp_mmu_page = true;
134
135         trace_kvm_mmu_get_page(sp, true);
136
137         return sp;
138 }
139
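/*
 * Find an existing root with a matching role for this vCPU, taking a
 * reference on it, or allocate and publish a new one under the MMU lock.
 */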
140 static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
141 {
142         union kvm_mmu_page_role role;
143         struct kvm *kvm = vcpu->kvm;
144         struct kvm_mmu_page *root;
145
146         role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
147
148         write_lock(&kvm->mmu_lock);
149
150         /* Check for an existing root before allocating a new one. */
151         for_each_tdp_mmu_root(kvm, root) {
152                 if (root->role.word == role.word) {
153                         kvm_mmu_get_root(kvm, root);
154                         write_unlock(&kvm->mmu_lock);
155                         return root;
156                 }
157         }
158
159         root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
160         root->root_count = 1;
161
162         list_add(&root->link, &kvm->arch.tdp_mmu_roots);
163
164         write_unlock(&kvm->mmu_lock);
165
166         return root;
167 }
168
169 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
170 {
171         struct kvm_mmu_page *root;
172
173         root = get_tdp_mmu_vcpu_root(vcpu);
174         if (!root)
175                 return INVALID_PAGE;
176
177         return __pa(root->spt);
178 }
179
180 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
181 {
182         free_page((unsigned long)sp->spt);
183         kmem_cache_free(mmu_page_header_cache, sp);
184 }
185
186 /*
187  * This is called through call_rcu in order to free TDP page table memory
188  * safely with respect to other kernel threads that may be operating on
189  * the memory.
190  * Because TDP MMU page table memory is only accessed within RCU read
191  * critical sections, and is freed only after an RCU grace period, lockless
192  * walkers can never use the memory after it has been freed.
193  */
194 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
195 {
196         struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
197                                                rcu_head);
198
199         tdp_mmu_free_sp(sp);
200 }
201
202 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
203                                 u64 old_spte, u64 new_spte, int level,
204                                 bool shared);
205
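/* SMM has its own address space (1); everything else uses address space 0. */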
206 static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
207 {
208         return sp->role.smm ? 1 : 0;
209 }
210
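/*
 * Propagate a lost Accessed state back to the primary MM: if a previously
 * accessed leaf SPTE is being replaced by one that is not accessed (or that
 * maps a different PFN), report the old PFN as accessed.
 */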
211 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
212 {
213         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
214
215         if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
216                 return;
217
218         if (is_accessed_spte(old_spte) &&
219             (!is_accessed_spte(new_spte) || pfn_changed))
220                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
221 }
222
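/*
 * Dirty-log bookkeeping: when a 4K SPTE becomes writable (or starts mapping
 * a new PFN writably), mark the corresponding GFN dirty in its memslot.
 */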
223 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
224                                           u64 old_spte, u64 new_spte, int level)
225 {
226         bool pfn_changed;
227         struct kvm_memory_slot *slot;
228
229         if (level > PG_LEVEL_4K)
230                 return;
231
232         pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
233
234         if ((!is_writable_pte(old_spte) || pfn_changed) &&
235             is_writable_pte(new_spte)) {
236                 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
237                 mark_page_dirty_in_slot(kvm, slot, gfn);
238         }
239 }
240
241 /**
242  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
243  *
244  * @kvm: kvm instance
245  * @sp: the new page
246  * @shared: This operation may not be running under the exclusive use of
247  *          the MMU lock and the operation must synchronize with other
248  *          threads that might be adding or removing pages.
249  * @account_nx: This page replaces a NX large page and should be marked for
250  *              eventual reclaim.
251  */
252 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
253                               bool shared, bool account_nx)
254 {
255         if (shared)
256                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
257         else
258                 lockdep_assert_held_write(&kvm->mmu_lock);
259
260         list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
261         if (account_nx)
262                 account_huge_nx_page(kvm, sp);
263
264         if (shared)
265                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
266 }
267
268 /**
269  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
270  *
271  * @kvm: kvm instance
272  * @sp: the page to be removed
273  * @shared: This operation may not be running under the exclusive use of
274  *          the MMU lock and the operation must synchronize with other
275  *          threads that might be adding or removing pages.
276  */
277 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
278                                 bool shared)
279 {
280         if (shared)
281                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
282         else
283                 lockdep_assert_held_write(&kvm->mmu_lock);
284
285         list_del(&sp->link);
286         if (sp->lpage_disallowed)
287                 unaccount_huge_nx_page(kvm, sp);
288
289         if (shared)
290                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
291 }
292
293 /**
294  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
295  *
296  * @kvm: kvm instance
297  * @pt: the page removed from the paging structure
298  * @shared: This operation may not be running under the exclusive use
299  *          of the MMU lock and the operation must synchronize with other
300  *          threads that might be modifying SPTEs.
301  *
302  * Given a page table that has been removed from the TDP paging structure,
303  * iterates through the page table to clear SPTEs and free child page tables.
304  */
305 static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
306                                         bool shared)
307 {
308         struct kvm_mmu_page *sp = sptep_to_sp(pt);
309         int level = sp->role.level;
310         gfn_t base_gfn = sp->gfn;
311         u64 old_child_spte;
312         u64 *sptep;
313         gfn_t gfn;
314         int i;
315
316         trace_kvm_mmu_prepare_zap_page(sp);
317
318         tdp_mmu_unlink_page(kvm, sp, shared);
319
320         for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
321                 sptep = pt + i;
322                 gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
323
324                 if (shared) {
325                         /*
326                          * Set the SPTE to a nonpresent value that other
327                          * threads will not overwrite. If the SPTE was
328                          * already marked as removed then another thread
329                          * handling a page fault could overwrite it; keep
330                          * retrying until the SPTE is successfully set from
331                          * some non-removed value to the removed SPTE value.
332                          */
333                         for (;;) {
334                                 old_child_spte = xchg(sptep, REMOVED_SPTE);
335                                 if (!is_removed_spte(old_child_spte))
336                                         break;
337                                 cpu_relax();
338                         }
339                 } else {
340                         old_child_spte = READ_ONCE(*sptep);
341
342                         /*
343                          * Marking the SPTE as a removed SPTE is not
344                          * strictly necessary here as the MMU lock will
345                          * stop other threads from concurrently modifying
346                          * this SPTE. Using the removed SPTE value keeps
347                          * the two branches consistent and simplifies
348                          * the function.
349                          */
350                         WRITE_ONCE(*sptep, REMOVED_SPTE);
351                 }
352                 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
353                                     old_child_spte, REMOVED_SPTE, level - 1,
354                                     shared);
355         }
356
357         kvm_flush_remote_tlbs_with_address(kvm, gfn,
358                                            KVM_PAGES_PER_HPAGE(level));
359
360         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
361 }
362
363 /**
364  * handle_changed_spte - handle bookkeeping associated with an SPTE change
365  * @kvm: kvm instance
366  * @as_id: the address space of the paging structure the SPTE was a part of
367  * @gfn: the base GFN that was mapped by the SPTE
368  * @old_spte: The value of the SPTE before the change
369  * @new_spte: The value of the SPTE after the change
370  * @level: the level of the PT the SPTE is part of in the paging structure
371  * @shared: This operation may not be running under the exclusive use of
372  *          the MMU lock and the operation must synchronize with other
373  *          threads that might be modifying SPTEs.
374  *
375  * Handle bookkeeping that might result from the modification of a SPTE.
376  * This function must be called for all TDP SPTE modifications.
377  */
378 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
379                                   u64 old_spte, u64 new_spte, int level,
380                                   bool shared)
381 {
382         bool was_present = is_shadow_present_pte(old_spte);
383         bool is_present = is_shadow_present_pte(new_spte);
384         bool was_leaf = was_present && is_last_spte(old_spte, level);
385         bool is_leaf = is_present && is_last_spte(new_spte, level);
386         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
387
388         WARN_ON(level > PT64_ROOT_MAX_LEVEL);
389         WARN_ON(level < PG_LEVEL_4K);
390         WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
391
392         /*
393          * If this warning were to trigger it would indicate that there was a
394          * missing MMU notifier or a race with some notifier handler.
395          * A present, leaf SPTE should never be directly replaced with another
396          * present leaf SPTE pointing to a different PFN. A notifier handler
397          * should be zapping the SPTE before the main MM's page table is
398          * changed, or the SPTE should be zeroed, and the TLBs flushed by the
399          * thread before replacement.
400          */
401         if (was_leaf && is_leaf && pfn_changed) {
402                 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
403                        "SPTE with another present leaf SPTE mapping a\n"
404                        "different PFN!\n"
405                        "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
406                        as_id, gfn, old_spte, new_spte, level);
407
408                 /*
409                  * Crash the host to prevent error propagation and guest data
410                  * corruption.
411                  */
412                 BUG();
413         }
414
415         if (old_spte == new_spte)
416                 return;
417
418         trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
419
420         /*
421          * The only time an SPTE should be changed from a non-present to a
422          * non-present state is when an MMIO SPTE is installed, modified, or
423          * removed. In that case, there is nothing to do here.
424          */
425         if (!was_present && !is_present) {
426                 /*
427                  * If this change does not involve a MMIO SPTE or removed SPTE,
428                  * it is unexpected. Log the change, though it should not
429                  * impact the guest since both the former and current SPTEs
430                  * are nonpresent.
431                  */
432                 if (WARN_ON(!is_mmio_spte(old_spte) &&
433                             !is_mmio_spte(new_spte) &&
434                             !is_removed_spte(new_spte)))
435                         pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
436                                "should not be replaced with another,\n"
437                                "different nonpresent SPTE, unless one or both\n"
438                                "are MMIO SPTEs, or the new SPTE is\n"
439                                "a temporary removed SPTE.\n"
440                                "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
441                                as_id, gfn, old_spte, new_spte, level);
442                 return;
443         }
444
445
446         if (was_leaf && is_dirty_spte(old_spte) &&
447             (!is_dirty_spte(new_spte) || pfn_changed))
448                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
449
450         /*
451          * Recursively handle child PTs if the change removed a subtree from
452          * the paging structure.
453          */
454         if (was_present && !was_leaf && (pfn_changed || !is_present))
455                 handle_removed_tdp_mmu_page(kvm,
456                                 spte_to_child_pt(old_spte, level), shared);
457 }
458
459 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
460                                 u64 old_spte, u64 new_spte, int level,
461                                 bool shared)
462 {
463         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
464                               shared);
465         handle_changed_spte_acc_track(old_spte, new_spte, level);
466         handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
467                                       new_spte, level);
468 }
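
/*
 * __handle_changed_spte() is kept separate from this wrapper so that
 * __tdp_mmu_set_spte() can skip the access-tracking and/or dirty-log updates
 * when its record_acc_track/record_dirty_log arguments ask it to.
 */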
469
470 /*
471  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
472  * associated bookkeeping
473  *
474  * @kvm: kvm instance
475  * @iter: a tdp_iter instance currently on the SPTE that should be set
476  * @new_spte: The value the SPTE should be set to
477  * Returns: true if the SPTE was set, false if it was not. If false is returned,
478  *          this function will have no side-effects.
479  */
480 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
481                                            struct tdp_iter *iter,
482                                            u64 new_spte)
483 {
484         u64 *root_pt = tdp_iter_root_pt(iter);
485         struct kvm_mmu_page *root = sptep_to_sp(root_pt);
486         int as_id = kvm_mmu_page_as_id(root);
487
488         lockdep_assert_held_read(&kvm->mmu_lock);
489
490         /*
491          * Do not change removed SPTEs. Only the thread that froze the SPTE
492          * may modify it.
493          */
494         if (iter->old_spte == REMOVED_SPTE)
495                 return false;
496
497         if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
498                       new_spte) != iter->old_spte)
499                 return false;
500
501         handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
502                             iter->level, true);
503
504         return true;
505 }
506
507 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
508                                            struct tdp_iter *iter)
509 {
510         /*
511          * Freeze the SPTE by setting it to a special,
512          * non-present value. This will stop other threads from
513          * immediately installing a present entry in its place
514          * before the TLBs are flushed.
515          */
516         if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
517                 return false;
518
519         kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
520                                            KVM_PAGES_PER_HPAGE(iter->level));
521
522         /*
523          * No other thread can overwrite the removed SPTE as they
524          * must either wait on the MMU lock or use
525                          * tdp_mmu_set_spte_atomic which will not overwrite the
526          * special removed SPTE value. No bookkeeping is needed
527          * here since the SPTE is going from non-present
528          * to non-present.
529          */
530         WRITE_ONCE(*iter->sptep, 0);
531
532         return true;
533 }
534
535
536 /*
537  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
538  * @kvm: kvm instance
539  * @iter: a tdp_iter instance currently on the SPTE that should be set
540  * @new_spte: The value the SPTE should be set to
541  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
542  *                    of the page. Should be set unless handling an MMU
543  *                    notifier for access tracking. Leaving record_acc_track
544  *                    unset in that case prevents page accesses from being
545  *                    double counted.
546  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
547  *                    appropriate for the change being made. Should be set
548  *                    unless performing certain dirty logging operations.
549  *                    Leaving record_dirty_log unset in that case prevents page
550  *                    writes from being double counted.
551  */
552 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
553                                       u64 new_spte, bool record_acc_track,
554                                       bool record_dirty_log)
555 {
556         tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
557         struct kvm_mmu_page *root = sptep_to_sp(root_pt);
558         int as_id = kvm_mmu_page_as_id(root);
559
560         lockdep_assert_held_write(&kvm->mmu_lock);
561
562         /*
563          * No thread should be using this function to set SPTEs to the
564          * temporary removed SPTE value.
565          * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
566          * should be used. If operating under the MMU lock in write mode, the
567          * use of the removed SPTE should not be necessary.
568          */
569         WARN_ON(iter->old_spte == REMOVED_SPTE);
570
571         WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
572
573         __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
574                               iter->level, false);
575         if (record_acc_track)
576                 handle_changed_spte_acc_track(iter->old_spte, new_spte,
577                                               iter->level);
578         if (record_dirty_log)
579                 handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
580                                               iter->old_spte, new_spte,
581                                               iter->level);
582 }
583
584 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
585                                     u64 new_spte)
586 {
587         __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
588 }
589
590 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
591                                                  struct tdp_iter *iter,
592                                                  u64 new_spte)
593 {
594         __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
595 }
596
597 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
598                                                  struct tdp_iter *iter,
599                                                  u64 new_spte)
600 {
601         __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
602 }
603
604 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
605         for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
606
607 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)  \
608         tdp_root_for_each_pte(_iter, _root, _start, _end)               \
609                 if (!is_shadow_present_pte(_iter.old_spte) ||           \
610                     !is_last_spte(_iter.old_spte, _iter.level))         \
611                         continue;                                       \
612                 else
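
/*
 * The dangling "if ... continue; else" above filters out non-present and
 * non-leaf SPTEs while still letting the caller attach a normal loop body
 * after the macro invocation.
 */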
613
614 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
615         for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
616                          _mmu->shadow_root_level, _start, _end)
617
618 /*
619  * Yield if the MMU lock is contended or this thread needs to return control
620  * to the scheduler.
621  *
622  * If this function should yield and flush is set, it will perform a remote
623  * TLB flush before yielding.
624  *
625  * If this function yields, it will also reset the tdp_iter's walk over the
626  * paging structure and the calling function should skip to the next
627  * iteration to allow the iterator to continue its traversal from the
628  * paging structure root.
629  *
630  * Return true if this function yielded and the iterator's traversal was reset.
631  * Return false if a yield was not needed.
632  */
633 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
634                                              struct tdp_iter *iter, bool flush)
635 {
636         /* Ensure forward progress has been made before yielding. */
637         if (iter->next_last_level_gfn == iter->yielded_gfn)
638                 return false;
639
640         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
641                 rcu_read_unlock();
642
643                 if (flush)
644                         kvm_flush_remote_tlbs(kvm);
645
646                 cond_resched_rwlock_write(&kvm->mmu_lock);
647                 rcu_read_lock();
648
649                 WARN_ON(iter->gfn > iter->next_last_level_gfn);
650
651                 tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
652                                iter->root_level, iter->min_level,
653                                iter->next_last_level_gfn);
654
655                 return true;
656         }
657
658         return false;
659 }
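
/*
 * Callers follow the pattern used in zap_gfn_range() below: when this helper
 * yields, the iterator has been reset, so the loop must "continue", and any
 * locally tracked "flush needed" state can be cleared because the flush (if
 * requested) was already performed before yielding.
 */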
660
661 /*
662  * Tears down the mappings for the range of gfns, [start, end), and frees the
663  * non-root pages mapping GFNs strictly within that range. Returns true if
664  * SPTEs have been cleared and a TLB flush is needed before releasing the
665  * MMU lock.
666  * If can_yield is true, will release the MMU lock and reschedule if the
667  * scheduler needs the CPU or there is contention on the MMU lock. If this
668  * function cannot yield, it will not release the MMU lock or reschedule and
669  * the caller must ensure it does not supply too large a GFN range, or the
670  * operation can cause a soft lockup.
671  */
672 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
673                           gfn_t start, gfn_t end, bool can_yield)
674 {
675         struct tdp_iter iter;
676         bool flush_needed = false;
677
678         rcu_read_lock();
679
680         tdp_root_for_each_pte(iter, root, start, end) {
681                 if (can_yield &&
682                     tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
683                         flush_needed = false;
684                         continue;
685                 }
686
687                 if (!is_shadow_present_pte(iter.old_spte))
688                         continue;
689
690                 /*
691                  * If this is a non-last-level SPTE that covers a larger range
692                  * than should be zapped, continue, and zap the mappings at a
693                  * lower level.
694                  */
695                 if ((iter.gfn < start ||
696                      iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
697                     !is_last_spte(iter.old_spte, iter.level))
698                         continue;
699
700                 tdp_mmu_set_spte(kvm, &iter, 0);
701                 flush_needed = true;
702         }
703
704         rcu_read_unlock();
705         return flush_needed;
706 }
707
708 /*
709  * Tears down the mappings for the range of gfns, [start, end), and frees the
710  * non-root pages mapping GFNs strictly within that range. Returns true if
711  * SPTEs have been cleared and a TLB flush is needed before releasing the
712  * MMU lock.
713  */
714 bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
715 {
716         struct kvm_mmu_page *root;
717         bool flush = false;
718
719         for_each_tdp_mmu_root_yield_safe(kvm, root)
720                 flush |= zap_gfn_range(kvm, root, start, end, true);
721
722         return flush;
723 }
724
725 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
726 {
727         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
728         bool flush;
729
730         flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
731         if (flush)
732                 kvm_flush_remote_tlbs(kvm);
733 }
734
735 /*
736  * Installs a last-level SPTE to handle a TDP page fault.
737  * (NPT/EPT violation/misconfiguration)
738  */
739 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
740                                           int map_writable,
741                                           struct tdp_iter *iter,
742                                           kvm_pfn_t pfn, bool prefault)
743 {
744         u64 new_spte;
745         int ret = 0;
746         int make_spte_ret = 0;
747
748         if (unlikely(is_noslot_pfn(pfn)))
749                 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
750         else
751                 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
752                                          pfn, iter->old_spte, prefault, true,
753                                          map_writable, !shadow_accessed_mask,
754                                          &new_spte);
755
756         if (new_spte == iter->old_spte)
757                 ret = RET_PF_SPURIOUS;
758         else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
759                 return RET_PF_RETRY;
760
761         /*
762          * If the page fault was caused by a write but the page is write
763          * protected, emulation is needed. If the emulation was skipped,
764          * the vCPU would have the same fault again.
765          */
766         if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
767                 if (write)
768                         ret = RET_PF_EMULATE;
769                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
770         }
771
772         /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
773         if (unlikely(is_mmio_spte(new_spte))) {
774                 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
775                                      new_spte);
776                 ret = RET_PF_EMULATE;
777         } else
778                 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
779                                        rcu_dereference(iter->sptep));
780
783         if (!prefault)
784                 vcpu->stat.pf_fixed++;
785
786         return ret;
787 }
788
789 /*
790  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
791  * page tables and SPTEs to translate the faulting guest physical address.
792  */
793 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
794                     int map_writable, int max_level, kvm_pfn_t pfn,
795                     bool prefault)
796 {
797         bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
798         bool write = error_code & PFERR_WRITE_MASK;
799         bool exec = error_code & PFERR_FETCH_MASK;
800         bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
801         struct kvm_mmu *mmu = vcpu->arch.mmu;
802         struct tdp_iter iter;
803         struct kvm_mmu_page *sp;
804         u64 *child_pt;
805         u64 new_spte;
806         int ret;
807         gfn_t gfn = gpa >> PAGE_SHIFT;
808         int level;
809         int req_level;
810
811         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
812                 return RET_PF_RETRY;
813         if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
814                 return RET_PF_RETRY;
815
816         level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
817                                         huge_page_disallowed, &req_level);
818
819         trace_kvm_mmu_spte_requested(gpa, level, pfn);
820
821         rcu_read_lock();
822
823         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
824                 if (nx_huge_page_workaround_enabled)
825                         disallowed_hugepage_adjust(iter.old_spte, gfn,
826                                                    iter.level, &pfn, &level);
827
828                 if (iter.level == level)
829                         break;
830
831                 /*
832                  * If there is an SPTE mapping a large page at a higher level
833                  * than the target, that SPTE must be cleared and replaced
834                  * with a non-leaf SPTE.
835                  */
836                 if (is_shadow_present_pte(iter.old_spte) &&
837                     is_large_pte(iter.old_spte)) {
838                         if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
839                                 break;
840
841                         /*
842                          * The iter must explicitly re-read the spte here
843                          * because the new value informs the !present
844                          * path below.
845                          */
846                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
847                 }
848
849                 if (!is_shadow_present_pte(iter.old_spte)) {
850                         sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
851                         child_pt = sp->spt;
852
853                         new_spte = make_nonleaf_spte(child_pt,
854                                                      !shadow_accessed_mask);
855
856                         if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
857                                                     new_spte)) {
858                                 tdp_mmu_link_page(vcpu->kvm, sp, true,
859                                                   huge_page_disallowed &&
860                                                   req_level >= iter.level);
861
862                                 trace_kvm_mmu_get_page(sp, true);
863                         } else {
864                                 tdp_mmu_free_sp(sp);
865                                 break;
866                         }
867                 }
868         }
869
870         if (iter.level != level) {
871                 rcu_read_unlock();
872                 return RET_PF_RETRY;
873         }
874
875         ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
876                                               pfn, prefault);
877         rcu_read_unlock();
878
879         return ret;
880 }
881
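/*
 * Walk every TDP MMU root and invoke @handler on the intersection of the HVA
 * range [start, end) with each memslot, translated into GFNs. The handlers'
 * return values are OR'd together.
 */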
882 static __always_inline int
883 kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
884                              unsigned long start,
885                              unsigned long end,
886                              unsigned long data,
887                              int (*handler)(struct kvm *kvm,
888                                             struct kvm_memory_slot *slot,
889                                             struct kvm_mmu_page *root,
890                                             gfn_t start,
891                                             gfn_t end,
892                                             unsigned long data))
893 {
894         struct kvm_memslots *slots;
895         struct kvm_memory_slot *memslot;
896         struct kvm_mmu_page *root;
897         int ret = 0;
898         int as_id;
899
900         for_each_tdp_mmu_root_yield_safe(kvm, root) {
901                 as_id = kvm_mmu_page_as_id(root);
902                 slots = __kvm_memslots(kvm, as_id);
903                 kvm_for_each_memslot(memslot, slots) {
904                         unsigned long hva_start, hva_end;
905                         gfn_t gfn_start, gfn_end;
906
907                         hva_start = max(start, memslot->userspace_addr);
908                         hva_end = min(end, memslot->userspace_addr +
909                                       (memslot->npages << PAGE_SHIFT));
910                         if (hva_start >= hva_end)
911                                 continue;
912                         /*
913                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
914                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
915                          */
916                         gfn_start = hva_to_gfn_memslot(hva_start, memslot);
917                         gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
918
919                         ret |= handler(kvm, memslot, root, gfn_start,
920                                        gfn_end, data);
921                 }
922         }
923
924         return ret;
925 }
926
927 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
928                                      struct kvm_memory_slot *slot,
929                                      struct kvm_mmu_page *root, gfn_t start,
930                                      gfn_t end, unsigned long unused)
931 {
932         return zap_gfn_range(kvm, root, start, end, false);
933 }
934
935 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
936                               unsigned long end)
937 {
938         return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
939                                             zap_gfn_range_hva_wrapper);
940 }
941
942 /*
943  * Mark the SPTEs mapping GFNs [start, end) unaccessed and return non-zero
944  * if any of the GFNs in the range have been accessed.
945  */
946 static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
947                          struct kvm_mmu_page *root, gfn_t start, gfn_t end,
948                          unsigned long unused)
949 {
950         struct tdp_iter iter;
951         int young = 0;
952         u64 new_spte = 0;
953
954         rcu_read_lock();
955
956         tdp_root_for_each_leaf_pte(iter, root, start, end) {
957                 /*
958                  * If we have a non-accessed entry we don't need to change the
959                  * pte.
960                  */
961                 if (!is_accessed_spte(iter.old_spte))
962                         continue;
963
964                 new_spte = iter.old_spte;
965
966                 if (spte_ad_enabled(new_spte)) {
967                         clear_bit((ffs(shadow_accessed_mask) - 1),
968                                   (unsigned long *)&new_spte);
969                 } else {
970                         /*
971                          * Capture the dirty status of the page, so that it doesn't get
972                          * lost when the SPTE is marked for access tracking.
973                          */
974                         if (is_writable_pte(new_spte))
975                                 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
976
977                         new_spte = mark_spte_for_access_track(new_spte);
978                 }
979                 new_spte &= ~shadow_dirty_mask;
980
981                 tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
982                 young = 1;
983
984                 trace_kvm_age_page(iter.gfn, iter.level, slot, young);
985         }
986
987         rcu_read_unlock();
988
989         return young;
990 }
991
992 int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
993                               unsigned long end)
994 {
995         return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
996                                             age_gfn_range);
997 }
998
999 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
1000                         struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1001                         unsigned long unused2)
1002 {
1003         struct tdp_iter iter;
1004
1005         tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
1006                 if (is_accessed_spte(iter.old_spte))
1007                         return 1;
1008
1009         return 0;
1010 }
1011
1012 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
1013 {
1014         return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
1015                                             test_age_gfn);
1016 }
1017
1018 /*
1019  * Handle the changed_pte MMU notifier for the TDP MMU.
1020  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1021  * notifier.
1022  * Performs any needed TLB flushes itself and always returns 0.
1023  */
1024 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
1025                         struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1026                         unsigned long data)
1027 {
1028         struct tdp_iter iter;
1029         pte_t *ptep = (pte_t *)data;
1030         kvm_pfn_t new_pfn;
1031         u64 new_spte;
1032         int need_flush = 0;
1033
1034         rcu_read_lock();
1035
1036         WARN_ON(pte_huge(*ptep));
1037
1038         new_pfn = pte_pfn(*ptep);
1039
1040         tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
1041                 if (iter.level != PG_LEVEL_4K)
1042                         continue;
1043
1044                 if (!is_shadow_present_pte(iter.old_spte))
1045                         break;
1046
1047                 tdp_mmu_set_spte(kvm, &iter, 0);
1048
1049                 kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
1050
1051                 if (!pte_write(*ptep)) {
1052                         new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1053                                         iter.old_spte, new_pfn);
1054
1055                         tdp_mmu_set_spte(kvm, &iter, new_spte);
1056                 }
1057
1058                 need_flush = 1;
1059         }
1060
1061         if (need_flush)
1062                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1063
1064         rcu_read_unlock();
1065
1066         return 0;
1067 }
1068
1069 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
1070                              pte_t *host_ptep)
1071 {
1072         return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
1073                                             (unsigned long)host_ptep,
1074                                             set_tdp_spte);
1075 }
1076
1077 /*
1078  * Remove write access from all the SPTEs mapping GFNs [start, end). Only
1079  * leaf SPTEs at or above min_level are write-protected.
1080  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1081  */
1082 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1083                              gfn_t start, gfn_t end, int min_level)
1084 {
1085         struct tdp_iter iter;
1086         u64 new_spte;
1087         bool spte_set = false;
1088
1089         rcu_read_lock();
1090
1091         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1092
1093         for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1094                                    min_level, start, end) {
1095                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1096                         continue;
1097
1098                 if (!is_shadow_present_pte(iter.old_spte) ||
1099                     !is_last_spte(iter.old_spte, iter.level) ||
1100                     !(iter.old_spte & PT_WRITABLE_MASK))
1101                         continue;
1102
1103                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1104
1105                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1106                 spte_set = true;
1107         }
1108
1109         rcu_read_unlock();
1110         return spte_set;
1111 }
1112
1113 /*
1114  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1115  * only affect leaf SPTEs down to min_level.
1116  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1117  */
1118 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1119                              int min_level)
1120 {
1121         struct kvm_mmu_page *root;
1122         int root_as_id;
1123         bool spte_set = false;
1124
1125         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1126                 root_as_id = kvm_mmu_page_as_id(root);
1127                 if (root_as_id != slot->as_id)
1128                         continue;
1129
1130                 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1131                              slot->base_gfn + slot->npages, min_level);
1132         }
1133
1134         return spte_set;
1135 }
1136
1137 /*
1138  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1139  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1140  * If AD bits are not enabled, this will require clearing the writable bit on
1141  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1142  * be flushed.
1143  */
1144 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1145                            gfn_t start, gfn_t end)
1146 {
1147         struct tdp_iter iter;
1148         u64 new_spte;
1149         bool spte_set = false;
1150
1151         rcu_read_lock();
1152
1153         tdp_root_for_each_leaf_pte(iter, root, start, end) {
1154                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1155                         continue;
1156
1157                 if (spte_ad_need_write_protect(iter.old_spte)) {
1158                         if (is_writable_pte(iter.old_spte))
1159                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1160                         else
1161                                 continue;
1162                 } else {
1163                         if (iter.old_spte & shadow_dirty_mask)
1164                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1165                         else
1166                                 continue;
1167                 }
1168
1169                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1170                 spte_set = true;
1171         }
1172
1173         rcu_read_unlock();
1174         return spte_set;
1175 }
1176
1177 /*
1178  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1179  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1180  * If AD bits are not enabled, this will require clearing the writable bit on
1181  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1182  * be flushed.
1183  */
1184 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1185 {
1186         struct kvm_mmu_page *root;
1187         int root_as_id;
1188         bool spte_set = false;
1189
1190         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1191                 root_as_id = kvm_mmu_page_as_id(root);
1192                 if (root_as_id != slot->as_id)
1193                         continue;
1194
1195                 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1196                                 slot->base_gfn + slot->npages);
1197         }
1198
1199         return spte_set;
1200 }
1201
1202 /*
1203  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1204  * set in mask, starting at gfn. The given memslot is expected to contain all
1205  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1206  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1207  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1208  */
1209 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1210                                   gfn_t gfn, unsigned long mask, bool wrprot)
1211 {
1212         struct tdp_iter iter;
1213         u64 new_spte;
1214
1215         rcu_read_lock();
1216
1217         tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1218                                     gfn + BITS_PER_LONG) {
1219                 if (!mask)
1220                         break;
1221
1222                 if (iter.level > PG_LEVEL_4K ||
1223                     !(mask & (1UL << (iter.gfn - gfn))))
1224                         continue;
1225
1226                 mask &= ~(1UL << (iter.gfn - gfn));
1227
1228                 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1229                         if (is_writable_pte(iter.old_spte))
1230                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1231                         else
1232                                 continue;
1233                 } else {
1234                         if (iter.old_spte & shadow_dirty_mask)
1235                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1236                         else
1237                                 continue;
1238                 }
1239
1240                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1241         }
1242
1243         rcu_read_unlock();
1244 }
1245
1246 /*
1247  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1248  * set in mask, starting at gfn. The given memslot is expected to contain all
1249  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1250  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1251  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1252  */
1253 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1254                                        struct kvm_memory_slot *slot,
1255                                        gfn_t gfn, unsigned long mask,
1256                                        bool wrprot)
1257 {
1258         struct kvm_mmu_page *root;
1259         int root_as_id;
1260
1261         lockdep_assert_held_write(&kvm->mmu_lock);
1262         for_each_tdp_mmu_root(kvm, root) {
1263                 root_as_id = kvm_mmu_page_as_id(root);
1264                 if (root_as_id != slot->as_id)
1265                         continue;
1266
1267                 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1268         }
1269 }
1270
1271 /*
1272  * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
1273  * only used for PML, and so will involve setting the dirty bit on each SPTE.
1274  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1275  */
1276 static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1277                                 gfn_t start, gfn_t end)
1278 {
1279         struct tdp_iter iter;
1280         u64 new_spte;
1281         bool spte_set = false;
1282
1283         rcu_read_lock();
1284
1285         tdp_root_for_each_pte(iter, root, start, end) {
1286                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1287                         continue;
1288
1289                 if (!is_shadow_present_pte(iter.old_spte) ||
1290                     iter.old_spte & shadow_dirty_mask)
1291                         continue;
1292
1293                 new_spte = iter.old_spte | shadow_dirty_mask;
1294
1295                 tdp_mmu_set_spte(kvm, &iter, new_spte);
1296                 spte_set = true;
1297         }
1298
1299         rcu_read_unlock();
1300         return spte_set;
1301 }
1302
1303 /*
1304  * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
1305  * only used for PML, and so will involve setting the dirty bit on each SPTE.
1306  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1307  */
1308 bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
1309 {
1310         struct kvm_mmu_page *root;
1311         int root_as_id;
1312         bool spte_set = false;
1313
1314         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1315                 root_as_id = kvm_mmu_page_as_id(root);
1316                 if (root_as_id != slot->as_id)
1317                         continue;
1318
1319                 spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
1320                                 slot->base_gfn + slot->npages);
1321         }
1322         return spte_set;
1323 }
1324
1325 /*
1326  * Clear leaf entries which could be replaced by large mappings, for
1327  * GFNs within the slot.
1328  */
1329 static void zap_collapsible_spte_range(struct kvm *kvm,
1330                                        struct kvm_mmu_page *root,
1331                                        gfn_t start, gfn_t end)
1332 {
1333         struct tdp_iter iter;
1334         kvm_pfn_t pfn;
1335         bool spte_set = false;
1336
1337         rcu_read_lock();
1338
1339         tdp_root_for_each_pte(iter, root, start, end) {
1340                 if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
1341                         spte_set = false;
1342                         continue;
1343                 }
1344
1345                 if (!is_shadow_present_pte(iter.old_spte) ||
1346                     !is_last_spte(iter.old_spte, iter.level))
1347                         continue;
1348
1349                 pfn = spte_to_pfn(iter.old_spte);
1350                 if (kvm_is_reserved_pfn(pfn) ||
1351                     !PageTransCompoundMap(pfn_to_page(pfn)))
1352                         continue;
1353
1354                 tdp_mmu_set_spte(kvm, &iter, 0);
1355
1356                 spte_set = true;
1357         }
1358
1359         rcu_read_unlock();
1360         if (spte_set)
1361                 kvm_flush_remote_tlbs(kvm);
1362 }
1363
1364 /*
1365  * Clear leaf entries which could be replaced by large mappings, for
1366  * GFNs within the slot.
1367  */
1368 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1369                                        const struct kvm_memory_slot *slot)
1370 {
1371         struct kvm_mmu_page *root;
1372         int root_as_id;
1373
1374         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1375                 root_as_id = kvm_mmu_page_as_id(root);
1376                 if (root_as_id != slot->as_id)
1377                         continue;
1378
1379                 zap_collapsible_spte_range(kvm, root, slot->base_gfn,
1380                                            slot->base_gfn + slot->npages);
1381         }
1382 }
1383
1384 /*
1385  * Removes write access on the last level SPTE mapping this GFN and unsets the
1386  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1387  * Returns true if an SPTE was set and a TLB flush is needed.
1388  */
1389 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1390                               gfn_t gfn)
1391 {
1392         struct tdp_iter iter;
1393         u64 new_spte;
1394         bool spte_set = false;
1395
1396         rcu_read_lock();
1397
1398         tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1399                 if (!is_writable_pte(iter.old_spte))
1400                         break;
1401
1402                 new_spte = iter.old_spte &
1403                         ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
1404
1405                 tdp_mmu_set_spte(kvm, &iter, new_spte);
1406                 spte_set = true;
1407         }
1408
1409         rcu_read_unlock();
1410
1411         return spte_set;
1412 }
1413
1414 /*
1415  * Removes write access on the last level SPTE mapping this GFN and unsets the
1416  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1417  * Returns true if an SPTE was set and a TLB flush is needed.
1418  */
1419 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1420                                    struct kvm_memory_slot *slot, gfn_t gfn)
1421 {
1422         struct kvm_mmu_page *root;
1423         int root_as_id;
1424         bool spte_set = false;
1425
1426         lockdep_assert_held_write(&kvm->mmu_lock);
1427         for_each_tdp_mmu_root(kvm, root) {
1428                 root_as_id = kvm_mmu_page_as_id(root);
1429                 if (root_as_id != slot->as_id)
1430                         continue;
1431
1432                 spte_set |= write_protect_gfn(kvm, root, gfn);
1433         }
1434         return spte_set;
1435 }
1436
1437 /*
1438  * Return the level of the lowest level SPTE added to sptes.
1439  * That SPTE may be non-present.
1440  */
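/*
 * Note (a reading of the walk below, not a separately documented contract):
 * sptes[] is indexed by paging level, so entries from the returned leaf
 * level up to *root_level are filled in as the walk descends.
 */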
1441 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1442                          int *root_level)
1443 {
1444         struct tdp_iter iter;
1445         struct kvm_mmu *mmu = vcpu->arch.mmu;
1446         gfn_t gfn = addr >> PAGE_SHIFT;
1447         int leaf = -1;
1448
1449         *root_level = vcpu->arch.mmu->shadow_root_level;
1450
1451         rcu_read_lock();
1452
1453         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1454                 leaf = iter.level;
1455                 sptes[leaf] = iter.old_spte;
1456         }
1457
1458         rcu_read_unlock();
1459
1460         return leaf;
1461 }