arch/x86/kvm/mmu/tdp_mmu.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include "mmu.h"
   4 #include "mmu_internal.h"
   5 #include "mmutrace.h"
   6 #include "tdp_iter.h"
   7 #include "tdp_mmu.h"
   8 #include "spte.h"
   9
  10 #include <asm/cmpxchg.h>
  11 #include <trace/events/kvm.h>
  12
  13 static bool __read_mostly tdp_mmu_enabled = false;
  14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
  15
  16 /* Initializes the TDP MMU for the VM, if enabled. */
  17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
  18 {
  19         if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
  20                 return;
  21
  22         /* This should not be changed for the lifetime of the VM. */
  23         kvm->arch.tdp_mmu_enabled = true;
  24
  25         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
  26         spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
  27         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
  28 }
  29
  30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
  31 {
  32         if (!kvm->arch.tdp_mmu_enabled)
  33                 return;
  34
  35         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
  36
  37         /*
  38          * Ensure that all the outstanding RCU callbacks to free shadow pages
  39          * can run before the VM is torn down.
  40          */
  41         rcu_barrier();
  42 }
  43
  44 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
  45 {
  46         if (kvm_mmu_put_root(kvm, root))
  47                 kvm_tdp_mmu_free_root(kvm, root);
  48 }
  49
  50 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
  51                                            struct kvm_mmu_page *root)
  52 {
  53         lockdep_assert_held_write(&kvm->mmu_lock);
  54
  55         if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
  56                 return false;
  57
  58         kvm_mmu_get_root(kvm, root);
  59         return true;
  60
  61 }
  62
  63 static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
  64                                                      struct kvm_mmu_page *root)
  65 {
  66         struct kvm_mmu_page *next_root;
  67
  68         next_root = list_next_entry(root, link);
  69         tdp_mmu_put_root(kvm, root);
  70         return next_root;
  71 }
  72
  73 /*
  74  * Note: this iterator gets and puts references to the roots it iterates over.
  75  * This makes it safe to release the MMU lock and yield within the loop, but
  76  * if exiting the loop early, the caller must drop the reference to the most
  77  * recent root. (Unless keeping a live reference is desirable.)
  78  */
  79 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                           \
  80         for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,        \
  81                                       typeof(*_root), link);            \
  82              tdp_mmu_next_root_valid(_kvm, _root);                      \
  83              _root = tdp_mmu_next_root(_kvm, _root))
  84
  85 #define for_each_tdp_mmu_root(_kvm, _root)                              \
  86         list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
  87
  88 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
  89                           gfn_t start, gfn_t end, bool can_yield, bool flush);
  90
  91 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
  92 {
  93         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
  94
  95         lockdep_assert_held_write(&kvm->mmu_lock);
  96
  97         WARN_ON(root->root_count);
  98         WARN_ON(!root->tdp_mmu_page);
  99
 100         list_del(&root->link);
 101
 102         zap_gfn_range(kvm, root, 0, max_gfn, false, false);
 103
 104         free_page((unsigned long)root->spt);
 105         kmem_cache_free(mmu_page_header_cache, root);
 106 }
 107
 108 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
 109                                                    int level)
 110 {
 111         union kvm_mmu_page_role role;
 112
 113         role = vcpu->arch.mmu->mmu_role.base;
 114         role.level = level;
 115         role.direct = true;
 116         role.gpte_is_8_bytes = true;
 117         role.access = ACC_ALL;
 118
 119         return role;
 120 }
 121
 122 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 123                                                int level)
 124 {
 125         struct kvm_mmu_page *sp;
 126
 127         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
 128         sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
 129         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 130
 131         sp->role.word = page_role_for_level(vcpu, level).word;
 132         sp->gfn = gfn;
 133         sp->tdp_mmu_page = true;
 134
 135         trace_kvm_mmu_get_page(sp, true);
 136
 137         return sp;
 138 }
 139
 140 static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
 141 {
 142         union kvm_mmu_page_role role;
 143         struct kvm *kvm = vcpu->kvm;
 144         struct kvm_mmu_page *root;
 145
 146         role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
 147
 148         write_lock(&kvm->mmu_lock);
 149
 150         /* Check for an existing root before allocating a new one. */
 151         for_each_tdp_mmu_root(kvm, root) {
 152                 if (root->role.word == role.word) {
 153                         kvm_mmu_get_root(kvm, root);
 154                         write_unlock(&kvm->mmu_lock);
 155                         return root;
 156                 }
 157         }
 158
 159         root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
 160         root->root_count = 1;
 161
 162         list_add(&root->link, &kvm->arch.tdp_mmu_roots);
 163
 164         write_unlock(&kvm->mmu_lock);
 165
 166         return root;
 167 }
 168
 169 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 170 {
 171         struct kvm_mmu_page *root;
 172
 173         root = get_tdp_mmu_vcpu_root(vcpu);
 174         if (!root)
 175                 return INVALID_PAGE;
 176
 177         return __pa(root->spt);
 178 }
 179
 180 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 181 {
 182         free_page((unsigned long)sp->spt);
 183         kmem_cache_free(mmu_page_header_cache, sp);
 184 }
 185
 186 /*
 187  * This is called through call_rcu in order to free TDP page table memory
 188  * safely with respect to other kernel threads that may be operating on
 189  * the memory.
 190  * By only accessing TDP MMU page table memory in an RCU read critical
 191  * section, and freeing it after a grace period, lockless access to that
 192  * memory won't use it after it is freed.
 193  */
 194 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
 195 {
 196         struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
 197                                                rcu_head);
 198
 199         tdp_mmu_free_sp(sp);
 200 }
 201
 202 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 203                                 u64 old_spte, u64 new_spte, int level,
 204                                 bool shared);
 205
 206 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
 207 {
 208         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
 209
 210         if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
 211                 return;
 212
 213         if (is_accessed_spte(old_spte) &&
 214             (!is_accessed_spte(new_spte) || pfn_changed))
 215                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 216 }
 217
 218 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
 219                                           u64 old_spte, u64 new_spte, int level)
 220 {
 221         bool pfn_changed;
 222         struct kvm_memory_slot *slot;
 223
 224         if (level > PG_LEVEL_4K)
 225                 return;
 226
 227         pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
 228
 229         if ((!is_writable_pte(old_spte) || pfn_changed) &&
 230             is_writable_pte(new_spte)) {
 231                 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
 232                 mark_page_dirty_in_slot(kvm, slot, gfn);
 233         }
 234 }
 235
 236 /**
 237  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 238  *
 239  * @kvm: kvm instance
 240  * @sp: the new page
 241  * @shared: This operation may not be running under the exclusive use of
 242  *          the MMU lock and the operation must synchronize with other
 243  *          threads that might be adding or removing pages.
 244  * @account_nx: This page replaces a NX large page and should be marked for
 245  *              eventual reclaim.
 246  */
 247 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 248                               bool shared, bool account_nx)
 249 {
 250         if (shared)
 251                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 252         else
 253                 lockdep_assert_held_write(&kvm->mmu_lock);
 254
 255         list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
 256         if (account_nx)
 257                 account_huge_nx_page(kvm, sp);
 258
 259         if (shared)
 260                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 261 }
 262
 263 /**
 264  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 265  *
 266  * @kvm: kvm instance
 267  * @sp: the page to be removed
 268  * @shared: This operation may not be running under the exclusive use of
 269  *          the MMU lock and the operation must synchronize with other
 270  *          threads that might be adding or removing pages.
 271  */
 272 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 273                                 bool shared)
 274 {
 275         if (shared)
 276                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 277         else
 278                 lockdep_assert_held_write(&kvm->mmu_lock);
 279
 280         list_del(&sp->link);
 281         if (sp->lpage_disallowed)
 282                 unaccount_huge_nx_page(kvm, sp);
 283
 284         if (shared)
 285                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 286 }
 287
 288 /**
 289  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 290  *
 291  * @kvm: kvm instance
 292  * @pt: the page removed from the paging structure
 293  * @shared: This operation may not be running under the exclusive use
 294  *          of the MMU lock and the operation must synchronize with other
 295  *          threads that might be modifying SPTEs.
 296  *
 297  * Given a page table that has been removed from the TDP paging structure,
 298  * iterates through the page table to clear SPTEs and free child page tables.
 299  *
 300  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 301  * protection. Since this thread removed it from the paging structure,
 302  * this thread will be responsible for ensuring the page is freed. Hence the
 303  * early rcu_dereferences in the function.
 304  */
 305 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
 306                                         bool shared)
 307 {
 308         struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
 309         int level = sp->role.level;
 310         gfn_t base_gfn = sp->gfn;
 311         u64 old_child_spte;
 312         u64 *sptep;
 313         gfn_t gfn;
 314         int i;
 315
 316         trace_kvm_mmu_prepare_zap_page(sp);
 317
 318         tdp_mmu_unlink_page(kvm, sp, shared);
 319
 320         for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
 321                 sptep = rcu_dereference(pt) + i;
 322                 gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
 323
 324                 if (shared) {
 325                         /*
 326                          * Set the SPTE to a nonpresent value that other
 327                          * threads will not overwrite. If the SPTE was
 328                          * already marked as removed then another thread
 329                          * handling a page fault could overwrite it, so
 330                          * set the SPTE until it is set from some other
 331                          * value to the removed SPTE value.
 332                          */
 333                         for (;;) {
 334                                 old_child_spte = xchg(sptep, REMOVED_SPTE);
 335                                 if (!is_removed_spte(old_child_spte))
 336                                         break;
 337                                 cpu_relax();
 338                         }
 339                 } else {
 340                         /*
 341                          * If the SPTE is not MMU-present, there is no backing
 342                          * page associated with the SPTE and so no side effects
 343                          * that need to be recorded, and exclusive ownership of
 344                          * mmu_lock ensures the SPTE can't be made present.
 345                          * Note, zapping MMIO SPTEs is also unnecessary as they
 346                          * are guarded by the memslots generation, not by being
 347                          * unreachable.
 348                          */
 349                         old_child_spte = READ_ONCE(*sptep);
 350                         if (!is_shadow_present_pte(old_child_spte))
 351                                 continue;
 352
 353                         /*
 354                          * Marking the SPTE as a removed SPTE is not
 355                          * strictly necessary here as the MMU lock will
 356                          * stop other threads from concurrently modifying
 357                          * this SPTE. Using the removed SPTE value keeps
 358                          * the two branches consistent and simplifies
 359                          * the function.
 360                          */
 361                         WRITE_ONCE(*sptep, REMOVED_SPTE);
 362                 }
 363                 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
 364                                     old_child_spte, REMOVED_SPTE, level - 1,
 365                                     shared);
 366         }
 367
 368         kvm_flush_remote_tlbs_with_address(kvm, gfn,
 369                                            KVM_PAGES_PER_HPAGE(level));
 370
 371         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
 372 }
 373
 374 /**
 375  * handle_changed_spte - handle bookkeeping associated with an SPTE change
 376  * @kvm: kvm instance
 377  * @as_id: the address space of the paging structure the SPTE was a part of
 378  * @gfn: the base GFN that was mapped by the SPTE
 379  * @old_spte: The value of the SPTE before the change
 380  * @new_spte: The value of the SPTE after the change
 381  * @level: the level of the PT the SPTE is part of in the paging structure
 382  * @shared: This operation may not be running under the exclusive use of
 383  *          the MMU lock and the operation must synchronize with other
 384  *          threads that might be modifying SPTEs.
 385  *
 386  * Handle bookkeeping that might result from the modification of a SPTE.
 387  * This function must be called for all TDP SPTE modifications.
 388  */
 389 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 390                                   u64 old_spte, u64 new_spte, int level,
 391                                   bool shared)
 392 {
 393         bool was_present = is_shadow_present_pte(old_spte);
 394         bool is_present = is_shadow_present_pte(new_spte);
 395         bool was_leaf = was_present && is_last_spte(old_spte, level);
 396         bool is_leaf = is_present && is_last_spte(new_spte, level);
 397         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
 398
 399         WARN_ON(level > PT64_ROOT_MAX_LEVEL);
 400         WARN_ON(level < PG_LEVEL_4K);
 401         WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
 402
 403         /*
 404          * If this warning were to trigger it would indicate that there was a
 405          * missing MMU notifier or a race with some notifier handler.
 406          * A present, leaf SPTE should never be directly replaced with another
 407          * present leaf SPTE pointing to a different PFN. A notifier handler
 408          * should be zapping the SPTE before the main MM's page table is
 409          * changed, or the SPTE should be zeroed, and the TLBs flushed by the
 410          * thread before replacement.
 411          */
 412         if (was_leaf && is_leaf && pfn_changed) {
 413                 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
 414                        "SPTE with another present leaf SPTE mapping a\n"
 415                        "different PFN!\n"
 416                        "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
 417                        as_id, gfn, old_spte, new_spte, level);
 418
 419                 /*
 420                  * Crash the host to prevent error propagation and guest data
 421                  * corruption.
 422                  */
 423                 BUG();
 424         }
 425
 426         if (old_spte == new_spte)
 427                 return;
 428
 429         trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
 430
 431         /*
 432          * The only times a SPTE should be changed from a non-present to
 433          * non-present state is when an MMIO entry is installed/modified/
 434          * removed. In that case, there is nothing to do here.
 435          */
 436         if (!was_present && !is_present) {
 437                 /*
 438                  * If this change does not involve a MMIO SPTE or removed SPTE,
 439                  * it is unexpected. Log the change, though it should not
 440                  * impact the guest since both the former and current SPTEs
 441                  * are nonpresent.
 442                  */
 443                 if (WARN_ON(!is_mmio_spte(old_spte) &&
 444                             !is_mmio_spte(new_spte) &&
 445                             !is_removed_spte(new_spte)))
 446                         pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
 447                                "should not be replaced with another,\n"
 448                                "different nonpresent SPTE, unless one or both\n"
 449                                "are MMIO SPTEs, or the new SPTE is\n"
 450                                "a temporary removed SPTE.\n"
 451                                "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
 452                                as_id, gfn, old_spte, new_spte, level);
 453                 return;
 454         }
 455
 456
 457         if (was_leaf && is_dirty_spte(old_spte) &&
 458             (!is_dirty_spte(new_spte) || pfn_changed))
 459                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 460
 461         /*
 462          * Recursively handle child PTs if the change removed a subtree from
 463          * the paging structure.
 464          */
 465         if (was_present && !was_leaf && (pfn_changed || !is_present))
 466                 handle_removed_tdp_mmu_page(kvm,
 467                                 spte_to_child_pt(old_spte, level), shared);
 468 }
 469
 470 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 471                                 u64 old_spte, u64 new_spte, int level,
 472                                 bool shared)
 473 {
 474         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
 475                               shared);
 476         handle_changed_spte_acc_track(old_spte, new_spte, level);
 477         handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
 478                                       new_spte, level);
 479 }
 480
 481 /*
 482  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
 483  * associated bookkeeping
 484  *
 485  * @kvm: kvm instance
 486  * @iter: a tdp_iter instance currently on the SPTE that should be set
 487  * @new_spte: The value the SPTE should be set to
 488  * Returns: true if the SPTE was set, false if it was not. If false is returned,
 489  *          this function will have no side-effects.
 490  */
 491 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
 492                                            struct tdp_iter *iter,
 493                                            u64 new_spte)
 494 {
 495         lockdep_assert_held_read(&kvm->mmu_lock);
 496
 497         /*
 498          * Do not change removed SPTEs. Only the thread that froze the SPTE
 499          * may modify it.
 500          */
 501         if (iter->old_spte == REMOVED_SPTE)
 502                 return false;
 503
 504         if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
 505                       new_spte) != iter->old_spte)
 506                 return false;
 507
 508         handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
 509                             new_spte, iter->level, true);
 510
 511         return true;
 512 }
 513
 514 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
 515                                            struct tdp_iter *iter)
 516 {
 517         /*
 518          * Freeze the SPTE by setting it to a special,
 519          * non-present value. This will stop other threads from
 520          * immediately installing a present entry in its place
 521          * before the TLBs are flushed.
 522          */
 523         if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
 524                 return false;
 525
 526         kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
 527                                            KVM_PAGES_PER_HPAGE(iter->level));
 528
 529         /*
 530          * No other thread can overwrite the removed SPTE as they
 531          * must either wait on the MMU lock or use
 532          * tdp_mmu_set_spte_atomic which will not overwrite the
 533          * special removed SPTE value. No bookkeeping is needed
 534          * here since the SPTE is going from non-present
 535          * to non-present.
 536          */
 537         WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
 538
 539         return true;
 540 }
 541
 542
 543 /*
 544  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 545  * @kvm: kvm instance
 546  * @iter: a tdp_iter instance currently on the SPTE that should be set
 547  * @new_spte: The value the SPTE should be set to
 548  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 549  *                    of the page. Should be set unless handling an MMU
 550  *                    notifier for access tracking. Leaving record_acc_track
 551  *                    unset in that case prevents page accesses from being
 552  *                    double counted.
 553  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 554  *                    appropriate for the change being made. Should be set
 555  *                    unless performing certain dirty logging operations.
 556  *                    Leaving record_dirty_log unset in that case prevents page
 557  *                    writes from being double counted.
 558  */
 559 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 560                                       u64 new_spte, bool record_acc_track,
 561                                       bool record_dirty_log)
 562 {
 563         lockdep_assert_held_write(&kvm->mmu_lock);
 564
 565         /*
 566          * No thread should be using this function to set SPTEs to the
 567          * temporary removed SPTE value.
 568          * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
 569          * should be used. If operating under the MMU lock in write mode, the
 570          * use of the removed SPTE should not be necessary.
 571          */
 572         WARN_ON(iter->old_spte == REMOVED_SPTE);
 573
 574         WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 575
 576         __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
 577                               new_spte, iter->level, false);
 578         if (record_acc_track)
 579                 handle_changed_spte_acc_track(iter->old_spte, new_spte,
 580                                               iter->level);
 581         if (record_dirty_log)
 582                 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
 583                                               iter->old_spte, new_spte,
 584                                               iter->level);
 585 }
 586
 587 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 588                                     u64 new_spte)
 589 {
 590         __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
 591 }
 592
 593 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
 594                                                  struct tdp_iter *iter,
 595                                                  u64 new_spte)
 596 {
 597         __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
 598 }
 599
 600 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
 601                                                  struct tdp_iter *iter,
 602                                                  u64 new_spte)
 603 {
 604         __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
 605 }
 606
 607 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
 608         for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
 609
 610 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)  \
 611         tdp_root_for_each_pte(_iter, _root, _start, _end)               \
 612                 if (!is_shadow_present_pte(_iter.old_spte) ||           \
 613                     !is_last_spte(_iter.old_spte, _iter.level))         \
 614                         continue;                                       \
 615                 else
 616
 617 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
 618         for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
 619                          _mmu->shadow_root_level, _start, _end)
 620
 621 /*
 622  * Yield if the MMU lock is contended or this thread needs to return control
 623  * to the scheduler.
 624  *
 625  * If this function should yield and flush is set, it will perform a remote
 626  * TLB flush before yielding.
 627  *
 628  * If this function yields, it will also reset the tdp_iter's walk over the
 629  * paging structure and the calling function should skip to the next
 630  * iteration to allow the iterator to continue its traversal from the
 631  * paging structure root.
 632  *
 633  * Return true if this function yielded and the iterator's traversal was reset.
 634  * Return false if a yield was not needed.
 635  */
 636 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
 637                                              struct tdp_iter *iter, bool flush)
 638 {
 639         /* Ensure forward progress has been made before yielding. */
 640         if (iter->next_last_level_gfn == iter->yielded_gfn)
 641                 return false;
 642
 643         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
 644                 rcu_read_unlock();
 645
 646                 if (flush)
 647                         kvm_flush_remote_tlbs(kvm);
 648
 649                 cond_resched_rwlock_write(&kvm->mmu_lock);
 650                 rcu_read_lock();
 651
 652                 WARN_ON(iter->gfn > iter->next_last_level_gfn);
 653
 654                 tdp_iter_restart(iter);
 655
 656                 return true;
 657         }
 658
 659         return false;
 660 }
 661
 662 /*
 663  * Tears down the mappings for the range of gfns, [start, end), and frees the
 664  * non-root pages mapping GFNs strictly within that range. Returns true if
 665  * SPTEs have been cleared and a TLB flush is needed before releasing the
 666  * MMU lock.
 667  * If can_yield is true, will release the MMU lock and reschedule if the
 668  * scheduler needs the CPU or there is contention on the MMU lock. If this
 669  * function cannot yield, it will not release the MMU lock or reschedule and
 670  * the caller must ensure it does not supply too large a GFN range, or the
 671  * operation can cause a soft lockup.  Note, in some use cases a flush may be
 672  * required by prior actions.  Ensure the pending flush is performed prior to
 673  * yielding.
 674  */
 675 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 676                           gfn_t start, gfn_t end, bool can_yield, bool flush)
 677 {
 678         struct tdp_iter iter;
 679
 680         rcu_read_lock();
 681
 682         tdp_root_for_each_pte(iter, root, start, end) {
 683                 if (can_yield &&
 684                     tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
 685                         flush = false;
 686                         continue;
 687                 }
 688
 689                 if (!is_shadow_present_pte(iter.old_spte))
 690                         continue;
 691
 692                 /*
 693                  * If this is a non-last-level SPTE that covers a larger range
 694                  * than should be zapped, continue, and zap the mappings at a
 695                  * lower level.
 696                  */
 697                 if ((iter.gfn < start ||
 698                      iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
 699                     !is_last_spte(iter.old_spte, iter.level))
 700                         continue;
 701
 702                 tdp_mmu_set_spte(kvm, &iter, 0);
 703                 flush = true;
 704         }
 705
 706         rcu_read_unlock();
 707         return flush;
 708 }
 709
 710 /*
 711  * Tears down the mappings for the range of gfns, [start, end), and frees the
 712  * non-root pages mapping GFNs strictly within that range. Returns true if
 713  * SPTEs have been cleared and a TLB flush is needed before releasing the
 714  * MMU lock.
 715  */
 716 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
 717                                  bool can_yield)
 718 {
 719         struct kvm_mmu_page *root;
 720         bool flush = false;
 721
 722         for_each_tdp_mmu_root_yield_safe(kvm, root)
 723                 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
 724
 725         return flush;
 726 }
 727
 728 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
 729 {
 730         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
 731         bool flush;
 732
 733         flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
 734         if (flush)
 735                 kvm_flush_remote_tlbs(kvm);
 736 }
 737
 738 /*
 739  * Installs a last-level SPTE to handle a TDP page fault.
 740  * (NPT/EPT violation/misconfiguration)
 741  */
 742 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
 743                                           int map_writable,
 744                                           struct tdp_iter *iter,
 745                                           kvm_pfn_t pfn, bool prefault)
 746 {
 747         u64 new_spte;
 748         int ret = 0;
 749         int make_spte_ret = 0;
 750
 751         if (unlikely(is_noslot_pfn(pfn)))
 752                 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
 753         else
 754                 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
 755                                          pfn, iter->old_spte, prefault, true,
 756                                          map_writable, !shadow_accessed_mask,
 757                                          &new_spte);
 758
 759         if (new_spte == iter->old_spte)
 760                 ret = RET_PF_SPURIOUS;
 761         else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
 762                 return RET_PF_RETRY;
 763
 764         /*
 765          * If the page fault was caused by a write but the page is write
 766          * protected, emulation is needed. If the emulation was skipped,
 767          * the vCPU would have the same fault again.
 768          */
 769         if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
 770                 if (write)
 771                         ret = RET_PF_EMULATE;
 772                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 773         }
 774
 775         /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
 776         if (unlikely(is_mmio_spte(new_spte))) {
 777                 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
 778                                      new_spte);
 779                 ret = RET_PF_EMULATE;
 780         } else
 781                 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
 782                                        rcu_dereference(iter->sptep));
 783
 784         trace_kvm_mmu_set_spte(iter->level, iter->gfn,
 785                                rcu_dereference(iter->sptep));
 786         if (!prefault)
 787                 vcpu->stat.pf_fixed++;
 788
 789         return ret;
 790 }
 791
 792 /*
 793  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 794  * page tables and SPTEs to translate the faulting guest physical address.
 795  */
 796 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 797                     int map_writable, int max_level, kvm_pfn_t pfn,
 798                     bool prefault)
 799 {
 800         bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
 801         bool write = error_code & PFERR_WRITE_MASK;
 802         bool exec = error_code & PFERR_FETCH_MASK;
 803         bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
 804         struct kvm_mmu *mmu = vcpu->arch.mmu;
 805         struct tdp_iter iter;
 806         struct kvm_mmu_page *sp;
 807         u64 *child_pt;
 808         u64 new_spte;
 809         int ret;
 810         gfn_t gfn = gpa >> PAGE_SHIFT;
 811         int level;
 812         int req_level;
 813
 814         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
 815                 return RET_PF_RETRY;
 816         if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
 817                 return RET_PF_RETRY;
 818
 819         level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
 820                                         huge_page_disallowed, &req_level);
 821
 822         trace_kvm_mmu_spte_requested(gpa, level, pfn);
 823
 824         rcu_read_lock();
 825
 826         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
 827                 if (nx_huge_page_workaround_enabled)
 828                         disallowed_hugepage_adjust(iter.old_spte, gfn,
 829                                                    iter.level, &pfn, &level);
 830
 831                 if (iter.level == level)
 832                         break;
 833
 834                 /*
 835                  * If there is an SPTE mapping a large page at a higher level
 836                  * than the target, that SPTE must be cleared and replaced
 837                  * with a non-leaf SPTE.
 838                  */
 839                 if (is_shadow_present_pte(iter.old_spte) &&
 840                     is_large_pte(iter.old_spte)) {
 841                         if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
 842                                 break;
 843
 844                         /*
 845                          * The iter must explicitly re-read the spte here
 846                          * because the new value informs the !present
 847                          * path below.
 848                          */
 849                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
 850                 }
 851
 852                 if (!is_shadow_present_pte(iter.old_spte)) {
 853                         sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
 854                         child_pt = sp->spt;
 855
 856                         new_spte = make_nonleaf_spte(child_pt,
 857                                                      !shadow_accessed_mask);
 858
 859                         if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
 860                                                     new_spte)) {
 861                                 tdp_mmu_link_page(vcpu->kvm, sp, true,
 862                                                   huge_page_disallowed &&
 863                                                   req_level >= iter.level);
 864
 865                                 trace_kvm_mmu_get_page(sp, true);
 866                         } else {
 867                                 tdp_mmu_free_sp(sp);
 868                                 break;
 869                         }
 870                 }
 871         }
 872
 873         if (iter.level != level) {
 874                 rcu_read_unlock();
 875                 return RET_PF_RETRY;
 876         }
 877
 878         ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
 879                                               pfn, prefault);
 880         rcu_read_unlock();
 881
 882         return ret;
 883 }
 884
 885 static __always_inline int
 886 kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
 887                              unsigned long start,
 888                              unsigned long end,
 889                              unsigned long data,
 890                              int (*handler)(struct kvm *kvm,
 891                                             struct kvm_memory_slot *slot,
 892                                             struct kvm_mmu_page *root,
 893                                             gfn_t start,
 894                                             gfn_t end,
 895                                             unsigned long data))
 896 {
 897         struct kvm_memslots *slots;
 898         struct kvm_memory_slot *memslot;
 899         struct kvm_mmu_page *root;
 900         int ret = 0;
 901         int as_id;
 902
 903         for_each_tdp_mmu_root_yield_safe(kvm, root) {
 904                 as_id = kvm_mmu_page_as_id(root);
 905                 slots = __kvm_memslots(kvm, as_id);
 906                 kvm_for_each_memslot(memslot, slots) {
 907                         unsigned long hva_start, hva_end;
 908                         gfn_t gfn_start, gfn_end;
 909
 910                         hva_start = max(start, memslot->userspace_addr);
 911                         hva_end = min(end, memslot->userspace_addr +
 912                                       (memslot->npages << PAGE_SHIFT));
 913                         if (hva_start >= hva_end)
 914                                 continue;
 915                         /*
 916                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
 917                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
 918                          */
 919                         gfn_start = hva_to_gfn_memslot(hva_start, memslot);
 920                         gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 921
 922                         ret |= handler(kvm, memslot, root, gfn_start,
 923                                        gfn_end, data);
 924                 }
 925         }
 926
 927         return ret;
 928 }
 929
 930 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
 931                                      struct kvm_memory_slot *slot,
 932                                      struct kvm_mmu_page *root, gfn_t start,
 933                                      gfn_t end, unsigned long unused)
 934 {
 935         return zap_gfn_range(kvm, root, start, end, false, false);
 936 }
 937
 938 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
 939                               unsigned long end)
 940 {
 941         return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
 942                                             zap_gfn_range_hva_wrapper);
 943 }
 944
 945 /*
 946  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 947  * if any of the GFNs in the range have been accessed.
 948  */
 949 static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
 950                          struct kvm_mmu_page *root, gfn_t start, gfn_t end,
 951                          unsigned long unused)
 952 {
 953         struct tdp_iter iter;
 954         int young = 0;
 955         u64 new_spte = 0;
 956
 957         rcu_read_lock();
 958
 959         tdp_root_for_each_leaf_pte(iter, root, start, end) {
 960                 /*
 961                  * If we have a non-accessed entry we don't need to change the
 962                  * pte.
 963                  */
 964                 if (!is_accessed_spte(iter.old_spte))
 965                         continue;
 966
 967                 new_spte = iter.old_spte;
 968
 969                 if (spte_ad_enabled(new_spte)) {
 970                         clear_bit((ffs(shadow_accessed_mask) - 1),
 971                                   (unsigned long *)&new_spte);
 972                 } else {
 973                         /*
 974                          * Capture the dirty status of the page, so that it doesn't get
 975                          * lost when the SPTE is marked for access tracking.
 976                          */
 977                         if (is_writable_pte(new_spte))
 978                                 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
 979
 980                         new_spte = mark_spte_for_access_track(new_spte);
 981                 }
 982                 new_spte &= ~shadow_dirty_mask;
 983
 984                 tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
 985                 young = 1;
 986
 987                 trace_kvm_age_page(iter.gfn, iter.level, slot, young);
 988         }
 989
 990         rcu_read_unlock();
 991
 992         return young;
 993 }
 994
 995 int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
 996                               unsigned long end)
 997 {
 998         return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
 999                                             age_gfn_range);
1000 }
1001
1002 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
1003                         struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1004                         unsigned long unused2)
1005 {
1006         struct tdp_iter iter;
1007
1008         tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
1009                 if (is_accessed_spte(iter.old_spte))
1010                         return 1;
1011
1012         return 0;
1013 }
1014
1015 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
1016 {
1017         return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
1018                                             test_age_gfn);
1019 }
1020
1021 /*
1022  * Handle the changed_pte MMU notifier for the TDP MMU.
1023  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1024  * notifier.
1025  * Returns non-zero if a flush is needed before releasing the MMU lock.
1026  */
1027 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
1028                         struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1029                         unsigned long data)
1030 {
1031         struct tdp_iter iter;
1032         pte_t *ptep = (pte_t *)data;
1033         kvm_pfn_t new_pfn;
1034         u64 new_spte;
1035         int need_flush = 0;
1036
1037         rcu_read_lock();
1038
1039         WARN_ON(pte_huge(*ptep));
1040
1041         new_pfn = pte_pfn(*ptep);
1042
1043         tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
1044                 if (iter.level != PG_LEVEL_4K)
1045                         continue;
1046
1047                 if (!is_shadow_present_pte(iter.old_spte))
1048                         break;
1049
1050                 tdp_mmu_set_spte(kvm, &iter, 0);
1051
1052                 kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
1053
1054                 if (!pte_write(*ptep)) {
1055                         new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1056                                         iter.old_spte, new_pfn);
1057
1058                         tdp_mmu_set_spte(kvm, &iter, new_spte);
1059                 }
1060
1061                 need_flush = 1;
1062         }
1063
1064         if (need_flush)
1065                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1066
1067         rcu_read_unlock();
1068
1069         return 0;
1070 }
1071
1072 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
1073                              pte_t *host_ptep)
1074 {
1075         return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
1076                                             (unsigned long)host_ptep,
1077                                             set_tdp_spte);
1078 }
1079
1080 /*
1081  * Remove write access from all the SPTEs mapping GFNs [start, end). If
1082  * skip_4k is set, SPTEs that map 4k pages, will not be write-protected.
1083  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1084  */
1085 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1086                              gfn_t start, gfn_t end, int min_level)
1087 {
1088         struct tdp_iter iter;
1089         u64 new_spte;
1090         bool spte_set = false;
1091
1092         rcu_read_lock();
1093
1094         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1095
1096         for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1097                                    min_level, start, end) {
1098                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1099                         continue;
1100
1101                 if (!is_shadow_present_pte(iter.old_spte) ||
1102                     !is_last_spte(iter.old_spte, iter.level) ||
1103                     !(iter.old_spte & PT_WRITABLE_MASK))
1104                         continue;
1105
1106                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1107
1108                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1109                 spte_set = true;
1110         }
1111
1112         rcu_read_unlock();
1113         return spte_set;
1114 }
1115
1116 /*
1117  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1118  * only affect leaf SPTEs down to min_level.
1119  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1120  */
1121 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1122                              int min_level)
1123 {
1124         struct kvm_mmu_page *root;
1125         int root_as_id;
1126         bool spte_set = false;
1127
1128         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1129                 root_as_id = kvm_mmu_page_as_id(root);
1130                 if (root_as_id != slot->as_id)
1131                         continue;
1132
1133                 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1134                              slot->base_gfn + slot->npages, min_level);
1135         }
1136
1137         return spte_set;
1138 }
1139
1140 /*
1141  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1142  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1143  * If AD bits are not enabled, this will require clearing the writable bit on
1144  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1145  * be flushed.
1146  */
1147 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1148                            gfn_t start, gfn_t end)
1149 {
1150         struct tdp_iter iter;
1151         u64 new_spte;
1152         bool spte_set = false;
1153
1154         rcu_read_lock();
1155
1156         tdp_root_for_each_leaf_pte(iter, root, start, end) {
1157                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1158                         continue;
1159
1160                 if (spte_ad_need_write_protect(iter.old_spte)) {
1161                         if (is_writable_pte(iter.old_spte))
1162                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1163                         else
1164                                 continue;
1165                 } else {
1166                         if (iter.old_spte & shadow_dirty_mask)
1167                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1168                         else
1169                                 continue;
1170                 }
1171
1172                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1173                 spte_set = true;
1174         }
1175
1176         rcu_read_unlock();
1177         return spte_set;
1178 }
1179
1180 /*
1181  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1182  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1183  * If AD bits are not enabled, this will require clearing the writable bit on
1184  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1185  * be flushed.
1186  */
1187 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1188 {
1189         struct kvm_mmu_page *root;
1190         int root_as_id;
1191         bool spte_set = false;
1192
1193         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1194                 root_as_id = kvm_mmu_page_as_id(root);
1195                 if (root_as_id != slot->as_id)
1196                         continue;
1197
1198                 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1199                                 slot->base_gfn + slot->npages);
1200         }
1201
1202         return spte_set;
1203 }
1204
1205 /*
1206  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1207  * set in mask, starting at gfn. The given memslot is expected to contain all
1208  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1209  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1210  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1211  */
1212 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1213                                   gfn_t gfn, unsigned long mask, bool wrprot)
1214 {
1215         struct tdp_iter iter;
1216         u64 new_spte;
1217
1218         rcu_read_lock();
1219
1220         tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1221                                     gfn + BITS_PER_LONG) {
1222                 if (!mask)
1223                         break;
1224
1225                 if (iter.level > PG_LEVEL_4K ||
1226                     !(mask & (1UL << (iter.gfn - gfn))))
1227                         continue;
1228
1229                 mask &= ~(1UL << (iter.gfn - gfn));
1230
1231                 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1232                         if (is_writable_pte(iter.old_spte))
1233                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1234                         else
1235                                 continue;
1236                 } else {
1237                         if (iter.old_spte & shadow_dirty_mask)
1238                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1239                         else
1240                                 continue;
1241                 }
1242
1243                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1244         }
1245
1246         rcu_read_unlock();
1247 }
1248
1249 /*
1250  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1251  * set in mask, starting at gfn. The given memslot is expected to contain all
1252  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1253  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1254  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1255  */
1256 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1257                                        struct kvm_memory_slot *slot,
1258                                        gfn_t gfn, unsigned long mask,
1259                                        bool wrprot)
1260 {
1261         struct kvm_mmu_page *root;
1262         int root_as_id;
1263
1264         lockdep_assert_held_write(&kvm->mmu_lock);
1265         for_each_tdp_mmu_root(kvm, root) {
1266                 root_as_id = kvm_mmu_page_as_id(root);
1267                 if (root_as_id != slot->as_id)
1268                         continue;
1269
1270                 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1271         }
1272 }
1273
1274 /*
1275  * Clear leaf entries which could be replaced by large mappings, for
1276  * GFNs within the slot.
1277  */
1278 static void zap_collapsible_spte_range(struct kvm *kvm,
1279                                        struct kvm_mmu_page *root,
1280                                        struct kvm_memory_slot *slot)
1281 {
1282         gfn_t start = slot->base_gfn;
1283         gfn_t end = start + slot->npages;
1284         struct tdp_iter iter;
1285         kvm_pfn_t pfn;
1286         bool spte_set = false;
1287
1288         rcu_read_lock();
1289
1290         tdp_root_for_each_pte(iter, root, start, end) {
1291                 if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
1292                         spte_set = false;
1293                         continue;
1294                 }
1295
1296                 if (!is_shadow_present_pte(iter.old_spte) ||
1297                     !is_last_spte(iter.old_spte, iter.level))
1298                         continue;
1299
1300                 pfn = spte_to_pfn(iter.old_spte);
1301                 if (kvm_is_reserved_pfn(pfn) ||
1302                     iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1303                                                             pfn, PG_LEVEL_NUM))
1304                         continue;
1305
1306                 tdp_mmu_set_spte(kvm, &iter, 0);
1307
1308                 spte_set = true;
1309         }
1310
1311         rcu_read_unlock();
1312         if (spte_set)
1313                 kvm_flush_remote_tlbs(kvm);
1314 }
1315
1316 /*
1317  * Clear non-leaf entries (and free associated page tables) which could
1318  * be replaced by large mappings, for GFNs within the slot.
1319  */
1320 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1321                                        struct kvm_memory_slot *slot)
1322 {
1323         struct kvm_mmu_page *root;
1324         int root_as_id;
1325
1326         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1327                 root_as_id = kvm_mmu_page_as_id(root);
1328                 if (root_as_id != slot->as_id)
1329                         continue;
1330
1331                 zap_collapsible_spte_range(kvm, root, slot);
1332         }
1333 }
1334
1335 /*
1336  * Removes write access on the last level SPTE mapping this GFN and unsets the
1337  * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
1338  * Returns true if an SPTE was set and a TLB flush is needed.
1339  */
1340 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1341                               gfn_t gfn)
1342 {
1343         struct tdp_iter iter;
1344         u64 new_spte;
1345         bool spte_set = false;
1346
1347         rcu_read_lock();
1348
1349         tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1350                 if (!is_writable_pte(iter.old_spte))
1351                         break;
1352
1353                 new_spte = iter.old_spte &
1354                         ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
1355
1356                 tdp_mmu_set_spte(kvm, &iter, new_spte);
1357                 spte_set = true;
1358         }
1359
1360         rcu_read_unlock();
1361
1362         return spte_set;
1363 }
1364
1365 /*
1366  * Removes write access on the last level SPTE mapping this GFN and unsets the
1367  * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
1368  * Returns true if an SPTE was set and a TLB flush is needed.
1369  */
1370 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1371                                    struct kvm_memory_slot *slot, gfn_t gfn)
1372 {
1373         struct kvm_mmu_page *root;
1374         int root_as_id;
1375         bool spte_set = false;
1376
1377         lockdep_assert_held_write(&kvm->mmu_lock);
1378         for_each_tdp_mmu_root(kvm, root) {
1379                 root_as_id = kvm_mmu_page_as_id(root);
1380                 if (root_as_id != slot->as_id)
1381                         continue;
1382
1383                 spte_set |= write_protect_gfn(kvm, root, gfn);
1384         }
1385         return spte_set;
1386 }
1387
1388 /*
1389  * Return the level of the lowest level SPTE added to sptes.
1390  * That SPTE may be non-present.
1391  */
1392 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1393                          int *root_level)
1394 {
1395         struct tdp_iter iter;
1396         struct kvm_mmu *mmu = vcpu->arch.mmu;
1397         gfn_t gfn = addr >> PAGE_SHIFT;
1398         int leaf = -1;
1399
1400         *root_level = vcpu->arch.mmu->shadow_root_level;
1401
1402         rcu_read_lock();
1403
1404         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1405                 leaf = iter.level;
1406                 sptes[leaf] = iter.old_spte;
1407         }
1408
1409         rcu_read_unlock();
1410
1411         return leaf;
1412 }