// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
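
/*
 * Note (added for clarity, not part of the original file): tdp_mmu.c is
 * built into the kvm module, so the knob above is exposed as the
 * "kvm.tdp_mmu" module parameter ("modprobe kvm tdp_mmu=0", or
 * /sys/module/kvm/parameters/tdp_mmu). The value is sampled per VM in
 * kvm_mmu_init_tdp_mmu() below and is not re-read for already running VMs.
 */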

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (kvm_mmu_put_root(kvm, root))
		kvm_tdp_mmu_free_root(kvm, root);
}

static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
					   struct kvm_mmu_page *root)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
		return false;

	kvm_mmu_get_root(kvm, root);
	return true;
}

static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
						     struct kvm_mmu_page *root)
{
	struct kvm_mmu_page *next_root;

	next_root = list_next_entry(root, link);
	tdp_mmu_put_root(kvm, root);
	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,	\
				      typeof(*_root), link);		\
	     tdp_mmu_next_root_valid(_kvm, _root);			\
	     _root = tdp_mmu_next_root(_kvm, _root))

#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
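
/*
 * Illustrative note (not part of the original file): the yield-safe variant
 * above takes and drops a reference on each root, so the walk survives the
 * MMU lock being released mid-iteration, e.g.:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root)
 *		flush |= zap_gfn_range(kvm, root, start, end, true);
 *
 * for_each_tdp_mmu_root() is a plain list walk and must run to completion
 * with the MMU lock held for write.
 */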

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	lockdep_assert_held_write(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn, false);

	free_page((unsigned long)root->spt);
	kmem_cache_free(mmu_page_header_cache, root);
}

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}

static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	write_lock(&kvm->mmu_lock);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			write_unlock(&kvm->mmu_lock);
			return root;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

	write_unlock(&kvm->mmu_lock);

	return root;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *root;

	root = get_tdp_mmu_vcpu_root(vcpu);

	return __pa(root->spt);
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 * @account_nx: This page replaces a NX large page and should be marked for
 *		eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared, bool account_nx)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	u64 old_child_spte;
	u64 *sptep;
	gfn_t gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		sptep = rcu_dereference(pt) + i;
		gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level - 1,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
 * associated bookkeeping
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *	    this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (iter->old_spte == REMOVED_SPTE)
		return false;

	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
		      new_spte) != iter->old_spte)
		return false;

	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    new_spte, iter->level, true);

	return true;
}
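
/*
 * Added note (not part of the original file): callers of
 * tdp_mmu_set_spte_atomic() hold the MMU lock for read and must treat a
 * false return as a lost race with another thread. The usual pattern, as in
 * the page fault path below, is to retry the fault:
 *
 *	if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
 *		return RET_PF_RETRY;
 */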

static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter)
{
	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
		return false;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

	return true;
}

/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(iter->old_spte == REMOVED_SPTE);

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, false);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)	\
		if (!is_shadow_present_pte(_iter.old_spte) ||	\
		    !is_last_spte(_iter.old_spte, _iter.level))	\
			continue;				\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)
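
/*
 * Illustrative usage (not part of the original file): a leaf-only walk over
 * a single root typically looks like:
 *
 *	tdp_root_for_each_leaf_pte(iter, root, start, end) {
 *		if (!is_accessed_spte(iter.old_spte))
 *			continue;
 *		...
 *	}
 *
 * with iter.gfn, iter.level and iter.old_spte describing the current entry.
 */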

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		cond_resched_rwlock_write(&kvm->mmu_lock);
		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		tdp_iter_restart(iter);

		return true;
	}

	return false;
}
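
/*
 * Added note (not part of the original file): callers check this helper at
 * the top of each iteration, clear any pending-flush state when it returns
 * true, and continue so the restarted iterator re-walks from the root, e.g.
 * the pattern used by zap_gfn_range() below:
 *
 *	if (can_yield &&
 *	    tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
 *		flush_needed = false;
 *		continue;
 *	}
 */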

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield)
{
	struct tdp_iter iter;
	bool flush_needed = false;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
			flush_needed = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush_needed = true;
	}

	rcu_read_unlock();
	return flush_needed;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
	struct kvm_mmu_page *root;
	bool flush = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root)
		flush |= zap_gfn_range(kvm, root, start, end, true);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush;

	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = RET_PF_FIXED;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn)))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));

	trace_kvm_mmu_set_spte(iter->level, iter->gfn,
			       rcu_dereference(iter->sptep));

	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}
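
/*
 * Added note (not part of the original file): the RET_PF_* codes used above
 * follow the common KVM MMU convention: RET_PF_RETRY re-runs the fault,
 * RET_PF_FIXED and RET_PF_SPURIOUS report a newly installed or already
 * correct mapping, and RET_PF_EMULATE punts the access to the emulator
 * (write-protected pages and MMIO).
 */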

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			child_pt = sp->spt;

			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
						    new_spte)) {
				tdp_mmu_link_page(vcpu->kvm, sp, true,
						  huge_page_disallowed &&
						  req_level >= iter.level);

				trace_kvm_mmu_get_page(sp, true);
			} else {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);
	rcu_read_unlock();

	return ret;
}

static __always_inline int
kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
			     unsigned long start,
			     unsigned long end,
			     unsigned long data,
			     int (*handler)(struct kvm *kvm,
					    struct kvm_memory_slot *slot,
					    struct kvm_mmu_page *root,
					    gfn_t start,
					    gfn_t end,
					    unsigned long data))
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_page *root;
	int ret = 0;
	int as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		as_id = kvm_mmu_page_as_id(root);
		slots = __kvm_memslots(kvm, as_id);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;
			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			ret |= handler(kvm, memslot, root, gfn_start,
				       gfn_end, data);
		}
	}

	return ret;
}
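
/*
 * Added note (not part of the original file): handlers passed to
 * kvm_tdp_mmu_handle_hva_range() receive the overlap of [start, end) with
 * each memslot, already translated to a GFN range for the root's address
 * space; zap_gfn_range_hva_wrapper() below is the simplest example.
 */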

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     struct kvm_mmu_page *root, gfn_t start,
				     gfn_t end, unsigned long unused)
{
	return zap_gfn_range(kvm, root, start, end, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    zap_gfn_range_hva_wrapper);
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
			 unsigned long unused)
{
	struct tdp_iter iter;
	int young = 0;
	u64 new_spte = 0;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		/*
		 * If we have a non-accessed entry we don't need to change the
		 * pte.
		 */
		if (!is_accessed_spte(iter.old_spte))
			continue;

		new_spte = iter.old_spte;

		if (spte_ad_enabled(new_spte)) {
			clear_bit((ffs(shadow_accessed_mask) - 1),
				  (unsigned long *)&new_spte);
		} else {
			/*
			 * Capture the dirty status of the page, so that it doesn't get
			 * lost when the SPTE is marked for access tracking.
			 */
			if (is_writable_pte(new_spte))
				kvm_set_pfn_dirty(spte_to_pfn(new_spte));

			new_spte = mark_spte_for_access_track(new_spte);
			new_spte &= ~shadow_dirty_mask;
		}

		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
		young = 1;

		trace_kvm_age_page(iter.gfn, iter.level, slot, young);
	}

	rcu_read_unlock();

	return young;
}

int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    age_gfn_range);
}

static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long unused2)
{
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
		if (is_accessed_spte(iter.old_spte))
			return 1;

	return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
					    test_age_gfn);
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
			unsigned long data)
{
	struct tdp_iter iter;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;
	u64 new_spte;
	int need_flush = 0;

	rcu_read_lock();

	WARN_ON(pte_huge(*ptep));

	new_pfn = pte_pfn(*ptep);

	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
		if (iter.level != PG_LEVEL_4K)
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			break;

		tdp_mmu_set_spte(kvm, &iter, 0);

		kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);

		if (!pte_write(*ptep)) {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					iter.old_spte, new_pfn);

			tdp_mmu_set_spte(kvm, &iter, new_spte);
		}

		need_flush = 1;
	}

	if (need_flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

	rcu_read_unlock();

	return 0;
}

int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
			     pte_t *host_ptep)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
					    (unsigned long)host_ptep,
					    set_tdp_spte);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need
 * to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);
	}

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);
	}

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
	}
}
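
/*
 * Added example (not part of the original file): bit i of @mask corresponds
 * to the page at @gfn + i, so a call with gfn = 0x1000 and mask = 0x5 clears
 * the dirty state (or write permission, when @wrprot is true) of the 4k
 * mappings for GFNs 0x1000 and 0x1002 only.
 */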

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
			spte_set = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		spte_set = true;
	}

	rcu_read_unlock();
	if (spte_set)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		zap_collapsible_spte_range(kvm, root, slot);
	}
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= write_protect_gfn(kvm, root, gfn);
	}
	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	rcu_read_unlock();

	return leaf;
}