1 // SPDX-License-Identifier: GPL-2.0
4 #include "mmu_internal.h"
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
19 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 /* This should not be changed for the lifetime of the VM. */
23 kvm->arch.tdp_mmu_enabled = true;
25 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
32 if (!kvm->arch.tdp_mmu_enabled)
35 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
38 * Ensure that all the outstanding RCU callbacks to free shadow pages
39 * can run before the VM is torn down.
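/* Drop a reference to @root and free it when the last reference is put. */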
44 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
46 if (kvm_mmu_put_root(kvm, root))
47 kvm_tdp_mmu_free_root(kvm, root);
50 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
51 struct kvm_mmu_page *root)
53 lockdep_assert_held_write(&kvm->mmu_lock);
55 if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
58 kvm_mmu_get_root(kvm, root);
63 static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
64 struct kvm_mmu_page *root)
66 struct kvm_mmu_page *next_root;
68 next_root = list_next_entry(root, link);
69 tdp_mmu_put_root(kvm, root);
74 * Note: this iterator gets and puts references to the roots it iterates over.
75 * This makes it safe to release the MMU lock and yield within the loop, but
76 * if exiting the loop early, the caller must drop the reference to the most
77 * recent root. (Unless keeping a live reference is desirable.)
79 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
80 for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \
81 typeof(*_root), link); \
82 tdp_mmu_next_root_valid(_kvm, _root); \
83 _root = tdp_mmu_next_root(_kvm, _root)) \
84 if (kvm_mmu_page_as_id(_root) != _as_id) { \
87 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
88 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
89 if (kvm_mmu_page_as_id(_root) != _as_id) { \
92 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
93 gfn_t start, gfn_t end, bool can_yield, bool flush);
95 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
97 gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
99 lockdep_assert_held_write(&kvm->mmu_lock);
101 WARN_ON(root->root_count);
102 WARN_ON(!root->tdp_mmu_page);
104 list_del(&root->link);
106 zap_gfn_range(kvm, root, 0, max_gfn, false, false);
108 free_page((unsigned long)root->spt);
109 kmem_cache_free(mmu_page_header_cache, root);
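/* Build the kvm_mmu_page role for a TDP MMU page table at the given level. */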
112 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
115 union kvm_mmu_page_role role;
117 role = vcpu->arch.mmu->mmu_role.base;
120 role.gpte_is_8_bytes = true;
121 role.access = ACC_ALL;
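/*
 * Allocate a new page table page and its struct kvm_mmu_page from the vCPU's
 * MMU memory caches and initialize its role for the given gfn and level.
 */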
126 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
129 struct kvm_mmu_page *sp;
131 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
132 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
133 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
135 sp->role.word = page_role_for_level(vcpu, level).word;
137 sp->tdp_mmu_page = true;
139 trace_kvm_mmu_get_page(sp, true);
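/*
 * Get the physical address of a TDP MMU root for the vCPU, reusing an
 * existing root with a matching role when possible and allocating a new one
 * otherwise.
 */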
144 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
146 union kvm_mmu_page_role role;
147 struct kvm *kvm = vcpu->kvm;
148 struct kvm_mmu_page *root;
150 lockdep_assert_held_write(&kvm->mmu_lock);
152 role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
154 /* Check for an existing root before allocating a new one. */
155 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
156 if (root->role.word == role.word) {
157 kvm_mmu_get_root(kvm, root);
162 root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
163 root->root_count = 1;
165 list_add(&root->link, &kvm->arch.tdp_mmu_roots);
168 return __pa(root->spt);
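/* Free the memory backing a TDP MMU shadow page and its struct kvm_mmu_page. */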
171 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
173 free_page((unsigned long)sp->spt);
174 kmem_cache_free(mmu_page_header_cache, sp);
178 * This is called through call_rcu in order to free TDP page table memory
179 * safely with respect to other kernel threads that may be operating on the memory.
181 * By only accessing TDP MMU page table memory in an RCU read critical
182 * section, and freeing it after a grace period, lockless access to that
183 * memory won't use it after it is freed.
185 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
187 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
193 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
194 u64 old_spte, u64 new_spte, int level,
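/*
 * Propagate accessed-bit information to the primary MM: if a leaf SPTE loses
 * its accessed state (or changes PFN), mark the old PFN as accessed.
 */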
197 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
199 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
202 if (is_accessed_spte(old_spte) &&
203 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
204 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
205 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
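/*
 * Update the dirty bitmap when a 4K SPTE becomes writable, since the guest
 * may now dirty the page without taking another fault.
 */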
208 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
209 u64 old_spte, u64 new_spte, int level)
212 struct kvm_memory_slot *slot;
214 if (level > PG_LEVEL_4K)
217 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
219 if ((!is_writable_pte(old_spte) || pfn_changed) &&
220 is_writable_pte(new_spte)) {
221 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
222 mark_page_dirty_in_slot(kvm, slot, gfn);
227 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
231 * @shared: This operation may not be running under the exclusive use of
232 * the MMU lock and the operation must synchronize with other
233 * threads that might be adding or removing pages.
234 * @account_nx: This page replaces an NX large page and should be marked for eventual reclaim.
237 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
238 bool shared, bool account_nx)
241 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
243 lockdep_assert_held_write(&kvm->mmu_lock);
245 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
247 account_huge_nx_page(kvm, sp);
250 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
254 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
257 * @sp: the page to be removed
258 * @shared: This operation may not be running under the exclusive use of
259 * the MMU lock and the operation must synchronize with other
260 * threads that might be adding or removing pages.
262 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
266 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
268 lockdep_assert_held_write(&kvm->mmu_lock);
271 if (sp->lpage_disallowed)
272 unaccount_huge_nx_page(kvm, sp);
275 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
279 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
282 * @pt: the page removed from the paging structure
283 * @shared: This operation may not be running under the exclusive use
284 * of the MMU lock and the operation must synchronize with other
285 * threads that might be modifying SPTEs.
287 * Given a page table that has been removed from the TDP paging structure,
288 * iterates through the page table to clear SPTEs and free child page tables.
290 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
291 * protection. Since this thread removed it from the paging structure,
292 * this thread will be responsible for ensuring the page is freed. Hence the
293 * early rcu_dereferences in the function.
295 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
298 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
299 int level = sp->role.level;
300 gfn_t base_gfn = sp->gfn;
306 trace_kvm_mmu_prepare_zap_page(sp);
308 tdp_mmu_unlink_page(kvm, sp, shared);
310 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
311 sptep = rcu_dereference(pt) + i;
312 gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
316 * Set the SPTE to a nonpresent value that other
317 * threads will not overwrite. If the SPTE was
318 * already marked as removed then another thread
319 * handling a page fault could overwrite it, so
320 * set the SPTE until it is set from some other
321 * value to the removed SPTE value.
324 old_child_spte = xchg(sptep, REMOVED_SPTE);
325 if (!is_removed_spte(old_child_spte))
331 * If the SPTE is not MMU-present, there is no backing
332 * page associated with the SPTE and so no side effects
333 * that need to be recorded, and exclusive ownership of
334 * mmu_lock ensures the SPTE can't be made present.
335 * Note, zapping MMIO SPTEs is also unnecessary as they
336 * are guarded by the memslots generation, not by being unreachable.
339 old_child_spte = READ_ONCE(*sptep);
340 if (!is_shadow_present_pte(old_child_spte))
344 * Marking the SPTE as a removed SPTE is not
345 * strictly necessary here as the MMU lock will
346 * stop other threads from concurrently modifying
347 * this SPTE. Using the removed SPTE value keeps
348 * the two branches consistent and simplifies the function.
351 WRITE_ONCE(*sptep, REMOVED_SPTE);
353 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
354 old_child_spte, REMOVED_SPTE, level - 1,
358 kvm_flush_remote_tlbs_with_address(kvm, gfn,
359 KVM_PAGES_PER_HPAGE(level));
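/*
 * Free the page table only after an RCU grace period so that lockless
 * walkers still holding a pointer to it do not use freed memory.
 */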
361 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
365 * handle_changed_spte - handle bookkeeping associated with an SPTE change
367 * @as_id: the address space of the paging structure the SPTE was a part of
368 * @gfn: the base GFN that was mapped by the SPTE
369 * @old_spte: The value of the SPTE before the change
370 * @new_spte: The value of the SPTE after the change
371 * @level: the level of the PT the SPTE is part of in the paging structure
372 * @shared: This operation may not be running under the exclusive use of
373 * the MMU lock and the operation must synchronize with other
374 * threads that might be modifying SPTEs.
376 * Handle bookkeeping that might result from the modification of a SPTE.
377 * This function must be called for all TDP SPTE modifications.
379 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
380 u64 old_spte, u64 new_spte, int level,
383 bool was_present = is_shadow_present_pte(old_spte);
384 bool is_present = is_shadow_present_pte(new_spte);
385 bool was_leaf = was_present && is_last_spte(old_spte, level);
386 bool is_leaf = is_present && is_last_spte(new_spte, level);
387 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
389 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
390 WARN_ON(level < PG_LEVEL_4K);
391 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
394 * If this warning were to trigger it would indicate that there was a
395 * missing MMU notifier or a race with some notifier handler.
396 * A present, leaf SPTE should never be directly replaced with another
397 * present leaf SPTE pointing to a different PFN. A notifier handler
398 * should be zapping the SPTE before the main MM's page table is
399 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
400 * thread before replacement.
402 if (was_leaf && is_leaf && pfn_changed) {
403 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
404 "SPTE with another present leaf SPTE mapping a different PFN!\n"
406 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
407 as_id, gfn, old_spte, new_spte, level);
410 * Crash the host to prevent error propagation and guest data corruption.
416 if (old_spte == new_spte)
419 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
422 * The only time an SPTE should be changed from a non-present to
423 * non-present state is when an MMIO entry is installed/modified/
424 * removed. In that case, there is nothing to do here.
426 if (!was_present && !is_present) {
428 * If this change does not involve a MMIO SPTE or removed SPTE,
429 * it is unexpected. Log the change, though it should not
430 * impact the guest since both the former and current SPTEs
433 if (WARN_ON(!is_mmio_spte(old_spte) &&
434 !is_mmio_spte(new_spte) &&
435 !is_removed_spte(new_spte)))
436 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
437 "should not be replaced with another,\n"
438 "different nonpresent SPTE, unless one or both\n"
439 "are MMIO SPTEs, or the new SPTE is\n"
440 "a temporary removed SPTE.\n"
441 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
442 as_id, gfn, old_spte, new_spte, level);
447 if (was_leaf && is_dirty_spte(old_spte) &&
448 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
449 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
452 * Recursively handle child PTs if the change removed a subtree from
453 * the paging structure.
455 if (was_present && !was_leaf && (pfn_changed || !is_present))
456 handle_removed_tdp_mmu_page(kvm,
457 spte_to_child_pt(old_spte, level), shared);
460 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
461 u64 old_spte, u64 new_spte, int level,
464 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
466 handle_changed_spte_acc_track(old_spte, new_spte, level);
467 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
472 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
473 * associated bookkeeping
476 * @iter: a tdp_iter instance currently on the SPTE that should be set
477 * @new_spte: The value the SPTE should be set to
478 * Returns: true if the SPTE was set, false if it was not. If false is returned,
479 * this function will have no side-effects.
481 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
482 struct tdp_iter *iter,
485 lockdep_assert_held_read(&kvm->mmu_lock);
488 * Do not change removed SPTEs. Only the thread that froze the SPTE may modify it.
491 if (is_removed_spte(iter->old_spte))
494 if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
495 new_spte) != iter->old_spte)
498 handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
499 new_spte, iter->level, true);
504 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
505 struct tdp_iter *iter)
508 * Freeze the SPTE by setting it to a special,
509 * non-present value. This will stop other threads from
510 * immediately installing a present entry in its place
511 * before the TLBs are flushed.
513 if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
516 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
517 KVM_PAGES_PER_HPAGE(iter->level));
520 * No other thread can overwrite the removed SPTE as they
521 * must either wait on the MMU lock or use
522 * tdp_mmu_set_spte_atomic which will not overwrite the
523 * special removed SPTE value. No bookkeeping is needed
524 * here since the SPTE is going from non-present to non-present.
527 WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
534 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
536 * @iter: a tdp_iter instance currently on the SPTE that should be set
537 * @new_spte: The value the SPTE should be set to
538 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
539 * of the page. Should be set unless handling an MMU
540 * notifier for access tracking. Leaving record_acc_track
541 * unset in that case prevents page accesses from being double counted.
543 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
544 * appropriate for the change being made. Should be set
545 * unless performing certain dirty logging operations.
546 * Leaving record_dirty_log unset in that case prevents page
547 * writes from being double counted.
549 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
550 u64 new_spte, bool record_acc_track,
551 bool record_dirty_log)
553 lockdep_assert_held_write(&kvm->mmu_lock);
556 * No thread should be using this function to set SPTEs to the
557 * temporary removed SPTE value.
558 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
559 * should be used. If operating under the MMU lock in write mode, the
560 * use of the removed SPTE should not be necessary.
562 WARN_ON(is_removed_spte(iter->old_spte));
564 WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
566 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
567 new_spte, iter->level, false);
568 if (record_acc_track)
569 handle_changed_spte_acc_track(iter->old_spte, new_spte,
571 if (record_dirty_log)
572 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
573 iter->old_spte, new_spte,
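/*
 * Wrappers around __tdp_mmu_set_spte() that differ only in whether accessed
 * state and/or dirty logging bookkeeping is recorded for the change.
 */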
577 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
580 __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
583 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
584 struct tdp_iter *iter,
587 __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
590 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
591 struct tdp_iter *iter,
594 __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
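/*
 * Iteration helpers: walk the SPTEs of a root over a GFN range, optionally
 * restricted to present leaf SPTEs, or walk the current vCPU's TDP paging
 * structure starting from its root.
 */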
597 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
598 for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
600 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
601 tdp_root_for_each_pte(_iter, _root, _start, _end) \
602 if (!is_shadow_present_pte(_iter.old_spte) || \
603 !is_last_spte(_iter.old_spte, _iter.level)) \
607 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
608 for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \
609 _mmu->shadow_root_level, _start, _end)
612 * Yield if the MMU lock is contended or this thread needs to return control to the scheduler.
615 * If this function should yield and flush is set, it will perform a remote
616 * TLB flush before yielding.
618 * If this function yields, it will also reset the tdp_iter's walk over the
619 * paging structure and the calling function should skip to the next
620 * iteration to allow the iterator to continue its traversal from the
621 * paging structure root.
623 * Return true if this function yielded and the iterator's traversal was reset.
624 * Return false if a yield was not needed.
626 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
627 struct tdp_iter *iter, bool flush)
629 /* Ensure forward progress has been made before yielding. */
630 if (iter->next_last_level_gfn == iter->yielded_gfn)
633 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
637 kvm_flush_remote_tlbs(kvm);
639 cond_resched_rwlock_write(&kvm->mmu_lock);
642 WARN_ON(iter->gfn > iter->next_last_level_gfn);
644 tdp_iter_restart(iter);
653 * Tears down the mappings for the range of gfns, [start, end), and frees the
654 * non-root pages mapping GFNs strictly within that range. Returns true if
655 * SPTEs have been cleared and a TLB flush is needed before releasing the MMU lock.
657 * If can_yield is true, will release the MMU lock and reschedule if the
658 * scheduler needs the CPU or there is contention on the MMU lock. If this
659 * function cannot yield, it will not release the MMU lock or reschedule and
660 * the caller must ensure it does not supply too large a GFN range, or the
661 * operation can cause a soft lockup. Note, in some use cases a flush may be
662 * required by prior actions. Ensure the pending flush is performed prior to yielding.
665 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
666 gfn_t start, gfn_t end, bool can_yield, bool flush)
668 struct tdp_iter iter;
672 tdp_root_for_each_pte(iter, root, start, end) {
674 tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
679 if (!is_shadow_present_pte(iter.old_spte))
683 * If this is a non-last-level SPTE that covers a larger range
684 * than should be zapped, continue, and zap the mappings at a lower level.
687 if ((iter.gfn < start ||
688 iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
689 !is_last_spte(iter.old_spte, iter.level))
692 tdp_mmu_set_spte(kvm, &iter, 0);
701 * Tears down the mappings for the range of gfns, [start, end), and frees the
702 * non-root pages mapping GFNs strictly within that range. Returns true if
703 * SPTEs have been cleared and a TLB flush is needed before releasing the MMU lock.
706 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
707 gfn_t end, bool can_yield, bool flush)
709 struct kvm_mmu_page *root;
711 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
712 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
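/* Zap all SPTEs in every address space, flushing TLBs if anything was zapped. */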
717 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
719 gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
723 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
724 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, flush);
727 kvm_flush_remote_tlbs(kvm);
731 * Installs a last-level SPTE to handle a TDP page fault.
732 * (NPT/EPT violation/misconfiguration)
734 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
736 struct tdp_iter *iter,
737 kvm_pfn_t pfn, bool prefault)
741 int make_spte_ret = 0;
743 if (unlikely(is_noslot_pfn(pfn)))
744 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
746 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
747 pfn, iter->old_spte, prefault, true,
748 map_writable, !shadow_accessed_mask,
751 if (new_spte == iter->old_spte)
752 ret = RET_PF_SPURIOUS;
753 else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
757 * If the page fault was caused by a write but the page is write
758 * protected, emulation is needed. If the emulation was skipped,
759 * the vCPU would have the same fault again.
761 if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
763 ret = RET_PF_EMULATE;
764 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
767 /* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
768 if (unlikely(is_mmio_spte(new_spte))) {
769 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
771 ret = RET_PF_EMULATE;
773 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
774 rcu_dereference(iter->sptep));
778 vcpu->stat.pf_fixed++;
784 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
785 * page tables and SPTEs to translate the faulting guest physical address.
787 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
788 int map_writable, int max_level, kvm_pfn_t pfn,
791 bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
792 bool write = error_code & PFERR_WRITE_MASK;
793 bool exec = error_code & PFERR_FETCH_MASK;
794 bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
795 struct kvm_mmu *mmu = vcpu->arch.mmu;
796 struct tdp_iter iter;
797 struct kvm_mmu_page *sp;
801 gfn_t gfn = gpa >> PAGE_SHIFT;
805 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
807 if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
810 level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
811 huge_page_disallowed, &req_level);
813 trace_kvm_mmu_spte_requested(gpa, level, pfn);
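/*
 * Walk down to the target level, replacing large SPTEs and installing
 * non-leaf SPTEs (and their page table pages) as needed along the way.
 */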
817 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
818 if (nx_huge_page_workaround_enabled)
819 disallowed_hugepage_adjust(iter.old_spte, gfn,
820 iter.level, &pfn, &level);
822 if (iter.level == level)
826 * If there is an SPTE mapping a large page at a higher level
827 * than the target, that SPTE must be cleared and replaced
828 * with a non-leaf SPTE.
830 if (is_shadow_present_pte(iter.old_spte) &&
831 is_large_pte(iter.old_spte)) {
832 if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
836 * The iter must explicitly re-read the spte here
837 * because the new value informs the !present path below.
840 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
843 if (!is_shadow_present_pte(iter.old_spte)) {
844 sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
847 new_spte = make_nonleaf_spte(child_pt,
848 !shadow_accessed_mask);
850 if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
852 tdp_mmu_link_page(vcpu->kvm, sp, true,
853 huge_page_disallowed &&
854 req_level >= iter.level);
856 trace_kvm_mmu_get_page(sp, true);
864 if (iter.level != level) {
869 ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
876 typedef int (*tdp_handler_t)(struct kvm *kvm, struct kvm_memory_slot *slot,
877 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
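/*
 * Generic MMU notifier helper: for each address space, each TDP MMU root in
 * that address space, and each memslot overlapping the HVA range, call
 * @handler on the corresponding GFN range and OR the return values together.
 */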
880 static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
884 tdp_handler_t handler)
886 struct kvm_memslots *slots;
887 struct kvm_memory_slot *memslot;
888 struct kvm_mmu_page *root;
892 for (as_id = 0; as_id < KVM_ADDRESS_SPACE_NUM; as_id++) {
893 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) {
894 slots = __kvm_memslots(kvm, as_id);
895 kvm_for_each_memslot(memslot, slots) {
896 unsigned long hva_start, hva_end;
897 gfn_t gfn_start, gfn_end;
899 hva_start = max(start, memslot->userspace_addr);
900 hva_end = min(end, memslot->userspace_addr +
901 (memslot->npages << PAGE_SHIFT));
902 if (hva_start >= hva_end)
905 * {gfn(page) | page intersects with [hva_start, hva_end)} =
906 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
908 gfn_start = hva_to_gfn_memslot(hva_start, memslot);
909 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
911 ret |= handler(kvm, memslot, root, gfn_start,
920 static __always_inline int kvm_tdp_mmu_handle_hva(struct kvm *kvm,
923 tdp_handler_t handler)
925 return kvm_tdp_mmu_handle_hva_range(kvm, addr, addr + 1, data, handler);
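/* Handler used by kvm_tdp_mmu_zap_hva_range(); zaps the GFN range without yielding. */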
928 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
929 struct kvm_memory_slot *slot,
930 struct kvm_mmu_page *root, gfn_t start,
931 gfn_t end, unsigned long unused)
933 return zap_gfn_range(kvm, root, start, end, false, false);
936 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
939 return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
940 zap_gfn_range_hva_wrapper);
944 * Mark the SPTEs mapping the range of GFNs [start, end) as unaccessed and return non-zero
945 * if any of the GFNs in the range have been accessed.
947 static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
948 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
949 unsigned long unused)
951 struct tdp_iter iter;
957 tdp_root_for_each_leaf_pte(iter, root, start, end) {
959 * If we have a non-accessed entry we don't need to change the PTE.
962 if (!is_accessed_spte(iter.old_spte))
965 new_spte = iter.old_spte;
967 if (spte_ad_enabled(new_spte)) {
968 clear_bit((ffs(shadow_accessed_mask) - 1),
969 (unsigned long *)&new_spte);
972 * Capture the dirty status of the page, so that it doesn't get
973 * lost when the SPTE is marked for access tracking.
975 if (is_writable_pte(new_spte))
976 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
978 new_spte = mark_spte_for_access_track(new_spte);
981 tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
990 int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
993 return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
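/*
 * Check whether any SPTE mapping the GFN has its accessed bit set, without
 * clearing it.
 */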
997 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
998 struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
999 unsigned long unused)
1001 struct tdp_iter iter;
1003 tdp_root_for_each_leaf_pte(iter, root, gfn, end)
1004 if (is_accessed_spte(iter.old_spte))
1010 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
1012 return kvm_tdp_mmu_handle_hva(kvm, hva, 0, test_age_gfn);
1016 * Handle the changed_pte MMU notifier for the TDP MMU.
1017 * data is a pointer to the new pte_t mapping the HVA specified by the MMU notifier.
1019 * Returns non-zero if a flush is needed before releasing the MMU lock.
1021 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
1022 struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
1025 struct tdp_iter iter;
1026 pte_t *ptep = (pte_t *)data;
1033 WARN_ON(pte_huge(*ptep) || (gfn + 1) != end);
1035 new_pfn = pte_pfn(*ptep);
1037 tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1038 if (iter.level != PG_LEVEL_4K)
1041 if (!is_shadow_present_pte(iter.old_spte))
1045 * Note, when changing a read-only SPTE, it's not strictly
1046 * necessary to zero the SPTE before setting the new PFN, but
1047 * doing so preserves the invariant that the PFN of a present
1048 * leaf SPTE can never change. See __handle_changed_spte().
1050 tdp_mmu_set_spte(kvm, &iter, 0);
1052 if (!pte_write(*ptep)) {
1053 new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1054 iter.old_spte, new_pfn);
1056 tdp_mmu_set_spte(kvm, &iter, new_spte);
1063 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1070 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
1073 return kvm_tdp_mmu_handle_hva(kvm, address, (unsigned long)host_ptep,
1078 * Remove write access from all the SPTEs mapping GFNs [start, end). Only leaf
1079 * SPTEs at or above min_level are write-protected (e.g. 4k mappings are skipped when min_level > PG_LEVEL_4K).
1080 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1082 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1083 gfn_t start, gfn_t end, int min_level)
1085 struct tdp_iter iter;
1087 bool spte_set = false;
1091 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1093 for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1094 min_level, start, end) {
1095 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1098 if (!is_shadow_present_pte(iter.old_spte) ||
1099 !is_last_spte(iter.old_spte, iter.level) ||
1100 !(iter.old_spte & PT_WRITABLE_MASK))
1103 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1105 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1114 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1115 * only affect leaf SPTEs down to min_level.
1116 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1118 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1121 struct kvm_mmu_page *root;
1122 bool spte_set = false;
1124 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1125 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1126 slot->base_gfn + slot->npages, min_level);
1132 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1133 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1134 * If AD bits are not enabled, this will require clearing the writable bit on
1135 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1138 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1139 gfn_t start, gfn_t end)
1141 struct tdp_iter iter;
1143 bool spte_set = false;
1147 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1148 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1151 if (spte_ad_need_write_protect(iter.old_spte)) {
1152 if (is_writable_pte(iter.old_spte))
1153 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1157 if (iter.old_spte & shadow_dirty_mask)
1158 new_spte = iter.old_spte & ~shadow_dirty_mask;
1163 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1172 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1173 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1174 * If AD bits are not enabled, this will require clearing the writable bit on
1175 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1178 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1180 struct kvm_mmu_page *root;
1181 bool spte_set = false;
1183 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1184 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1185 slot->base_gfn + slot->npages);
1191 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1192 * set in mask, starting at gfn. The given memslot is expected to contain all
1193 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1194 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1195 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1197 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1198 gfn_t gfn, unsigned long mask, bool wrprot)
1200 struct tdp_iter iter;
1205 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1206 gfn + BITS_PER_LONG) {
1210 if (iter.level > PG_LEVEL_4K ||
1211 !(mask & (1UL << (iter.gfn - gfn))))
1214 mask &= ~(1UL << (iter.gfn - gfn));
1216 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1217 if (is_writable_pte(iter.old_spte))
1218 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1222 if (iter.old_spte & shadow_dirty_mask)
1223 new_spte = iter.old_spte & ~shadow_dirty_mask;
1228 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1235 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1236 * set in mask, starting at gfn. The given memslot is expected to contain all
1237 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1238 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1239 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1241 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1242 struct kvm_memory_slot *slot,
1243 gfn_t gfn, unsigned long mask,
1246 struct kvm_mmu_page *root;
1248 lockdep_assert_held_write(&kvm->mmu_lock);
1249 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1250 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1254 * Clear leaf entries which could be replaced by large mappings, for
1255 * GFNs within the slot.
1257 static bool zap_collapsible_spte_range(struct kvm *kvm,
1258 struct kvm_mmu_page *root,
1259 struct kvm_memory_slot *slot,
1262 gfn_t start = slot->base_gfn;
1263 gfn_t end = start + slot->npages;
1264 struct tdp_iter iter;
1269 tdp_root_for_each_pte(iter, root, start, end) {
1270 if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
1275 if (!is_shadow_present_pte(iter.old_spte) ||
1276 !is_last_spte(iter.old_spte, iter.level))
1279 pfn = spte_to_pfn(iter.old_spte);
1280 if (kvm_is_reserved_pfn(pfn) ||
1281 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1285 tdp_mmu_set_spte(kvm, &iter, 0);
1296 * Clear leaf entries which could be replaced by large mappings, for GFNs
1297 * within the slot.
1299 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1300 struct kvm_memory_slot *slot, bool flush)
1302 struct kvm_mmu_page *root;
1304 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1305 flush = zap_collapsible_spte_range(kvm, root, slot, flush);
1311 * Removes write access on the last level SPTE mapping this GFN and unsets the
1312 * MMU-writable bit to ensure future writes continue to be intercepted.
1313 * Returns true if an SPTE was set and a TLB flush is needed.
1315 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1318 struct tdp_iter iter;
1320 bool spte_set = false;
1324 tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1325 if (!is_writable_pte(iter.old_spte))
1328 new_spte = iter.old_spte &
1329 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1331 tdp_mmu_set_spte(kvm, &iter, new_spte);
1341 * Removes write access on the last level SPTE mapping this GFN and unsets the
1342 * MMU-writable bit to ensure future writes continue to be intercepted.
1343 * Returns true if an SPTE was set and a TLB flush is needed.
1345 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1346 struct kvm_memory_slot *slot, gfn_t gfn)
1348 struct kvm_mmu_page *root;
1349 bool spte_set = false;
1351 lockdep_assert_held_write(&kvm->mmu_lock);
1352 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1353 spte_set |= write_protect_gfn(kvm, root, gfn);
1359 * Return the level of the lowest level SPTE added to sptes.
1360 * That SPTE may be non-present.
1362 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1365 struct tdp_iter iter;
1366 struct kvm_mmu *mmu = vcpu->arch.mmu;
1367 gfn_t gfn = addr >> PAGE_SHIFT;
1370 *root_level = vcpu->arch.mmu->shadow_root_level;
1374 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1376 sptes[leaf] = iter.old_spte;