// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmu_lock.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
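/*
 * Note: a VM actually uses the TDP MMU only when this module parameter is
 * set and the hardware supports TDP; see kvm_mmu_init_tdp_mmu() below.
 */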
/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}
static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (kvm_mmu_put_root(kvm, root))
		kvm_tdp_mmu_free_root(kvm, root);
}
static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
					   struct kvm_mmu_page *root)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
		return false;

	kvm_mmu_get_root(kvm, root);
	return true;
}
static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
						     struct kvm_mmu_page *root)
{
	struct kvm_mmu_page *next_root;

	next_root = list_next_entry(root, link);
	tdp_mmu_put_root(kvm, root);
	return next_root;
}
/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,	\
				      typeof(*_root), link);		\
	     tdp_mmu_next_root_valid(_kvm, _root);			\
	     _root = tdp_mmu_next_root(_kvm, _root))
#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
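/*
 * Example usage (illustrative): a typical walk over all roots with the
 * yield-safe iterator looks like
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root) {
 *		if (kvm_mmu_page_as_id(root) != as_id)
 *			continue;
 *		flush = zap_gfn_range(kvm, root, start, end, true, flush);
 *	}
 *
 * Because the iterator holds a reference across each iteration, the loop
 * body may safely drop and reacquire the MMU lock.
 */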
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush);
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	lockdep_assert_held_write(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn, false, false);

	free_page((unsigned long)root->spt);
	kmem_cache_free(mmu_page_header_cache, root);
}
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}
static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			goto out;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

out:
	return __pa(root->spt);
}
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}
/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);
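/*
 * Propagate accessed-bit state to the primary MM: if a present leaf SPTE
 * that was accessed is being zapped, loses its accessed bit, or changes
 * PFN, mark the backing page as accessed.
 */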
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
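/*
 * Propagate dirty logging: if a 4K SPTE becomes writable (or becomes
 * writable while changing PFN), mark the corresponding GFN dirty in its
 * memslot's dirty bitmap.
 */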
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}
/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 * @account_nx: This page replaces a NX large page and should be marked for
 *		eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared, bool account_nx)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	u64 old_child_spte;
	u64 *sptep;
	gfn_t gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		sptep = rcu_dereference(pt) + i;
		gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level - 1,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;
	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}
	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}
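/*
 * Wrapper around __handle_changed_spte() that also performs the accessed
 * and dirty bookkeeping for the change.
 */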
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}
/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
 * associated bookkeeping
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *	    this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (is_removed_spte(iter->old_spte))
		return false;

	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
		      new_spte) != iter->old_spte)
		return false;

	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    new_spte, iter->level, true);

	return true;
}
static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter)
{
	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
		return false;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

	return true;
}
/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(iter->old_spte));

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, false);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}
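/*
 * Iterators over the SPTEs below a root: tdp_root_for_each_pte() visits
 * every SPTE in the range, the _leaf_ variant visits only present leaf
 * SPTEs, and tdp_mmu_for_each_pte() walks the paging structure currently
 * loaded on the vCPU.
 */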
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)
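/*
 * Example (illustrative): because the leaf variant expands to a for loop
 * followed by an if/else filter, the loop body binds to the else branch
 * and runs only for present leaf SPTEs, e.g.:
 *
 *	tdp_root_for_each_leaf_pte(iter, root, gfn, end)
 *		if (is_accessed_spte(iter.old_spte))
 *			young = 1;
 */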
/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		cond_resched_rwlock_write(&kvm->mmu_lock);
		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		tdp_iter_restart(iter);

		return true;
	}

	return false;
}
/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup. Note, in some use cases a flush may be
 * required by prior actions. Ensure the pending flush is performed prior to
 * yielding.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();
	return flush;
}
/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
				 gfn_t end, bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		if (kvm_mmu_page_as_id(root) != as_id)
			continue;

		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
	}

	return flush;
}
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, flush);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}
/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = RET_PF_FIXED;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn)))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}
/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);

	rcu_read_lock();
	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			child_pt = sp->spt;

			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
						    new_spte)) {
				tdp_mmu_link_page(vcpu->kvm, sp, true,
						  huge_page_disallowed &&
						  req_level >= iter.level);

				trace_kvm_mmu_get_page(sp, true);
			} else {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);
	rcu_read_unlock();

	return ret;
}
typedef int (*tdp_handler_t)(struct kvm *kvm, struct kvm_memory_slot *slot,
			     struct kvm_mmu_page *root, gfn_t start, gfn_t end,
			     unsigned long data);
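/*
 * MMU notifier operations are funneled through kvm_tdp_mmu_handle_hva_range()
 * below as tdp_handler_t callbacks; for example, zap_gfn_range_hva_wrapper()
 * further down adapts zap_gfn_range() to this signature.
 */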
static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
							unsigned long start,
							unsigned long end,
							unsigned long data,
							tdp_handler_t handler)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_page *root;
	int ret = 0;
	int as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		as_id = kvm_mmu_page_as_id(root);
		slots = __kvm_memslots(kvm, as_id);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;
			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			ret |= handler(kvm, memslot, root, gfn_start,
				       gfn_end, data);
		}
	}

	return ret;
}
static __always_inline int kvm_tdp_mmu_handle_hva(struct kvm *kvm,
						  unsigned long addr,
						  unsigned long data,
						  tdp_handler_t handler)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, addr, addr + 1, data, handler);
}
static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     struct kvm_mmu_page *root, gfn_t start,
				     gfn_t end, unsigned long unused)
{
	return zap_gfn_range(kvm, root, start, end, false, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    zap_gfn_range_hva_wrapper);
}
/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
			 unsigned long unused)
{
	struct tdp_iter iter;
	int young = 0;
	u64 new_spte = 0;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		/*
		 * If we have a non-accessed entry we don't need to change the
		 * pte.
		 */
		if (!is_accessed_spte(iter.old_spte))
			continue;

		new_spte = iter.old_spte;

		if (spte_ad_enabled(new_spte)) {
			clear_bit((ffs(shadow_accessed_mask) - 1),
				  (unsigned long *)&new_spte);
		} else {
			/*
			 * Capture the dirty status of the page, so that it
			 * doesn't get lost when the SPTE is marked for access
			 * tracking.
			 */
			if (is_writable_pte(new_spte))
				kvm_set_pfn_dirty(spte_to_pfn(new_spte));

			new_spte = mark_spte_for_access_track(new_spte);
			new_spte &= ~shadow_dirty_mask;
		}

		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
		young = 1;

		trace_kvm_age_page(iter.gfn, iter.level, slot, young);
	}

	rcu_read_unlock();

	return young;
}
int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    age_gfn_range);
}
static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
			unsigned long unused)
{
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, root, gfn, end)
		if (is_accessed_spte(iter.old_spte))
			return 1;

	return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_tdp_mmu_handle_hva(kvm, hva, 0, test_age_gfn);
}
/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
			unsigned long data)
{
	struct tdp_iter iter;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;
	u64 new_spte;
	int need_flush = 0;

	rcu_read_lock();

	WARN_ON(pte_huge(*ptep) || (gfn + 1) != end);

	new_pfn = pte_pfn(*ptep);

	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
		if (iter.level != PG_LEVEL_4K)
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			break;

		/*
		 * Note, when changing a read-only SPTE, it's not strictly
		 * necessary to zero the SPTE before setting the new PFN, but
		 * doing so preserves the invariant that the PFN of a present
		 * leaf SPTE can never change. See __handle_changed_spte().
		 */
		tdp_mmu_set_spte(kvm, &iter, 0);

		if (!pte_write(*ptep)) {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					iter.old_spte, new_pfn);

			tdp_mmu_set_spte(kvm, &iter, new_spte);
		}

		need_flush = 1;
	}

	if (need_flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

	rcu_read_unlock();

	return 0;
}
int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
			     pte_t *host_ptep)
{
	return kvm_tdp_mmu_handle_hva(kvm, address, (unsigned long)host_ptep,
				      set_tdp_spte);
}
/*
 * Remove write access from all the SPTEs mapping GFNs [start, end). Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}
/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);
	}

	return spte_set;
}
/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}
/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);
	}

	return spte_set;
}
/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}
/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
	}
}
/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static bool zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       struct kvm_memory_slot *slot,
				       bool flush)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();
	return flush;
}
/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       struct kvm_memory_slot *slot, bool flush)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		flush = zap_collapsible_spte_range(kvm, root, slot, flush);
	}

	return flush;
}
/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}
/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= write_protect_gfn(kvm, root, gfn);
	}

	return spte_set;
}
/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	rcu_read_unlock();

	return leaf;
}