// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>
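/*
 * The TDP MMU is opt-in via the "tdp_mmu" module parameter and is only used
 * when hardware two-dimensional paging is enabled; the flag is sampled once
 * when a VM is created.
 */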
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}
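/*
 * Drop the caller's reference to @root; if kvm_mmu_put_root() reports that
 * this was the last reference, tear down and free the root.
 */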
static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (kvm_mmu_put_root(kvm, root))
		kvm_tdp_mmu_free_root(kvm, root);
}

static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
					   struct kvm_mmu_page *root)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
		return false;

	kvm_mmu_get_root(kvm, root);

	return true;
}

static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
						     struct kvm_mmu_page *root)
{
	struct kvm_mmu_page *next_root;

	next_root = list_next_entry(root, link);
	tdp_mmu_put_root(kvm, root);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
	for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,	\
				      typeof(*_root), link);		\
	     tdp_mmu_next_root_valid(_kvm, _root);			\
	     _root = tdp_mmu_next_root(_kvm, _root))
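/*
 * Iterate over all TDP MMU roots without taking references; only safe when
 * the MMU lock is held and not released for the duration of the walk.
 */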
#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	lockdep_assert_held_write(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn, false, false);

	free_page((unsigned long)root->spt);
	kmem_cache_free(mmu_page_header_cache, root);
}

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}
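/*
 * Return the physical address of a TDP MMU root matching the vCPU's current
 * MMU role, allocating and registering a new root if none exists yet.
 */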
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			goto out;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

out:
	return __pa(root->spt);
}
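/* Free the page table memory and the struct backing a TDP MMU shadow page. */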
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);
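/*
 * The address space ID of a TDP MMU page is derived from the SMM bit in its
 * role.
 */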
static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
	return sp->role.smm ? 1 : 0;
}
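/*
 * Propagate accessed state to the primary MM: if a previously accessed leaf
 * SPTE is zapped, loses its accessed bit, or is repointed at a new PFN, mark
 * the old PFN as accessed.
 */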
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
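/*
 * Mark the GFN dirty in the memslot's dirty bitmap when a 4K SPTE becomes
 * writable, or stays writable while being repointed at a different PFN.
 */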
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 * @account_nx: This page replaces a NX large page and should be marked for
 *		eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared, bool account_nx)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(pt);
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	u64 old_child_spte;
	u64 *sptep;
	gfn_t gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		sptep = pt + i;
		gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level - 1,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent SPTEs.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}
/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
 * associated bookkeeping
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *	    this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	u64 *root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (is_removed_spte(iter->old_spte))
		return false;

	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
		      new_spte) != iter->old_spte)
		return false;

	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			    iter->level, true);

	return true;
}

static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter)
{
	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
		return false;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	WRITE_ONCE(*iter->sptep, 0);

	return true;
}
/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(iter->old_spte));

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			      iter->level, false);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}
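/*
 * Iterators over the SPTEs mapping a GFN range: every SPTE under a given
 * root, only the present leaf SPTEs under a root, or the SPTEs of the paging
 * structure currently in use by a vCPU's MMU.
 */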
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),			\
			 _mmu->shadow_root_level, _start, _end)
/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		cond_resched_rwlock_write(&kvm->mmu_lock);
		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);
		tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
			       iter->root_level, iter->min_level,
			       iter->next_last_level_gfn);

		return true;
	}

	return false;
}
/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup. Note, in some use cases a flush may be
 * required by prior actions. Ensure the pending flush is performed prior to
 * yielding.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();
	return flush;
}
/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
				 bool can_yield)
{
	struct kvm_mmu_page *root;
	bool flush = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);

	return flush;
}
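/*
 * Zap all SPTEs tracked by the TDP MMU and flush the TLBs if any SPTEs were
 * zapped.
 */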
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush;

	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}
/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = RET_PF_FIXED;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn)))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					 pfn, iter->old_spte, prefault, true,
					 map_writable, !shadow_accessed_mask,
					 &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	if (!prefault)
		vcpu->stat.pf_fixed++;

	return ret;
}
/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;
	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
			child_pt = sp->spt;

			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
						    new_spte)) {
				tdp_mmu_link_page(vcpu->kvm, sp, true,
						  huge_page_disallowed &&
						  req_level >= iter.level);
				trace_kvm_mmu_get_page(sp, true);
			} else {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);
	rcu_read_unlock();

	return ret;
}
typedef int (*tdp_handler_t)(struct kvm *kvm, struct kvm_memory_slot *slot,
			     struct kvm_mmu_page *root, gfn_t start, gfn_t end,
			     unsigned long data);

static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
							 unsigned long start,
							 unsigned long end,
							 unsigned long data,
							 tdp_handler_t handler)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_page *root;
	int ret = 0;
	int as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		as_id = kvm_mmu_page_as_id(root);
		slots = __kvm_memslots(kvm, as_id);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;
			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			ret |= handler(kvm, memslot, root, gfn_start,
				       gfn_end, data);
		}
	}

	return ret;
}

static __always_inline int kvm_tdp_mmu_handle_hva(struct kvm *kvm,
						  unsigned long addr,
						  unsigned long data,
						  tdp_handler_t handler)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, addr, addr + 1, data, handler);
}

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     struct kvm_mmu_page *root, gfn_t start,
				     gfn_t end, unsigned long unused)
{
	return zap_gfn_range(kvm, root, start, end, false, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    zap_gfn_range_hva_wrapper);
}
/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
			 unsigned long unused)
{
	struct tdp_iter iter;
	int young = 0;
	u64 new_spte = 0;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		/*
		 * If we have a non-accessed entry we don't need to change the
		 * pte.
		 */
		if (!is_accessed_spte(iter.old_spte))
			continue;

		new_spte = iter.old_spte;

		if (spte_ad_enabled(new_spte)) {
			clear_bit((ffs(shadow_accessed_mask) - 1),
				  (unsigned long *)&new_spte);
		} else {
			/*
			 * Capture the dirty status of the page, so that it doesn't get
			 * lost when the SPTE is marked for access tracking.
			 */
			if (is_writable_pte(new_spte))
				kvm_set_pfn_dirty(spte_to_pfn(new_spte));

			new_spte = mark_spte_for_access_track(new_spte);
		}
		new_spte &= ~shadow_dirty_mask;

		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
		young = 1;

		trace_kvm_age_page(iter.gfn, iter.level, slot, young);
	}

	rcu_read_unlock();
	return young;
}
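/*
 * Clear the accessed state of SPTEs mapping the given HVA range on behalf of
 * the MMU notifiers.
 */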
int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
					    age_gfn_range);
}
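/* Return non-zero if any SPTE mapping the GFN range is marked accessed. */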
static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
			unsigned long unused)
{
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, root, gfn, end)
		if (is_accessed_spte(iter.old_spte))
			return 1;

	return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_tdp_mmu_handle_hva(kvm, hva, 0, test_age_gfn);
}
/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
			struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
			unsigned long data)
{
	struct tdp_iter iter;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;
	u64 new_spte;
	int need_flush = 0;

	rcu_read_lock();

	WARN_ON(pte_huge(*ptep) || (gfn + 1) != end);
	new_pfn = pte_pfn(*ptep);

	tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
		if (iter.level != PG_LEVEL_4K)
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			break;

		/*
		 * Note, when changing a read-only SPTE, it's not strictly
		 * necessary to zero the SPTE before setting the new PFN, but
		 * doing so preserves the invariant that the PFN of a present
		 * leaf SPTE can never change. See __handle_changed_spte().
		 */
		tdp_mmu_set_spte(kvm, &iter, 0);

		if (!pte_write(*ptep)) {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					iter.old_spte, new_pfn);
			tdp_mmu_set_spte(kvm, &iter, new_spte);
		}

		need_flush = 1;
	}

	if (need_flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

	rcu_read_unlock();
	return 0;
}
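/*
 * Entry point for the changed_pte MMU notifier: @host_ptep points to the new
 * host PTE for @address.
 */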
int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
			     pte_t *host_ptep)
{
	return kvm_tdp_mmu_handle_hva(kvm, address, (unsigned long)host_ptep,
				      set_tdp_spte);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need
 * to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}
/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);
	}

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}
/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);
	}

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				    gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}
/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
	}
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
			spte_set = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		spte_set = true;
	}

	rcu_read_unlock();
	if (spte_set)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		zap_collapsible_spte_range(kvm, root, slot);
	}
}
/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_mmu_page *root;
	int root_as_id;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		spte_set |= write_protect_gfn(kvm, root, gfn);
	}
	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;