1 // SPDX-License-Identifier: GPL-2.0
4 #include "mmu_internal.h"
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
13 static bool __read_mostly tdp_mmu_enabled = true;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
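/*
 * Usage sketch (assumes only standard module_param semantics, not anything
 * specific to this file): the knob is exposed as kvm.tdp_mmu on the kernel
 * command line and, because the mode is 0644, as a writable
 * /sys/module/kvm/parameters/tdp_mmu file. It is sampled with READ_ONCE()
 * in kvm_mmu_init_tdp_mmu(), so flipping it at runtime only affects VMs
 * created afterwards, e.g.:
 *
 *	echo N > /sys/module/kvm/parameters/tdp_mmu
 *
 * after which subsequently created VMs fall back to the legacy MMU.
 */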
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
19 struct workqueue_struct *wq;
21 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 return 0;
24 wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
25 if (!wq)
26 return -ENOMEM;
28 /* This should not be changed for the lifetime of the VM. */
29 kvm->arch.tdp_mmu_enabled = true;
30 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
31 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
32 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
33 kvm->arch.tdp_mmu_zap_wq = wq;
34 return 1;
37 /* Arbitrarily returns true so that this may be used in if statements. */
38 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
39 bool shared)
41 if (shared)
42 lockdep_assert_held_read(&kvm->mmu_lock);
43 else
44 lockdep_assert_held_write(&kvm->mmu_lock);
46 return true;
49 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
51 if (!kvm->arch.tdp_mmu_enabled)
54 flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
55 destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
57 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
58 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
61 * Ensure that all the outstanding RCU callbacks to free shadow pages
62 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
63 * can call kvm_tdp_mmu_put_root and create new callbacks.
65 rcu_barrier();
68 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
70 free_page((unsigned long)sp->spt);
71 kmem_cache_free(mmu_page_header_cache, sp);
75 * This is called through call_rcu in order to free TDP page table memory
76 * safely with respect to other kernel threads that may be operating on
77 * the memory.
78 * By only accessing TDP MMU page table memory in an RCU read critical
79 * section, and freeing it after a grace period, lockless access to that
80 * memory won't use it after it is freed.
82 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
84 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
85 rcu_head);
87 tdp_mmu_free_sp(sp);
90 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
93 static void tdp_mmu_zap_root_work(struct work_struct *work)
95 struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
96 tdp_mmu_async_work);
97 struct kvm *kvm = root->tdp_mmu_async_data;
99 read_lock(&kvm->mmu_lock);
102 * A TLB flush is not necessary as KVM performs a local TLB flush when
103 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
104 * to a different pCPU. Note, the local TLB flush on reuse also
105 * invalidates any paging-structure-cache entries, i.e. TLB entries for
106 * intermediate paging structures, that may be zapped, as such entries
107 * are associated with the ASID on both VMX and SVM.
109 tdp_mmu_zap_root(kvm, root, true);
112 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
113 * avoiding an infinite loop. By design, the root is reachable while
114 * it's being asynchronously zapped, thus a different task can put its
115 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
116 * asynchronously zapped root is unavoidable.
118 kvm_tdp_mmu_put_root(kvm, root, true);
120 read_unlock(&kvm->mmu_lock);
123 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
125 root->tdp_mmu_async_data = kvm;
126 INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
127 queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
130 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
132 union kvm_mmu_page_role role = page->role;
134 role.invalid = true;
135 /* No need to use cmpxchg, only the invalid bit can change. */
136 role.word = xchg(&page->role.word, role.word);
137 return role.invalid;
140 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
143 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
145 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
148 WARN_ON(!root->tdp_mmu_page);
151 * The root now has refcount=0. It is valid, but readers already
152 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
153 * rejects it. This remains true for the rest of the execution
154 * of this function, because readers visit valid roots only
155 * (except for tdp_mmu_zap_root_work(), which however
156 * does not acquire any reference itself).
158 * Even though there are flows that need to visit all roots for
159 * correctness, they all take mmu_lock for write, so they cannot yet
160 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
161 * since the root still has refcount=0.
163 * However, tdp_mmu_zap_root can yield, and writers do not expect to
164 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
165 * So the root temporarily gets an extra reference, going to refcount=1
166 * while staying invalid. Readers still cannot acquire any reference;
167 * but writers are now allowed to run if tdp_mmu_zap_root yields and
168 * they might take an extra reference if they themselves yield.
169 * Therefore, when the reference is given back by the worker,
170 * there is no guarantee that the refcount is still 1. If not, whoever
171 * puts the last reference will free the page, but they will not have to
172 * zap the root because a root cannot go from invalid to valid.
174 if (!kvm_tdp_root_mark_invalid(root)) {
175 refcount_set(&root->tdp_mmu_root_count, 1);
178 * Zapping the root in a worker is not just "nice to have";
179 * it is required because kvm_tdp_mmu_invalidate_all_roots()
180 * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
181 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
182 * might return with some roots not zapped yet.
184 tdp_mmu_schedule_zap_root(kvm, root);
188 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
189 list_del_rcu(&root->link);
190 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
191 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
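/*
 * Rough summary of the refcount/invalid states walked through in the comment
 * above (added as a reading aid, phrased informally):
 *
 *   valid,   refcount > 0   - normal operation, readers may take references
 *   invalid, refcount >= 1  - zap scheduled or running on tdp_mmu_zap_wq,
 *                             which owns the temporary extra reference
 *   invalid, refcount == 0  - unlinked from tdp_mmu_roots and freed via
 *                             call_rcu() after a grace period
 */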
195 * Returns the next root after @prev_root (or the first root if @prev_root is
196 * NULL). A reference to the returned root is acquired, and the reference to
197 * @prev_root is released (the caller obviously must hold a reference to
198 * @prev_root if it's non-NULL).
200 * If @only_valid is true, invalid roots are skipped.
202 * Returns NULL if the end of tdp_mmu_roots was reached.
204 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
205 struct kvm_mmu_page *prev_root,
206 bool shared, bool only_valid)
208 struct kvm_mmu_page *next_root;
212 if (prev_root)
213 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
214 &prev_root->link,
215 typeof(*prev_root), link);
216 else
217 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
218 typeof(*next_root), link);
220 while (next_root) {
221 if ((!only_valid || !next_root->role.invalid) &&
222 kvm_tdp_mmu_get_root(next_root))
223 break;
225 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
226 &next_root->link, typeof(*next_root), link);
232 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
238 * Note: this iterator gets and puts references to the roots it iterates over.
239 * This makes it safe to release the MMU lock and yield within the loop, but
240 * if exiting the loop early, the caller must drop the reference to the most
241 * recent root. (Unless keeping a live reference is desirable.)
243 * If shared is set, this function is operating under the MMU lock in read
244 * mode. In the unlikely event that this thread must free a root, the lock
245 * will be temporarily dropped and reacquired in write mode.
247 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
248 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
249 _root; \
250 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
251 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
252 kvm_mmu_page_as_id(_root) != _as_id) { \
253 } else
255 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
256 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
258 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
259 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
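/*
 * Typical usage of the yield-safe iterators (illustrative sketch only,
 * mirroring kvm_tdp_mmu_zap_leafs() later in this file):
 *
 *	struct kvm_mmu_page *root;
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
 *		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 *
 * The iterator takes and drops root references itself, so breaking out of
 * the loop early requires an explicit kvm_tdp_mmu_put_root() on the current
 * root, per the note above.
 */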
262 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
263 * the implication being that any flow that holds mmu_lock for read is
264 * inherently yield-friendly and should use the yield-safe variant above.
265 * Holding mmu_lock for write obviates the need for RCU protection as the list
266 * is guaranteed to be stable.
268 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
269 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
270 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
271 kvm_mmu_page_as_id(_root) != _as_id) { \
272 } else
274 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
276 struct kvm_mmu_page *sp;
278 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
279 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
284 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
285 gfn_t gfn, union kvm_mmu_page_role role)
287 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
289 sp->role = role;
290 sp->gfn = gfn;
291 sp->ptep = sptep;
292 sp->tdp_mmu_page = true;
294 trace_kvm_mmu_get_page(sp, true);
297 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
298 struct tdp_iter *iter)
300 struct kvm_mmu_page *parent_sp;
301 union kvm_mmu_page_role role;
303 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
305 role = parent_sp->role;
306 role.level--;
308 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
311 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
313 union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
314 struct kvm *kvm = vcpu->kvm;
315 struct kvm_mmu_page *root;
317 lockdep_assert_held_write(&kvm->mmu_lock);
320 * Check for an existing root before allocating a new one. Note, the
321 * role check prevents consuming an invalid root.
323 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
324 if (root->role.word == role.word &&
325 kvm_tdp_mmu_get_root(root))
326 goto out;
329 root = tdp_mmu_alloc_sp(vcpu);
330 tdp_mmu_init_sp(root, NULL, 0, role);
332 refcount_set(&root->tdp_mmu_root_count, 1);
334 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
335 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
336 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
338 out:
339 return __pa(root->spt);
342 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
343 u64 old_spte, u64 new_spte, int level,
346 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
348 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
351 if (is_accessed_spte(old_spte) &&
352 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
353 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
354 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
357 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
358 u64 old_spte, u64 new_spte, int level)
360 bool pfn_changed;
361 struct kvm_memory_slot *slot;
363 if (level > PG_LEVEL_4K)
366 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
368 if ((!is_writable_pte(old_spte) || pfn_changed) &&
369 is_writable_pte(new_spte)) {
370 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
371 mark_page_dirty_in_slot(kvm, slot, gfn);
376 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
379 * @sp: the page to be removed
380 * @shared: This operation may not be running under the exclusive use of
381 * the MMU lock and the operation must synchronize with other
382 * threads that might be adding or removing pages.
384 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
385 bool shared)
387 if (shared)
388 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
389 else
390 lockdep_assert_held_write(&kvm->mmu_lock);
392 list_del(&sp->link);
393 if (sp->lpage_disallowed)
394 unaccount_huge_nx_page(kvm, sp);
396 if (shared)
397 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
401 * handle_removed_pt() - handle a page table removed from the TDP structure
404 * @pt: the page removed from the paging structure
405 * @shared: This operation may not be running under the exclusive use
406 * of the MMU lock and the operation must synchronize with other
407 * threads that might be modifying SPTEs.
409 * Given a page table that has been removed from the TDP paging structure,
410 * iterates through the page table to clear SPTEs and free child page tables.
412 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
413 * protection. Since this thread removed it from the paging structure,
414 * this thread will be responsible for ensuring the page is freed. Hence the
415 * early rcu_dereferences in the function.
417 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
419 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
420 int level = sp->role.level;
421 gfn_t base_gfn = sp->gfn;
424 trace_kvm_mmu_prepare_zap_page(sp);
426 tdp_mmu_unlink_sp(kvm, sp, shared);
428 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
429 u64 *sptep = rcu_dereference(pt) + i;
430 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
435 * Set the SPTE to a nonpresent value that other
436 * threads will not overwrite. If the SPTE was
437 * already marked as removed then another thread
438 * handling a page fault could overwrite it, so
439 * keep retrying until the SPTE is successfully changed from
440 * some other value to the removed SPTE value.
443 old_child_spte = xchg(sptep, REMOVED_SPTE);
444 if (!is_removed_spte(old_child_spte))
450 * If the SPTE is not MMU-present, there is no backing
451 * page associated with the SPTE and so no side effects
452 * that need to be recorded, and exclusive ownership of
453 * mmu_lock ensures the SPTE can't be made present.
454 * Note, zapping MMIO SPTEs is also unnecessary as they
455 * are guarded by the memslots generation, not by being
456 * unreachable.
458 old_child_spte = READ_ONCE(*sptep);
459 if (!is_shadow_present_pte(old_child_spte))
463 * Marking the SPTE as a removed SPTE is not
464 * strictly necessary here as the MMU lock will
465 * stop other threads from concurrently modifying
466 * this SPTE. Using the removed SPTE value keeps
467 * the two branches consistent and simplifies
468 * the function.
470 WRITE_ONCE(*sptep, REMOVED_SPTE);
472 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
473 old_child_spte, REMOVED_SPTE, level,
477 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
481 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
483 * @as_id: the address space of the paging structure the SPTE was a part of
484 * @gfn: the base GFN that was mapped by the SPTE
485 * @old_spte: The value of the SPTE before the change
486 * @new_spte: The value of the SPTE after the change
487 * @level: the level of the PT the SPTE is part of in the paging structure
488 * @shared: This operation may not be running under the exclusive use of
489 * the MMU lock and the operation must synchronize with other
490 * threads that might be modifying SPTEs.
492 * Handle bookkeeping that might result from the modification of a SPTE.
493 * This function must be called for all TDP SPTE modifications.
495 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
496 u64 old_spte, u64 new_spte, int level,
499 bool was_present = is_shadow_present_pte(old_spte);
500 bool is_present = is_shadow_present_pte(new_spte);
501 bool was_leaf = was_present && is_last_spte(old_spte, level);
502 bool is_leaf = is_present && is_last_spte(new_spte, level);
503 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
505 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
506 WARN_ON(level < PG_LEVEL_4K);
507 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
510 * If this warning were to trigger it would indicate that there was a
511 * missing MMU notifier or a race with some notifier handler.
512 * A present, leaf SPTE should never be directly replaced with another
513 * present leaf SPTE pointing to a different PFN. A notifier handler
514 * should be zapping the SPTE before the main MM's page table is
515 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
516 * thread before replacement.
518 if (was_leaf && is_leaf && pfn_changed) {
519 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
520 "SPTE with another present leaf SPTE mapping a\n"
521 "different PFN!\n"
522 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
523 as_id, gfn, old_spte, new_spte, level);
526 * Crash the host to prevent error propagation and guest data
527 * corruption.
529 BUG();
532 if (old_spte == new_spte)
535 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
538 check_spte_writable_invariants(new_spte);
541 * The only times a SPTE should be changed from a non-present to
542 * non-present state is when an MMIO entry is installed/modified/
543 * removed. In that case, there is nothing to do here.
545 if (!was_present && !is_present) {
547 * If this change does not involve a MMIO SPTE or removed SPTE,
548 * it is unexpected. Log the change, though it should not
549 * impact the guest since both the former and current SPTEs
550 * are nonpresent.
552 if (WARN_ON(!is_mmio_spte(old_spte) &&
553 !is_mmio_spte(new_spte) &&
554 !is_removed_spte(new_spte)))
555 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
556 "should not be replaced with another,\n"
557 "different nonpresent SPTE, unless one or both\n"
558 "are MMIO SPTEs, or the new SPTE is\n"
559 "a temporary removed SPTE.\n"
560 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
561 as_id, gfn, old_spte, new_spte, level);
565 if (is_leaf != was_leaf)
566 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
568 if (was_leaf && is_dirty_spte(old_spte) &&
569 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
570 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
573 * Recursively handle child PTs if the change removed a subtree from
574 * the paging structure. Note the WARN on the PFN changing without the
575 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
576 * pages are kernel allocations and should never be migrated.
578 if (was_present && !was_leaf &&
579 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
580 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
583 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
584 u64 old_spte, u64 new_spte, int level,
587 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
589 handle_changed_spte_acc_track(old_spte, new_spte, level);
590 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
595 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
596 * and handle the associated bookkeeping. Do not mark the page dirty
597 * in KVM's dirty bitmaps.
599 * If setting the SPTE fails because it has changed, iter->old_spte will be
600 * refreshed to the current value of the spte.
603 * @iter: a tdp_iter instance currently on the SPTE that should be set
604 * @new_spte: The value the SPTE should be set to
605 * Returns:
606 * * 0 - If the SPTE was set.
607 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
608 * no side-effects other than setting iter->old_spte to the last
609 * known value of the spte.
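/*
 * Illustrative caller pattern (sketch only): code running under mmu_lock
 * held for read typically retries on -EBUSY, as wrprot_gfn_range() and
 * clear_dirty_gfn_range() do later in this file:
 *
 *	retry:
 *		...
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 *
 * Because iter->old_spte is refreshed on failure, the retry recomputes
 * new_spte from fresh data.
 */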
611 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
612 struct tdp_iter *iter,
613 u64 new_spte)
615 u64 *sptep = rcu_dereference(iter->sptep);
616 u64 old_spte;
619 * The caller is responsible for ensuring the old SPTE is not a REMOVED
620 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
621 * and pre-checking before inserting a new SPTE is advantageous as it
622 * avoids unnecessary work.
624 WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
626 lockdep_assert_held_read(&kvm->mmu_lock);
629 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
630 * does not hold the mmu_lock.
632 old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
633 if (old_spte != iter->old_spte) {
635 * The page table entry was modified by a different logical
636 * CPU. Refresh iter->old_spte with the current value so the
637 * caller operates on fresh data, e.g. if it retries
638 * tdp_mmu_set_spte_atomic().
640 iter->old_spte = old_spte;
641 return -EBUSY;
644 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
645 new_spte, iter->level, true);
646 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
648 return 0;
651 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
652 struct tdp_iter *iter)
657 * Freeze the SPTE by setting it to a special,
658 * non-present value. This will stop other threads from
659 * immediately installing a present entry in its place
660 * before the TLBs are flushed.
662 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
663 if (ret)
664 return ret;
666 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
667 KVM_PAGES_PER_HPAGE(iter->level));
670 * No other thread can overwrite the removed SPTE as they
671 * must either wait on the MMU lock or use
672 * tdp_mmu_set_spte_atomic which will not overwrite the
673 * special removed SPTE value. No bookkeeping is needed
674 * here since the SPTE is going from non-present
675 * to non-present.
677 kvm_tdp_mmu_write_spte(iter->sptep, 0);
684 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
686 * @as_id: Address space ID, i.e. regular vs. SMM
687 * @sptep: Pointer to the SPTE
688 * @old_spte: The current value of the SPTE
689 * @new_spte: The new value that will be set for the SPTE
690 * @gfn: The base GFN that was (or will be) mapped by the SPTE
691 * @level: The level _containing_ the SPTE (its parent PT's level)
692 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
693 * of the page. Should be set unless handling an MMU
694 * notifier for access tracking. Leaving record_acc_track
695 * unset in that case prevents page accesses from being
696 * double counted.
697 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
698 * appropriate for the change being made. Should be set
699 * unless performing certain dirty logging operations.
700 * Leaving record_dirty_log unset in that case prevents page
701 * writes from being double counted.
703 static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
704 u64 old_spte, u64 new_spte, gfn_t gfn, int level,
705 bool record_acc_track, bool record_dirty_log)
707 lockdep_assert_held_write(&kvm->mmu_lock);
710 * No thread should be using this function to set SPTEs to or from the
711 * temporary removed SPTE value.
712 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
713 * should be used. If operating under the MMU lock in write mode, the
714 * use of the removed SPTE should not be necessary.
716 WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
718 kvm_tdp_mmu_write_spte(sptep, new_spte);
720 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
722 if (record_acc_track)
723 handle_changed_spte_acc_track(old_spte, new_spte, level);
724 if (record_dirty_log)
725 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
729 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
730 u64 new_spte, bool record_acc_track,
731 bool record_dirty_log)
733 WARN_ON_ONCE(iter->yielded);
735 __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
736 new_spte, iter->gfn, iter->level,
737 record_acc_track, record_dirty_log);
740 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
743 _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
746 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
747 struct tdp_iter *iter,
750 _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
753 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
754 struct tdp_iter *iter,
757 _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
760 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
761 for_each_tdp_pte(_iter, _root, _start, _end)
763 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
764 tdp_root_for_each_pte(_iter, _root, _start, _end) \
765 if (!is_shadow_present_pte(_iter.old_spte) || \
766 !is_last_spte(_iter.old_spte, _iter.level)) \
767 continue; \
768 else
770 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
771 for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
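/*
 * Example walk (illustrative only): kvm_tdp_mmu_get_walk() near the end of
 * this file uses the macro above to record every SPTE on the path to a
 * faulting GPA:
 *
 *	gfn_t gfn = addr >> PAGE_SHIFT;
 *
 *	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
 *		leaf = iter.level;
 *		sptes[leaf] = iter.old_spte;
 *	}
 *
 * The walk starts at the root referenced by mmu->root.hpa and descends only
 * through present, non-leaf SPTEs covering [gfn, gfn + 1).
 */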
774 * Yield if the MMU lock is contended or this thread needs to return control
775 * to the scheduler.
777 * If this function should yield and flush is set, it will perform a remote
778 * TLB flush before yielding.
780 * If this function yields, iter->yielded is set and the caller must skip to
781 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
782 * over the paging structures to allow the iterator to continue its traversal
783 * from the paging structure root.
785 * Returns true if this function yielded.
787 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
788 struct tdp_iter *iter,
789 bool flush, bool shared)
791 WARN_ON(iter->yielded);
793 /* Ensure forward progress has been made before yielding. */
794 if (iter->next_last_level_gfn == iter->yielded_gfn)
797 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
798 if (flush)
799 kvm_flush_remote_tlbs(kvm);
801 rcu_read_unlock();
803 if (shared)
804 cond_resched_rwlock_read(&kvm->mmu_lock);
805 else
806 cond_resched_rwlock_write(&kvm->mmu_lock);
808 rcu_read_lock();
810 WARN_ON(iter->gfn > iter->next_last_level_gfn);
812 iter->yielded = true;
815 return iter->yielded;
818 static inline gfn_t tdp_mmu_max_gfn_host(void)
821 * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
822 * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
823 * and so KVM will never install a SPTE for such addresses.
825 return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
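/*
 * Worked example (hypothetical numbers, for illustration only): with
 * shadow_phys_bits == 46 and PAGE_SHIFT == 12, the walk is bounded at
 * gfn 1ULL << 34, i.e. the first GFN beyond a 64TiB guest-physical
 * address space.
 */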
828 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
829 bool shared, int zap_level)
831 struct tdp_iter iter;
833 gfn_t end = tdp_mmu_max_gfn_host();
834 gfn_t start = 0;
836 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
837 retry:
838 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
839 continue;
841 if (!is_shadow_present_pte(iter.old_spte))
842 continue;
844 if (iter.level > zap_level)
845 continue;
847 if (!shared)
848 tdp_mmu_set_spte(kvm, &iter, 0);
849 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
850 goto retry;
854 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
859 * The root must have an elevated refcount so that it's reachable via
860 * mmu_notifier callbacks, which allows this path to yield and drop
861 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
862 * must drop all references to relevant pages prior to completing the
863 * callback. Dropping mmu_lock with an unreachable root would result
864 * in zapping SPTEs after a relevant mmu_notifier callback completes
865 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
866 * dirty accessed bits to the SPTE's associated struct page.
868 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
870 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
875 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
876 * split the zap into two passes. On the first pass, zap at the 1gb
877 * level, and then zap top-level SPs on the second pass. "1gb" is not
878 * arbitrary, as KVM must be able to zap a 1gb shadow page without
879 * inducing a stall to allow in-place replacement with a 1gb hugepage.
881 * Because zapping a SP recurses on its children, stepping down to
882 * PG_LEVEL_4K in the iterator itself is unnecessary.
884 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
885 __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
890 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
895 * This helper intentionally doesn't allow zapping a root shadow page,
896 * which doesn't have a parent page table and thus no associated entry.
898 if (WARN_ON_ONCE(!sp->ptep))
899 return false;
901 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
902 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
903 return false;
905 __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
906 sp->gfn, sp->role.level + 1, true, true);
908 return true;
912 * Zap leaf SPTEs for the range of gfns, [start, end). Returns true if SPTEs
913 * have been cleared and a TLB flush is needed before releasing the MMU lock.
915 * If can_yield is true, will release the MMU lock and reschedule if the
916 * scheduler needs the CPU or there is contention on the MMU lock. If this
917 * function cannot yield, it will not release the MMU lock or reschedule and
918 * the caller must ensure it does not supply too large a GFN range, or the
919 * operation can cause a soft lockup.
921 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
922 gfn_t start, gfn_t end, bool can_yield, bool flush)
924 struct tdp_iter iter;
926 end = min(end, tdp_mmu_max_gfn_host());
928 lockdep_assert_held_write(&kvm->mmu_lock);
932 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
933 if (can_yield &&
934 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
935 flush = false;
936 continue;
937 }
939 if (!is_shadow_present_pte(iter.old_spte) ||
940 !is_last_spte(iter.old_spte, iter.level))
943 tdp_mmu_set_spte(kvm, &iter, 0);
944 flush = true;
950 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
951 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
957 * Tears down the mappings for the range of gfns, [start, end), and frees the
958 * non-root pages mapping GFNs strictly within that range. Returns true if
959 * SPTEs have been cleared and a TLB flush is needed before releasing the
960 * MMU lock.
962 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
963 bool can_yield, bool flush)
965 struct kvm_mmu_page *root;
967 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
968 flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
970 return flush;
973 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
975 struct kvm_mmu_page *root;
979 * Zap all roots, including invalid roots, as all SPTEs must be dropped
980 * before returning to the caller. Zap directly even if the root is
981 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
982 * all that expensive and mmu_lock is already held, which means the
983 * worker has yielded, i.e. flushing the work instead of zapping here
984 * isn't guaranteed to be any faster.
986 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
987 * is being destroyed or the userspace VMM has exited. In both cases,
988 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
990 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
991 for_each_tdp_mmu_root_yield_safe(kvm, root, i)
992 tdp_mmu_zap_root(kvm, root, false);
997 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
998 * zap" completes.
1000 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1002 flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1006 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1007 * is about to be zapped, e.g. in response to a memslots update. The actual
1008 * zapping is performed asynchronously, so a reference is taken on all roots.
1009 * Using a separate workqueue makes it easy to ensure that the destruction is
1010 * performed before the "fast zap" completes, without keeping a separate list
1011 * of invalidated roots; the list is effectively the list of work items in
1012 * the workqueue.
1014 * Get a reference even if the root is already invalid, the asynchronous worker
1015 * assumes it was gifted a reference to the root it processes. Because mmu_lock
1016 * is held for write, it should be impossible to observe a root with zero refcount,
1017 * i.e. the list of roots cannot be stale.
1019 * This has essentially the same effect for the TDP MMU
1020 * as updating mmu_valid_gen does for the shadow MMU.
1022 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1024 struct kvm_mmu_page *root;
1026 lockdep_assert_held_write(&kvm->mmu_lock);
1027 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1028 if (!root->role.invalid &&
1029 !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1030 root->role.invalid = true;
1031 tdp_mmu_schedule_zap_root(kvm, root);
1037 * Installs a last-level SPTE to handle a TDP page fault.
1038 * (NPT/EPT violation/misconfiguration)
1040 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1041 struct kvm_page_fault *fault,
1042 struct tdp_iter *iter)
1044 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1045 u64 new_spte;
1046 int ret = RET_PF_FIXED;
1047 bool wrprot = false;
1049 WARN_ON(sp->role.level != fault->goal_level);
1050 if (unlikely(!fault->slot))
1051 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1052 else
1053 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1054 fault->pfn, iter->old_spte, fault->prefetch, true,
1055 fault->map_writable, &new_spte);
1057 if (new_spte == iter->old_spte)
1058 ret = RET_PF_SPURIOUS;
1059 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1060 return RET_PF_RETRY;
1061 else if (is_shadow_present_pte(iter->old_spte) &&
1062 !is_last_spte(iter->old_spte, iter->level))
1063 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1064 KVM_PAGES_PER_HPAGE(iter->level + 1));
1067 * If the page fault was caused by a write but the page is write
1068 * protected, emulation is needed. If the emulation was skipped,
1069 * the vCPU would have the same fault again.
1071 if (wrprot) {
1072 if (fault->write)
1073 ret = RET_PF_EMULATE;
1074 }
1076 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1077 if (unlikely(is_mmio_spte(new_spte))) {
1078 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1080 ret = RET_PF_EMULATE;
1082 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1083 rcu_dereference(iter->sptep));
1087 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
1088 * consistent with legacy MMU behavior.
1090 if (ret != RET_PF_SPURIOUS)
1091 vcpu->stat.pf_fixed++;
1097 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1098 * provided page table.
1100 * @kvm: kvm instance
1101 * @iter: a tdp_iter instance currently on the SPTE that should be set
1102 * @sp: The new TDP page table to install.
1103 * @account_nx: True if this page table is being installed to split a
1104 * non-executable huge page.
1105 * @shared: This operation is running under the MMU lock in read mode.
1107 * Returns: 0 if the new page table was installed. Non-0 if the page table
1108 * could not be installed (e.g. the atomic compare-exchange failed).
1110 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1111 struct kvm_mmu_page *sp, bool account_nx,
1114 u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
1118 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1122 tdp_mmu_set_spte(kvm, iter, spte);
1125 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1126 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1127 if (account_nx)
1128 account_huge_nx_page(kvm, sp);
1129 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1135 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1136 * page tables and SPTEs to translate the faulting guest physical address.
1138 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1140 struct kvm_mmu *mmu = vcpu->arch.mmu;
1141 struct tdp_iter iter;
1142 struct kvm_mmu_page *sp;
1145 kvm_mmu_hugepage_adjust(vcpu, fault);
1147 trace_kvm_mmu_spte_requested(fault);
1151 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1152 if (fault->nx_huge_page_workaround_enabled)
1153 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1155 if (iter.level == fault->goal_level)
1159 * If there is an SPTE mapping a large page at a higher level
1160 * than the target, that SPTE must be cleared and replaced
1161 * with a non-leaf SPTE.
1163 if (is_shadow_present_pte(iter.old_spte) &&
1164 is_large_pte(iter.old_spte)) {
1165 if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1169 * The iter must explicitly re-read the spte here
1170 * because the new value informs the !present
1173 iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
1176 if (!is_shadow_present_pte(iter.old_spte)) {
1177 bool account_nx = fault->huge_page_disallowed &&
1178 fault->req_level >= iter.level;
1181 * If SPTE has been frozen by another thread, just
1182 * give up and retry, avoiding unnecessary page table
1183 * allocation and free.
1185 if (is_removed_spte(iter.old_spte))
1188 sp = tdp_mmu_alloc_sp(vcpu);
1189 tdp_mmu_init_child_sp(sp, &iter);
1191 if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
1192 tdp_mmu_free_sp(sp);
1199 * Force the guest to retry the access if the upper level SPTEs aren't
1200 * in place, or if the target leaf SPTE is frozen by another CPU.
1202 if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
1204 return RET_PF_RETRY;
1207 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1213 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1216 return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1217 range->end, range->may_block, flush);
1220 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1221 struct kvm_gfn_range *range);
1223 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1224 struct kvm_gfn_range *range,
1225 tdp_handler_t handler)
1227 struct kvm_mmu_page *root;
1228 struct tdp_iter iter;
1232 * Don't support rescheduling, none of the MMU notifiers that funnel
1233 * into this helper allow blocking; it'd be dead, wasteful code.
1235 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1238 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1239 ret |= handler(kvm, &iter, range);
1248 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return non-zero
1249 * if any of the GFNs in the range have been accessed.
1251 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1252 struct kvm_gfn_range *range)
1256 /* If we have a non-accessed entry we don't need to change the pte. */
1257 if (!is_accessed_spte(iter->old_spte))
1260 new_spte = iter->old_spte;
1262 if (spte_ad_enabled(new_spte)) {
1263 new_spte &= ~shadow_accessed_mask;
1264 } else {
1266 * Capture the dirty status of the page, so that it doesn't get
1267 * lost when the SPTE is marked for access tracking.
1269 if (is_writable_pte(new_spte))
1270 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1272 new_spte = mark_spte_for_access_track(new_spte);
1275 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1280 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1282 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1285 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1286 struct kvm_gfn_range *range)
1288 return is_accessed_spte(iter->old_spte);
1291 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1293 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1296 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1297 struct kvm_gfn_range *range)
1301 /* Huge pages aren't expected to be modified without first being zapped. */
1302 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1304 if (iter->level != PG_LEVEL_4K ||
1305 !is_shadow_present_pte(iter->old_spte))
1309 * Note, when changing a read-only SPTE, it's not strictly necessary to
1310 * zero the SPTE before setting the new PFN, but doing so preserves the
1311 * invariant that the PFN of a present leaf SPTE can never change.
1312 * See __handle_changed_spte().
1314 tdp_mmu_set_spte(kvm, iter, 0);
1316 if (!pte_write(range->pte)) {
1317 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1318 pte_pfn(range->pte));
1320 tdp_mmu_set_spte(kvm, iter, new_spte);
1327 * Handle the changed_pte MMU notifier for the TDP MMU.
1328 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1329 * notifier.
1330 * Returns non-zero if a flush is needed before releasing the MMU lock.
1332 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1335 * No need to handle the remote TLB flush under RCU protection, the
1336 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1337 * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
1339 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1343 * Remove write access from all SPTEs at or above min_level that map GFNs
1344 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1345 * be flushed.
1347 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1348 gfn_t start, gfn_t end, int min_level)
1350 struct tdp_iter iter;
1352 bool spte_set = false;
1356 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1358 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1360 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1363 if (!is_shadow_present_pte(iter.old_spte) ||
1364 !is_last_spte(iter.old_spte, iter.level) ||
1365 !(iter.old_spte & PT_WRITABLE_MASK))
1368 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1370 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1381 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1382 * only affect leaf SPTEs down to min_level.
1383 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1385 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1386 const struct kvm_memory_slot *slot, int min_level)
1388 struct kvm_mmu_page *root;
1389 bool spte_set = false;
1391 lockdep_assert_held_read(&kvm->mmu_lock);
1393 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1394 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1395 slot->base_gfn + slot->npages, min_level);
1400 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1402 struct kvm_mmu_page *sp;
1406 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1410 sp->spt = (void *)__get_free_page(gfp);
1411 if (!sp->spt) {
1412 kmem_cache_free(mmu_page_header_cache, sp);
1413 return NULL;
1414 }
1419 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1420 struct tdp_iter *iter,
1423 struct kvm_mmu_page *sp;
1426 * Since we are allocating while under the MMU lock we have to be
1427 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1428 * reclaim and to avoid making any filesystem callbacks (which can end
1429 * up invoking KVM MMU notifiers, resulting in a deadlock).
1431 * If this allocation fails we drop the lock and retry with reclaim
1432 * allowed.
1434 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1435 if (sp)
1436 return sp;
1440 if (shared)
1441 read_unlock(&kvm->mmu_lock);
1442 else
1443 write_unlock(&kvm->mmu_lock);
1445 iter->yielded = true;
1446 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1448 if (shared)
1449 read_lock(&kvm->mmu_lock);
1450 else
1451 write_lock(&kvm->mmu_lock);
1458 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1459 struct kvm_mmu_page *sp, bool shared)
1461 const u64 huge_spte = iter->old_spte;
1462 const int level = iter->level;
1465 tdp_mmu_init_child_sp(sp, iter);
1468 * No need for atomics when writing to sp->spt since the page table has
1469 * not been linked in yet and thus is not reachable from any other CPU.
1471 for (i = 0; i < PT64_ENT_PER_PAGE; i++)
1472 sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
1475 * Replace the huge spte with a pointer to the populated lower level
1476 * page table. Since we are making this change without a TLB flush vCPUs
1477 * will see a mix of the split mappings and the original huge mapping,
1478 * depending on what's currently in their TLB. This is fine from a
1479 * correctness standpoint since the translation will be the same either
1482 ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
1487 * tdp_mmu_link_sp() will handle subtracting the huge page we
1488 * are overwriting from the page stats. But we have to manually update
1489 * the page stats with the new present child pages.
1491 kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
1494 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1498 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1499 struct kvm_mmu_page *root,
1500 gfn_t start, gfn_t end,
1501 int target_level, bool shared)
1503 struct kvm_mmu_page *sp = NULL;
1504 struct tdp_iter iter;
1510 * Traverse the page table splitting all huge pages above the target
1511 * level into one lower level. For example, if we encounter a 1GB page
1512 * we split it into 512 2MB pages.
1514 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1515 * to visit an SPTE before ever visiting its children, which means we
1516 * will correctly recursively split huge pages that are more than one
1517 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1518 * and then splitting each of those to 512 4KB pages).
1520 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1522 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1525 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1529 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1532 trace_kvm_mmu_split_huge_page(iter.gfn,
1542 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1551 * It's possible to exit the loop having never used the last sp if, for
1552 * example, a vCPU doing HugePage NX splitting wins the race and
1553 * installs its own sp in place of the last sp we tried to split.
1555 if (sp)
1556 tdp_mmu_free_sp(sp);
1563 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1565 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1566 const struct kvm_memory_slot *slot,
1567 gfn_t start, gfn_t end,
1568 int target_level, bool shared)
1570 struct kvm_mmu_page *root;
1573 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1575 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1576 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1577 if (r) {
1578 kvm_tdp_mmu_put_root(kvm, root, shared);
1579 break;
1585 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1586 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1587 * If AD bits are not enabled, this will require clearing the writable bit on
1588 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1589 * be flushed.
1591 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1592 gfn_t start, gfn_t end)
1594 struct tdp_iter iter;
1596 bool spte_set = false;
1600 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1602 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1605 if (!is_shadow_present_pte(iter.old_spte))
1608 if (spte_ad_need_write_protect(iter.old_spte)) {
1609 if (is_writable_pte(iter.old_spte))
1610 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1611 else
1612 continue;
1613 } else {
1614 if (iter.old_spte & shadow_dirty_mask)
1615 new_spte = iter.old_spte & ~shadow_dirty_mask;
1616 else
1617 continue;
1620 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1631 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1632 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1633 * If AD bits are not enabled, this will require clearing the writable bit on
1634 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1635 * be flushed.
1637 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1638 const struct kvm_memory_slot *slot)
1640 struct kvm_mmu_page *root;
1641 bool spte_set = false;
1643 lockdep_assert_held_read(&kvm->mmu_lock);
1645 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1646 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1647 slot->base_gfn + slot->npages);
1653 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1654 * set in mask, starting at gfn. The given memslot is expected to contain all
1655 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1656 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1657 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
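/*
 * Worked example (hypothetical values, for illustration only): with
 * gfn == 0x1000 and mask == 0b101, only the SPTEs mapping GFNs 0x1000 and
 * 0x1002 are cleaned. Each handled GFN clears its bit via
 * mask &= ~(1UL << (iter.gfn - gfn)), so the walk can stop early once mask
 * reaches zero.
 */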
1659 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1660 gfn_t gfn, unsigned long mask, bool wrprot)
1662 struct tdp_iter iter;
1667 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1668 gfn + BITS_PER_LONG) {
1672 if (iter.level > PG_LEVEL_4K ||
1673 !(mask & (1UL << (iter.gfn - gfn))))
1676 mask &= ~(1UL << (iter.gfn - gfn));
1678 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1679 if (is_writable_pte(iter.old_spte))
1680 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1684 if (iter.old_spte & shadow_dirty_mask)
1685 new_spte = iter.old_spte & ~shadow_dirty_mask;
1690 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1697 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1698 * set in mask, starting at gfn. The given memslot is expected to contain all
1699 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1700 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1701 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1703 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1704 struct kvm_memory_slot *slot,
1705 gfn_t gfn, unsigned long mask,
1708 struct kvm_mmu_page *root;
1710 lockdep_assert_held_write(&kvm->mmu_lock);
1711 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1712 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1716 * Clear leaf entries which could be replaced by large mappings, for
1717 * GFNs within the slot.
1719 static void zap_collapsible_spte_range(struct kvm *kvm,
1720 struct kvm_mmu_page *root,
1721 const struct kvm_memory_slot *slot)
1723 gfn_t start = slot->base_gfn;
1724 gfn_t end = start + slot->npages;
1725 struct tdp_iter iter;
1730 tdp_root_for_each_pte(iter, root, start, end) {
1732 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1735 if (!is_shadow_present_pte(iter.old_spte) ||
1736 !is_last_spte(iter.old_spte, iter.level))
1739 pfn = spte_to_pfn(iter.old_spte);
1740 if (kvm_is_reserved_pfn(pfn) ||
1741 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1742 pfn, PG_LEVEL_NUM))
1743 continue;
1745 /* Note, a successful atomic zap also does a remote TLB flush. */
1746 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1754 * Clear leaf entries which could
1755 * be replaced by large mappings, for GFNs within the slot.
1757 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1758 const struct kvm_memory_slot *slot)
1760 struct kvm_mmu_page *root;
1762 lockdep_assert_held_read(&kvm->mmu_lock);
1764 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1765 zap_collapsible_spte_range(kvm, root, slot);
1769 * Removes write access on the last level SPTE mapping this GFN and unsets the
1770 * MMU-writable bit to ensure future writes continue to be intercepted.
1771 * Returns true if an SPTE was set and a TLB flush is needed.
1773 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1774 gfn_t gfn, int min_level)
1776 struct tdp_iter iter;
1778 bool spte_set = false;
1780 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1784 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1785 if (!is_shadow_present_pte(iter.old_spte) ||
1786 !is_last_spte(iter.old_spte, iter.level))
1789 new_spte = iter.old_spte &
1790 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1792 if (new_spte == iter.old_spte)
1793 break;
1795 tdp_mmu_set_spte(kvm, &iter, new_spte);
1796 spte_set = true;
1805 * Removes write access on the last level SPTE mapping this GFN and unsets the
1806 * MMU-writable bit to ensure future writes continue to be intercepted.
1807 * Returns true if an SPTE was set and a TLB flush is needed.
1809 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1810 struct kvm_memory_slot *slot, gfn_t gfn,
1813 struct kvm_mmu_page *root;
1814 bool spte_set = false;
1816 lockdep_assert_held_write(&kvm->mmu_lock);
1817 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1818 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1824 * Return the level of the lowest level SPTE added to sptes.
1825 * That SPTE may be non-present.
1827 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
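/*
 * Illustrative call sequence (sketch only; the real caller lives in mmu.c):
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 *
 * The begin/end pair provides the RCU read-side critical section that keeps
 * the visited page tables from being freed mid-walk.
 */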
1829 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1832 struct tdp_iter iter;
1833 struct kvm_mmu *mmu = vcpu->arch.mmu;
1834 gfn_t gfn = addr >> PAGE_SHIFT;
1835 int leaf = -1;
1837 *root_level = vcpu->arch.mmu->shadow_root_level;
1839 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1840 leaf = iter.level;
1841 sptes[leaf] = iter.old_spte;
1842 }
1844 return leaf;
1848 * Returns the last level spte pointer of the shadow page walk for the given
1849 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1850 * walk could be performed, returns NULL and *spte does not contain valid data.
1852 * Contract:
1853 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1854 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1856 * WARNING: This function is only intended to be called during fast_page_fault.
1858 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1861 struct tdp_iter iter;
1862 struct kvm_mmu *mmu = vcpu->arch.mmu;
1863 gfn_t gfn = addr >> PAGE_SHIFT;
1864 tdp_ptep_t sptep = NULL;
1866 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1867 *spte = iter.old_spte;
1868 sptep = iter.sptep;
1872 * Perform the rcu_dereference to get the raw spte pointer value since
1873 * we are passing it up to fast_page_fault, which is shared with the
1874 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1875 * annotation.
1877 * This is safe since fast_page_fault obeys the contracts of this
1878 * function as well as all TDP MMU contracts around modifying SPTEs
1879 * outside of mmu_lock.
1881 return rcu_dereference(sptep);