arch/x86/kvm/mmu/tdp_mmu.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include "mmu.h"
   4 #include "mmu_internal.h"
   5 #include "mmutrace.h"
   6 #include "tdp_iter.h"
   7 #include "tdp_mmu.h"
   8 #include "spte.h"
   9
  10 #include <asm/cmpxchg.h>
  11 #include <trace/events/kvm.h>
  12
  13 static bool __read_mostly tdp_mmu_enabled = true;
  14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
  15
  16 /* Initializes the TDP MMU for the VM, if enabled. */
  17 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
  18 {
  19         struct workqueue_struct *wq;
  20
  21         if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
  22                 return 0;
  23
  24         wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
  25         if (!wq)
  26                 return -ENOMEM;
  27
  28         /* This should not be changed for the lifetime of the VM. */
  29         kvm->arch.tdp_mmu_enabled = true;
  30         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
  31         spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
  32         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
  33         kvm->arch.tdp_mmu_zap_wq = wq;
  34         return 1;
  35 }
  36
  37 /* Arbitrarily returns true so that this may be used in if statements. */
  38 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
  39                                                              bool shared)
  40 {
  41         if (shared)
  42                 lockdep_assert_held_read(&kvm->mmu_lock);
  43         else
  44                 lockdep_assert_held_write(&kvm->mmu_lock);
  45
  46         return true;
  47 }
  48
  49 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
  50 {
  51         if (!kvm->arch.tdp_mmu_enabled)
  52                 return;
  53
  54         /* Also waits for any queued work items.  */
  55         destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
  56
  57         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
  58         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
  59
  60         /*
  61          * Ensure that all the outstanding RCU callbacks to free shadow pages
  62          * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
  63          * can call kvm_tdp_mmu_put_root and create new callbacks.
  64          */
  65         rcu_barrier();
  66 }
  67
  68 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
  69 {
  70         free_page((unsigned long)sp->spt);
  71         kmem_cache_free(mmu_page_header_cache, sp);
  72 }
  73
  74 /*
  75  * This is called through call_rcu in order to free TDP page table memory
  76  * safely with respect to other kernel threads that may be operating on
  77  * the memory.
  78  * By only accessing TDP MMU page table memory in an RCU read critical
  79  * section, and freeing it after a grace period, lockless access to that
  80  * memory won't use it after it is freed.
  81  */
  82 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
  83 {
  84         struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
  85                                                rcu_head);
  86
  87         tdp_mmu_free_sp(sp);
  88 }
  89
  90 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
  91                              bool shared);
  92
  93 static void tdp_mmu_zap_root_work(struct work_struct *work)
  94 {
  95         struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
  96                                                  tdp_mmu_async_work);
  97         struct kvm *kvm = root->tdp_mmu_async_data;
  98
  99         read_lock(&kvm->mmu_lock);
 100
 101         /*
 102          * A TLB flush is not necessary as KVM performs a local TLB flush when
 103          * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
 104          * to a different pCPU.  Note, the local TLB flush on reuse also
 105          * invalidates any paging-structure-cache entries, i.e. TLB entries for
 106          * intermediate paging structures, that may be zapped, as such entries
 107          * are associated with the ASID on both VMX and SVM.
 108          */
 109         tdp_mmu_zap_root(kvm, root, true);
 110
 111         /*
 112          * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
 113          * avoiding an infinite loop.  By design, the root is reachable while
 114          * it's being asynchronously zapped, thus a different task can put its
 115          * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
 116          * asynchronously zapped root is unavoidable.
 117          */
 118         kvm_tdp_mmu_put_root(kvm, root, true);
 119
 120         read_unlock(&kvm->mmu_lock);
 121 }
 122
 123 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
 124 {
 125         root->tdp_mmu_async_data = kvm;
 126         INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
 127         queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
 128 }
 129
 130 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
 131 {
 132         union kvm_mmu_page_role role = page->role;
 133         role.invalid = true;
 134
 135         /* No need to use cmpxchg, only the invalid bit can change.  */
 136         role.word = xchg(&page->role.word, role.word);
 137         return role.invalid;
 138 }
 139
 140 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
 141                           bool shared)
 142 {
 143         kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 144
 145         if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
 146                 return;
 147
 148         WARN_ON(!root->tdp_mmu_page);
 149
 150         /*
 151          * The root now has refcount=0.  It is valid, but readers already
 152          * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
 153          * rejects it.  This remains true for the rest of the execution
 154          * of this function, because readers visit valid roots only
 155          * (except for tdp_mmu_zap_root_work(), which however
 156          * does not acquire any reference itself).
 157          *
 158          * Even though there are flows that need to visit all roots for
 159          * correctness, they all take mmu_lock for write, so they cannot yet
 160          * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
 161          * since the root still has refcount=0.
 162          *
 163          * However, tdp_mmu_zap_root can yield, and writers do not expect to
 164          * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
 165          * So the root temporarily gets an extra reference, going to refcount=1
 166          * while staying invalid.  Readers still cannot acquire any reference;
 167          * but writers are now allowed to run if tdp_mmu_zap_root yields and
 168          * they might take an extra reference if they themselves yield.
 169          * Therefore, when the reference is given back by the worker,
 170          * there is no guarantee that the refcount is still 1.  If not, whoever
 171          * puts the last reference will free the page, but they will not have to
 172          * zap the root because a root cannot go from invalid to valid.
 173          */
 174         if (!kvm_tdp_root_mark_invalid(root)) {
 175                 refcount_set(&root->tdp_mmu_root_count, 1);
 176
 177                 /*
 178                  * Zapping the root in a worker is not just "nice to have";
 179                  * it is required because kvm_tdp_mmu_invalidate_all_roots()
 180                  * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
 181                  * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
 182                  * might return with some roots not zapped yet.
 183                  */
 184                 tdp_mmu_schedule_zap_root(kvm, root);
 185                 return;
 186         }
 187
 188         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 189         list_del_rcu(&root->link);
 190         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 191         call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 192 }
 193
 194 /*
 195  * Returns the next root after @prev_root (or the first root if @prev_root is
 196  * NULL).  A reference to the returned root is acquired, and the reference to
 197  * @prev_root is released (the caller obviously must hold a reference to
 198  * @prev_root if it's non-NULL).
 199  *
 200  * If @only_valid is true, invalid roots are skipped.
 201  *
 202  * Returns NULL if the end of tdp_mmu_roots was reached.
 203  */
 204 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
 205                                               struct kvm_mmu_page *prev_root,
 206                                               bool shared, bool only_valid)
 207 {
 208         struct kvm_mmu_page *next_root;
 209
 210         rcu_read_lock();
 211
 212         if (prev_root)
 213                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
 214                                                   &prev_root->link,
 215                                                   typeof(*prev_root), link);
 216         else
 217                 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
 218                                                    typeof(*next_root), link);
 219
 220         while (next_root) {
 221                 if ((!only_valid || !next_root->role.invalid) &&
 222                     kvm_tdp_mmu_get_root(next_root))
 223                         break;
 224
 225                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
 226                                 &next_root->link, typeof(*next_root), link);
 227         }
 228
 229         rcu_read_unlock();
 230
 231         if (prev_root)
 232                 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
 233
 234         return next_root;
 235 }
 236
 237 /*
 238  * Note: this iterator gets and puts references to the roots it iterates over.
 239  * This makes it safe to release the MMU lock and yield within the loop, but
 240  * if exiting the loop early, the caller must drop the reference to the most
 241  * recent root. (Unless keeping a live reference is desirable.)
 242  *
 243  * If shared is set, this function is operating under the MMU lock in read
 244  * mode. In the unlikely event that this thread must free a root, the lock
 245  * will be temporarily dropped and reacquired in write mode.
 246  */
 247 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
 248         for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);       \
 249              _root;                                                             \
 250              _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))      \
 251                 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&          \
 252                     kvm_mmu_page_as_id(_root) != _as_id) {                      \
 253                 } else
 254
 255 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)    \
 256         __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
 257
 258 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)                   \
 259         __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
 260
 261 /*
 262  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 263  * the implication being that any flow that holds mmu_lock for read is
 264  * inherently yield-friendly and should use the yield-safe variant above.
 265  * Holding mmu_lock for write obviates the need for RCU protection as the list
 266  * is guaranteed to be stable.
 267  */
 268 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)                      \
 269         list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)     \
 270                 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&    \
 271                     kvm_mmu_page_as_id(_root) != _as_id) {              \
 272                 } else
 273
 274 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
 275 {
 276         struct kvm_mmu_page *sp;
 277
 278         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
 279         sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
 280
 281         return sp;
 282 }
 283
 284 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
 285                             gfn_t gfn, union kvm_mmu_page_role role)
 286 {
 287         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 288
 289         sp->role = role;
 290         sp->gfn = gfn;
 291         sp->ptep = sptep;
 292         sp->tdp_mmu_page = true;
 293
 294         trace_kvm_mmu_get_page(sp, true);
 295 }
 296
 297 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
 298                                   struct tdp_iter *iter)
 299 {
 300         struct kvm_mmu_page *parent_sp;
 301         union kvm_mmu_page_role role;
 302
 303         parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
 304
 305         role = parent_sp->role;
 306         role.level--;
 307
 308         tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
 309 }
 310
 311 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 312 {
 313         union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
 314         struct kvm *kvm = vcpu->kvm;
 315         struct kvm_mmu_page *root;
 316
 317         lockdep_assert_held_write(&kvm->mmu_lock);
 318
 319         /*
 320          * Check for an existing root before allocating a new one.  Note, the
 321          * role check prevents consuming an invalid root.
 322          */
 323         for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
 324                 if (root->role.word == role.word &&
 325                     kvm_tdp_mmu_get_root(root))
 326                         goto out;
 327         }
 328
 329         root = tdp_mmu_alloc_sp(vcpu);
 330         tdp_mmu_init_sp(root, NULL, 0, role);
 331
 332         refcount_set(&root->tdp_mmu_root_count, 1);
 333
 334         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 335         list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
 336         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 337
 338 out:
 339         return __pa(root->spt);
 340 }
 341
 342 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 343                                 u64 old_spte, u64 new_spte, int level,
 344                                 bool shared);
 345
 346 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
 347 {
 348         if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
 349                 return;
 350
 351         if (is_accessed_spte(old_spte) &&
 352             (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
 353              spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
 354                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 355 }
 356
 357 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
 358                                           u64 old_spte, u64 new_spte, int level)
 359 {
 360         bool pfn_changed;
 361         struct kvm_memory_slot *slot;
 362
 363         if (level > PG_LEVEL_4K)
 364                 return;
 365
 366         pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
 367
 368         if ((!is_writable_pte(old_spte) || pfn_changed) &&
 369             is_writable_pte(new_spte)) {
 370                 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
 371                 mark_page_dirty_in_slot(kvm, slot, gfn);
 372         }
 373 }
 374
 375 /**
 376  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 377  *
 378  * @kvm: kvm instance
 379  * @sp: the page to be removed
 380  * @shared: This operation may not be running under the exclusive use of
 381  *          the MMU lock and the operation must synchronize with other
 382  *          threads that might be adding or removing pages.
 383  */
 384 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
 385                               bool shared)
 386 {
 387         if (shared)
 388                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 389         else
 390                 lockdep_assert_held_write(&kvm->mmu_lock);
 391
 392         list_del(&sp->link);
 393         if (sp->lpage_disallowed)
 394                 unaccount_huge_nx_page(kvm, sp);
 395
 396         if (shared)
 397                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 398 }
 399
 400 /**
 401  * handle_removed_pt() - handle a page table removed from the TDP structure
 402  *
 403  * @kvm: kvm instance
 404  * @pt: the page removed from the paging structure
 405  * @shared: This operation may not be running under the exclusive use
 406  *          of the MMU lock and the operation must synchronize with other
 407  *          threads that might be modifying SPTEs.
 408  *
 409  * Given a page table that has been removed from the TDP paging structure,
 410  * iterates through the page table to clear SPTEs and free child page tables.
 411  *
 412  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 413  * protection. Since this thread removed it from the paging structure,
 414  * this thread will be responsible for ensuring the page is freed. Hence the
 415  * early rcu_dereferences in the function.
 416  */
 417 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 418 {
 419         struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
 420         int level = sp->role.level;
 421         gfn_t base_gfn = sp->gfn;
 422         int i;
 423
 424         trace_kvm_mmu_prepare_zap_page(sp);
 425
 426         tdp_mmu_unlink_sp(kvm, sp, shared);
 427
 428         for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
 429                 tdp_ptep_t sptep = pt + i;
 430                 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
 431                 u64 old_spte;
 432
 433                 if (shared) {
 434                         /*
 435                          * Set the SPTE to a nonpresent value that other
 436                          * threads will not overwrite. If the SPTE was
 437                          * already marked as removed then another thread
 438                          * handling a page fault could overwrite it, so
 439                          * set the SPTE until it is set from some other
 440                          * value to the removed SPTE value.
 441                          */
 442                         for (;;) {
 443                                 old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
 444                                 if (!is_removed_spte(old_spte))
 445                                         break;
 446                                 cpu_relax();
 447                         }
 448                 } else {
 449                         /*
 450                          * If the SPTE is not MMU-present, there is no backing
 451                          * page associated with the SPTE and so no side effects
 452                          * that need to be recorded, and exclusive ownership of
 453                          * mmu_lock ensures the SPTE can't be made present.
 454                          * Note, zapping MMIO SPTEs is also unnecessary as they
 455                          * are guarded by the memslots generation, not by being
 456                          * unreachable.
 457                          */
 458                         old_spte = kvm_tdp_mmu_read_spte(sptep);
 459                         if (!is_shadow_present_pte(old_spte))
 460                                 continue;
 461
 462                         /*
 463                          * Use the common helper instead of a raw WRITE_ONCE as
 464                          * the SPTE needs to be updated atomically if it can be
 465                          * modified by a different vCPU outside of mmu_lock.
 466                          * Even though the parent SPTE is !PRESENT, the TLB
 467                          * hasn't yet been flushed, and both Intel and AMD
 468                          * document that A/D assists can use upper-level PxE
 469                          * entries that are cached in the TLB, i.e. the CPU can
 470                          * still access the page and mark it dirty.
 471                          *
 472                          * No retry is needed in the atomic update path as the
 473                          * sole concern is dropping a Dirty bit, i.e. no other
 474                          * task can zap/remove the SPTE as mmu_lock is held for
 475                          * write.  Marking the SPTE as a removed SPTE is not
 476                          * strictly necessary for the same reason, but using
 477                          * the remove SPTE value keeps the shared/exclusive
 478                          * paths consistent and allows the handle_changed_spte()
 479                          * call below to hardcode the new value to REMOVED_SPTE.
 480                          *
 481                          * Note, even though dropping a Dirty bit is the only
 482                          * scenario where a non-atomic update could result in a
 483                          * functional bug, simply checking the Dirty bit isn't
 484                          * sufficient as a fast page fault could read the upper
 485                          * level SPTE before it is zapped, and then make this
 486                          * target SPTE writable, resume the guest, and set the
 487                          * Dirty bit between reading the SPTE above and writing
 488                          * it here.
 489                          */
 490                         old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
 491                                                           REMOVED_SPTE, level);
 492                 }
 493                 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
 494                                     old_spte, REMOVED_SPTE, level, shared);
 495         }
 496
 497         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
 498 }
 499
 500 /**
 501  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 502  * @kvm: kvm instance
 503  * @as_id: the address space of the paging structure the SPTE was a part of
 504  * @gfn: the base GFN that was mapped by the SPTE
 505  * @old_spte: The value of the SPTE before the change
 506  * @new_spte: The value of the SPTE after the change
 507  * @level: the level of the PT the SPTE is part of in the paging structure
 508  * @shared: This operation may not be running under the exclusive use of
 509  *          the MMU lock and the operation must synchronize with other
 510  *          threads that might be modifying SPTEs.
 511  *
 512  * Handle bookkeeping that might result from the modification of a SPTE.
 513  * This function must be called for all TDP SPTE modifications.
 514  */
 515 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 516                                   u64 old_spte, u64 new_spte, int level,
 517                                   bool shared)
 518 {
 519         bool was_present = is_shadow_present_pte(old_spte);
 520         bool is_present = is_shadow_present_pte(new_spte);
 521         bool was_leaf = was_present && is_last_spte(old_spte, level);
 522         bool is_leaf = is_present && is_last_spte(new_spte, level);
 523         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
 524
 525         WARN_ON(level > PT64_ROOT_MAX_LEVEL);
 526         WARN_ON(level < PG_LEVEL_4K);
 527         WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
 528
 529         /*
 530          * If this warning were to trigger it would indicate that there was a
 531          * missing MMU notifier or a race with some notifier handler.
 532          * A present, leaf SPTE should never be directly replaced with another
 533          * present leaf SPTE pointing to a different PFN. A notifier handler
 534          * should be zapping the SPTE before the main MM's page table is
 535          * changed, or the SPTE should be zeroed, and the TLBs flushed by the
 536          * thread before replacement.
 537          */
 538         if (was_leaf && is_leaf && pfn_changed) {
 539                 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
 540                        "SPTE with another present leaf SPTE mapping a\n"
 541                        "different PFN!\n"
 542                        "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
 543                        as_id, gfn, old_spte, new_spte, level);
 544
 545                 /*
 546                  * Crash the host to prevent error propagation and guest data
 547                  * corruption.
 548                  */
 549                 BUG();
 550         }
 551
 552         if (old_spte == new_spte)
 553                 return;
 554
 555         trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
 556
 557         if (is_leaf)
 558                 check_spte_writable_invariants(new_spte);
 559
 560         /*
 561          * The only times a SPTE should be changed from a non-present to
 562          * non-present state is when an MMIO entry is installed/modified/
 563          * removed. In that case, there is nothing to do here.
 564          */
 565         if (!was_present && !is_present) {
 566                 /*
 567                  * If this change does not involve a MMIO SPTE or removed SPTE,
 568                  * it is unexpected. Log the change, though it should not
 569                  * impact the guest since both the former and current SPTEs
 570                  * are nonpresent.
 571                  */
 572                 if (WARN_ON(!is_mmio_spte(old_spte) &&
 573                             !is_mmio_spte(new_spte) &&
 574                             !is_removed_spte(new_spte)))
 575                         pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
 576                                "should not be replaced with another,\n"
 577                                "different nonpresent SPTE, unless one or both\n"
 578                                "are MMIO SPTEs, or the new SPTE is\n"
 579                                "a temporary removed SPTE.\n"
 580                                "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
 581                                as_id, gfn, old_spte, new_spte, level);
 582                 return;
 583         }
 584
 585         if (is_leaf != was_leaf)
 586                 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
 587
 588         if (was_leaf && is_dirty_spte(old_spte) &&
 589             (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
 590                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 591
 592         /*
 593          * Recursively handle child PTs if the change removed a subtree from
 594          * the paging structure.  Note the WARN on the PFN changing without the
 595          * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
 596          * pages are kernel allocations and should never be migrated.
 597          */
 598         if (was_present && !was_leaf &&
 599             (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
 600                 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
 601 }
 602
 603 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 604                                 u64 old_spte, u64 new_spte, int level,
 605                                 bool shared)
 606 {
 607         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
 608                               shared);
 609         handle_changed_spte_acc_track(old_spte, new_spte, level);
 610         handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
 611                                       new_spte, level);
 612 }
 613
 614 /*
 615  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 616  * and handle the associated bookkeeping.  Do not mark the page dirty
 617  * in KVM's dirty bitmaps.
 618  *
 619  * If setting the SPTE fails because it has changed, iter->old_spte will be
 620  * refreshed to the current value of the spte.
 621  *
 622  * @kvm: kvm instance
 623  * @iter: a tdp_iter instance currently on the SPTE that should be set
 624  * @new_spte: The value the SPTE should be set to
 625  * Return:
 626  * * 0      - If the SPTE was set.
 627  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 628  *            no side-effects other than setting iter->old_spte to the last
 629  *            known value of the spte.
 630  */
 631 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
 632                                           struct tdp_iter *iter,
 633                                           u64 new_spte)
 634 {
 635         u64 *sptep = rcu_dereference(iter->sptep);
 636         u64 old_spte;
 637
 638         /*
 639          * The caller is responsible for ensuring the old SPTE is not a REMOVED
 640          * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
 641          * and pre-checking before inserting a new SPTE is advantageous as it
 642          * avoids unnecessary work.
 643          */
 644         WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
 645
 646         lockdep_assert_held_read(&kvm->mmu_lock);
 647
 648         /*
 649          * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
 650          * does not hold the mmu_lock.
 651          */
 652         old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
 653         if (old_spte != iter->old_spte) {
 654                 /*
 655                  * The page table entry was modified by a different logical
 656                  * CPU. Refresh iter->old_spte with the current value so the
 657                  * caller operates on fresh data, e.g. if it retries
 658                  * tdp_mmu_set_spte_atomic().
 659                  */
 660                 iter->old_spte = old_spte;
 661                 return -EBUSY;
 662         }
 663
 664         __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
 665                               new_spte, iter->level, true);
 666         handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
 667
 668         return 0;
 669 }
 670
 671 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
 672                                           struct tdp_iter *iter)
 673 {
 674         int ret;
 675
 676         /*
 677          * Freeze the SPTE by setting it to a special,
 678          * non-present value. This will stop other threads from
 679          * immediately installing a present entry in its place
 680          * before the TLBs are flushed.
 681          */
 682         ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
 683         if (ret)
 684                 return ret;
 685
 686         kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
 687                                            KVM_PAGES_PER_HPAGE(iter->level));
 688
 689         /*
 690          * No other thread can overwrite the removed SPTE as they must either
 691          * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
 692          * overwrite the special removed SPTE value. No bookkeeping is needed
 693          * here since the SPTE is going from non-present to non-present.  Use
 694          * the raw write helper to avoid an unnecessary check on volatile bits.
 695          */
 696         __kvm_tdp_mmu_write_spte(iter->sptep, 0);
 697
 698         return 0;
 699 }
 700
 701
 702 /*
 703  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 704  * @kvm:              KVM instance
 705  * @as_id:            Address space ID, i.e. regular vs. SMM
 706  * @sptep:            Pointer to the SPTE
 707  * @old_spte:         The current value of the SPTE
 708  * @new_spte:         The new value that will be set for the SPTE
 709  * @gfn:              The base GFN that was (or will be) mapped by the SPTE
 710  * @level:            The level _containing_ the SPTE (its parent PT's level)
 711  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 712  *                    of the page. Should be set unless handling an MMU
 713  *                    notifier for access tracking. Leaving record_acc_track
 714  *                    unset in that case prevents page accesses from being
 715  *                    double counted.
 716  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 717  *                    appropriate for the change being made. Should be set
 718  *                    unless performing certain dirty logging operations.
 719  *                    Leaving record_dirty_log unset in that case prevents page
 720  *                    writes from being double counted.
 721  *
 722  * Returns the old SPTE value, which _may_ be different than @old_spte if the
 723  * SPTE had voldatile bits.
 724  */
 725 static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 726                               u64 old_spte, u64 new_spte, gfn_t gfn, int level,
 727                               bool record_acc_track, bool record_dirty_log)
 728 {
 729         lockdep_assert_held_write(&kvm->mmu_lock);
 730
 731         /*
 732          * No thread should be using this function to set SPTEs to or from the
 733          * temporary removed SPTE value.
 734          * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
 735          * should be used. If operating under the MMU lock in write mode, the
 736          * use of the removed SPTE should not be necessary.
 737          */
 738         WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
 739
 740         old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
 741
 742         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
 743
 744         if (record_acc_track)
 745                 handle_changed_spte_acc_track(old_spte, new_spte, level);
 746         if (record_dirty_log)
 747                 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
 748                                               new_spte, level);
 749         return old_spte;
 750 }
 751
 752 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 753                                      u64 new_spte, bool record_acc_track,
 754                                      bool record_dirty_log)
 755 {
 756         WARN_ON_ONCE(iter->yielded);
 757
 758         iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
 759                                             iter->old_spte, new_spte,
 760                                             iter->gfn, iter->level,
 761                                             record_acc_track, record_dirty_log);
 762 }
 763
 764 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 765                                     u64 new_spte)
 766 {
 767         _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
 768 }
 769
 770 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
 771                                                  struct tdp_iter *iter,
 772                                                  u64 new_spte)
 773 {
 774         _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
 775 }
 776
 777 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
 778                                                  struct tdp_iter *iter,
 779                                                  u64 new_spte)
 780 {
 781         _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
 782 }
 783
 784 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
 785         for_each_tdp_pte(_iter, _root, _start, _end)
 786
 787 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)  \
 788         tdp_root_for_each_pte(_iter, _root, _start, _end)               \
 789                 if (!is_shadow_present_pte(_iter.old_spte) ||           \
 790                     !is_last_spte(_iter.old_spte, _iter.level))         \
 791                         continue;                                       \
 792                 else
 793
 794 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
 795         for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
 796
 797 /*
 798  * Yield if the MMU lock is contended or this thread needs to return control
 799  * to the scheduler.
 800  *
 801  * If this function should yield and flush is set, it will perform a remote
 802  * TLB flush before yielding.
 803  *
 804  * If this function yields, iter->yielded is set and the caller must skip to
 805  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 806  * over the paging structures to allow the iterator to continue its traversal
 807  * from the paging structure root.
 808  *
 809  * Returns true if this function yielded.
 810  */
 811 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
 812                                                           struct tdp_iter *iter,
 813                                                           bool flush, bool shared)
 814 {
 815         WARN_ON(iter->yielded);
 816
 817         /* Ensure forward progress has been made before yielding. */
 818         if (iter->next_last_level_gfn == iter->yielded_gfn)
 819                 return false;
 820
 821         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
 822                 if (flush)
 823                         kvm_flush_remote_tlbs(kvm);
 824
 825                 rcu_read_unlock();
 826
 827                 if (shared)
 828                         cond_resched_rwlock_read(&kvm->mmu_lock);
 829                 else
 830                         cond_resched_rwlock_write(&kvm->mmu_lock);
 831
 832                 rcu_read_lock();
 833
 834                 WARN_ON(iter->gfn > iter->next_last_level_gfn);
 835
 836                 iter->yielded = true;
 837         }
 838
 839         return iter->yielded;
 840 }
 841
 842 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
 843 {
 844         /*
 845          * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
 846          * a gpa range that would exceed the max gfn, and KVM does not create
 847          * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
 848          * the slow emulation path every time.
 849          */
 850         return kvm_mmu_max_gfn() + 1;
 851 }
 852
 853 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
 854                                bool shared, int zap_level)
 855 {
 856         struct tdp_iter iter;
 857
 858         gfn_t end = tdp_mmu_max_gfn_exclusive();
 859         gfn_t start = 0;
 860
 861         for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
 862 retry:
 863                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
 864                         continue;
 865
 866                 if (!is_shadow_present_pte(iter.old_spte))
 867                         continue;
 868
 869                 if (iter.level > zap_level)
 870                         continue;
 871
 872                 if (!shared)
 873                         tdp_mmu_set_spte(kvm, &iter, 0);
 874                 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
 875                         goto retry;
 876         }
 877 }
 878
 879 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
 880                              bool shared)
 881 {
 882
 883         /*
 884          * The root must have an elevated refcount so that it's reachable via
 885          * mmu_notifier callbacks, which allows this path to yield and drop
 886          * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
 887          * must drop all references to relevant pages prior to completing the
 888          * callback.  Dropping mmu_lock with an unreachable root would result
 889          * in zapping SPTEs after a relevant mmu_notifier callback completes
 890          * and lead to use-after-free as zapping a SPTE triggers "writeback" of
 891          * dirty accessed bits to the SPTE's associated struct page.
 892          */
 893         WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
 894
 895         kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 896
 897         rcu_read_lock();
 898
 899         /*
 900          * To avoid RCU stalls due to recursively removing huge swaths of SPs,
 901          * split the zap into two passes.  On the first pass, zap at the 1gb
 902          * level, and then zap top-level SPs on the second pass.  "1gb" is not
 903          * arbitrary, as KVM must be able to zap a 1gb shadow page without
 904          * inducing a stall to allow in-place replacement with a 1gb hugepage.
 905          *
 906          * Because zapping a SP recurses on its children, stepping down to
 907          * PG_LEVEL_4K in the iterator itself is unnecessary.
 908          */
 909         __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
 910         __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
 911
 912         rcu_read_unlock();
 913 }
 914
 915 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 916 {
 917         u64 old_spte;
 918
 919         /*
 920          * This helper intentionally doesn't allow zapping a root shadow page,
 921          * which doesn't have a parent page table and thus no associated entry.
 922          */
 923         if (WARN_ON_ONCE(!sp->ptep))
 924                 return false;
 925
 926         old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
 927         if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
 928                 return false;
 929
 930         __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
 931                            sp->gfn, sp->role.level + 1, true, true);
 932
 933         return true;
 934 }
 935
 936 /*
 937  * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
 938  * have been cleared and a TLB flush is needed before releasing the MMU lock.
 939  *
 940  * If can_yield is true, will release the MMU lock and reschedule if the
 941  * scheduler needs the CPU or there is contention on the MMU lock. If this
 942  * function cannot yield, it will not release the MMU lock or reschedule and
 943  * the caller must ensure it does not supply too large a GFN range, or the
 944  * operation can cause a soft lockup.
 945  */
 946 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
 947                               gfn_t start, gfn_t end, bool can_yield, bool flush)
 948 {
 949         struct tdp_iter iter;
 950
 951         end = min(end, tdp_mmu_max_gfn_exclusive());
 952
 953         lockdep_assert_held_write(&kvm->mmu_lock);
 954
 955         rcu_read_lock();
 956
 957         for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
 958                 if (can_yield &&
 959                     tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
 960                         flush = false;
 961                         continue;
 962                 }
 963
 964                 if (!is_shadow_present_pte(iter.old_spte) ||
 965                     !is_last_spte(iter.old_spte, iter.level))
 966                         continue;
 967
 968                 tdp_mmu_set_spte(kvm, &iter, 0);
 969                 flush = true;
 970         }
 971
 972         rcu_read_unlock();
 973
 974         /*
 975          * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
 976          * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
 977          */
 978         return flush;
 979 }
 980
 981 /*
 982  * Tears down the mappings for the range of gfns, [start, end), and frees the
 983  * non-root pages mapping GFNs strictly within that range. Returns true if
 984  * SPTEs have been cleared and a TLB flush is needed before releasing the
 985  * MMU lock.
 986  */
 987 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
 988                            bool can_yield, bool flush)
 989 {
 990         struct kvm_mmu_page *root;
 991
 992         for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
 993                 flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
 994
 995         return flush;
 996 }
 997
 998 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
 999 {
1000         struct kvm_mmu_page *root;
1001         int i;
1002
1003         /*
1004          * Zap all roots, including invalid roots, as all SPTEs must be dropped
1005          * before returning to the caller.  Zap directly even if the root is
1006          * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
1007          * all that expensive and mmu_lock is already held, which means the
1008          * worker has yielded, i.e. flushing the work instead of zapping here
1009          * isn't guaranteed to be any faster.
1010          *
1011          * A TLB flush is unnecessary, KVM zaps everything if and only the VM
1012          * is being destroyed or the userspace VMM has exited.  In both cases,
1013          * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1014          */
1015         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1016                 for_each_tdp_mmu_root_yield_safe(kvm, root, i)
1017                         tdp_mmu_zap_root(kvm, root, false);
1018         }
1019 }
1020
1021 /*
1022  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1023  * zap" completes.
1024  */
1025 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1026 {
1027         flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1028 }
1029
1030 /*
1031  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1032  * is about to be zapped, e.g. in response to a memslots update.  The actual
1033  * zapping is performed asynchronously, so a reference is taken on all roots.
1034  * Using a separate workqueue makes it easy to ensure that the destruction is
1035  * performed before the "fast zap" completes, without keeping a separate list
1036  * of invalidated roots; the list is effectively the list of work items in
1037  * the workqueue.
1038  *
1039  * Get a reference even if the root is already invalid, the asynchronous worker
1040  * assumes it was gifted a reference to the root it processes.  Because mmu_lock
1041  * is held for write, it should be impossible to observe a root with zero refcount,
1042  * i.e. the list of roots cannot be stale.
1043  *
1044  * This has essentially the same effect for the TDP MMU
1045  * as updating mmu_valid_gen does for the shadow MMU.
1046  */
1047 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1048 {
1049         struct kvm_mmu_page *root;
1050
1051         lockdep_assert_held_write(&kvm->mmu_lock);
1052         list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1053                 if (!root->role.invalid &&
1054                     !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1055                         root->role.invalid = true;
1056                         tdp_mmu_schedule_zap_root(kvm, root);
1057                 }
1058         }
1059 }
1060
1061 /*
1062  * Installs a last-level SPTE to handle a TDP page fault.
1063  * (NPT/EPT violation/misconfiguration)
1064  */
1065 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1066                                           struct kvm_page_fault *fault,
1067                                           struct tdp_iter *iter)
1068 {
1069         struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1070         u64 new_spte;
1071         int ret = RET_PF_FIXED;
1072         bool wrprot = false;
1073
1074         WARN_ON(sp->role.level != fault->goal_level);
1075         if (unlikely(!fault->slot))
1076                 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1077         else
1078                 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1079                                          fault->pfn, iter->old_spte, fault->prefetch, true,
1080                                          fault->map_writable, &new_spte);
1081
1082         if (new_spte == iter->old_spte)
1083                 ret = RET_PF_SPURIOUS;
1084         else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1085                 return RET_PF_RETRY;
1086         else if (is_shadow_present_pte(iter->old_spte) &&
1087                  !is_last_spte(iter->old_spte, iter->level))
1088                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1089                                                    KVM_PAGES_PER_HPAGE(iter->level + 1));
1090
1091         /*
1092          * If the page fault was caused by a write but the page is write
1093          * protected, emulation is needed. If the emulation was skipped,
1094          * the vCPU would have the same fault again.
1095          */
1096         if (wrprot) {
1097                 if (fault->write)
1098                         ret = RET_PF_EMULATE;
1099         }
1100
1101         /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1102         if (unlikely(is_mmio_spte(new_spte))) {
1103                 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1104                                      new_spte);
1105                 ret = RET_PF_EMULATE;
1106         } else {
1107                 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1108                                        rcu_dereference(iter->sptep));
1109         }
1110
1111         /*
1112          * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
1113          * consistent with legacy MMU behavior.
1114          */
1115         if (ret != RET_PF_SPURIOUS)
1116                 vcpu->stat.pf_fixed++;
1117
1118         return ret;
1119 }
1120
1121 /*
1122  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1123  * provided page table.
1124  *
1125  * @kvm: kvm instance
1126  * @iter: a tdp_iter instance currently on the SPTE that should be set
1127  * @sp: The new TDP page table to install.
1128  * @account_nx: True if this page table is being installed to split a
1129  *              non-executable huge page.
1130  * @shared: This operation is running under the MMU lock in read mode.
1131  *
1132  * Returns: 0 if the new page table was installed. Non-0 if the page table
1133  *          could not be installed (e.g. the atomic compare-exchange failed).
1134  */
1135 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1136                            struct kvm_mmu_page *sp, bool account_nx,
1137                            bool shared)
1138 {
1139         u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
1140         int ret = 0;
1141
1142         if (shared) {
1143                 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1144                 if (ret)
1145                         return ret;
1146         } else {
1147                 tdp_mmu_set_spte(kvm, iter, spte);
1148         }
1149
1150         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1151         list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1152         if (account_nx)
1153                 account_huge_nx_page(kvm, sp);
1154         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1155
1156         return 0;
1157 }
1158
1159 /*
1160  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1161  * page tables and SPTEs to translate the faulting guest physical address.
1162  */
1163 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1164 {
1165         struct kvm_mmu *mmu = vcpu->arch.mmu;
1166         struct tdp_iter iter;
1167         struct kvm_mmu_page *sp;
1168         int ret;
1169
1170         kvm_mmu_hugepage_adjust(vcpu, fault);
1171
1172         trace_kvm_mmu_spte_requested(fault);
1173
1174         rcu_read_lock();
1175
1176         tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1177                 if (fault->nx_huge_page_workaround_enabled)
1178                         disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1179
1180                 if (iter.level == fault->goal_level)
1181                         break;
1182
1183                 /*
1184                  * If there is an SPTE mapping a large page at a higher level
1185                  * than the target, that SPTE must be cleared and replaced
1186                  * with a non-leaf SPTE.
1187                  */
1188                 if (is_shadow_present_pte(iter.old_spte) &&
1189                     is_large_pte(iter.old_spte)) {
1190                         if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1191                                 break;
1192
1193                         /*
1194                          * The iter must explicitly re-read the spte here
1195                          * because the new value informs the !present
1196                          * path below.
1197                          */
1198                         iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
1199                 }
1200
1201                 if (!is_shadow_present_pte(iter.old_spte)) {
1202                         bool account_nx = fault->huge_page_disallowed &&
1203                                           fault->req_level >= iter.level;
1204
1205                         /*
1206                          * If SPTE has been frozen by another thread, just
1207                          * give up and retry, avoiding unnecessary page table
1208                          * allocation and free.
1209                          */
1210                         if (is_removed_spte(iter.old_spte))
1211                                 break;
1212
1213                         sp = tdp_mmu_alloc_sp(vcpu);
1214                         tdp_mmu_init_child_sp(sp, &iter);
1215
1216                         if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
1217                                 tdp_mmu_free_sp(sp);
1218                                 break;
1219                         }
1220                 }
1221         }
1222
1223         /*
1224          * Force the guest to retry the access if the upper level SPTEs aren't
1225          * in place, or if the target leaf SPTE is frozen by another CPU.
1226          */
1227         if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
1228                 rcu_read_unlock();
1229                 return RET_PF_RETRY;
1230         }
1231
1232         ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1233         rcu_read_unlock();
1234
1235         return ret;
1236 }
1237
1238 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1239                                  bool flush)
1240 {
1241         return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1242                                      range->end, range->may_block, flush);
1243 }
1244
1245 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1246                               struct kvm_gfn_range *range);
1247
1248 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1249                                                    struct kvm_gfn_range *range,
1250                                                    tdp_handler_t handler)
1251 {
1252         struct kvm_mmu_page *root;
1253         struct tdp_iter iter;
1254         bool ret = false;
1255
1256         /*
1257          * Don't support rescheduling, none of the MMU notifiers that funnel
1258          * into this helper allow blocking; it'd be dead, wasteful code.
1259          */
1260         for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1261                 rcu_read_lock();
1262
1263                 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1264                         ret |= handler(kvm, &iter, range);
1265
1266                 rcu_read_unlock();
1267         }
1268
1269         return ret;
1270 }
1271
1272 /*
1273  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1274  * if any of the GFNs in the range have been accessed.
1275  */
1276 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1277                           struct kvm_gfn_range *range)
1278 {
1279         u64 new_spte = 0;
1280
1281         /* If we have a non-accessed entry we don't need to change the pte. */
1282         if (!is_accessed_spte(iter->old_spte))
1283                 return false;
1284
1285         new_spte = iter->old_spte;
1286
1287         if (spte_ad_enabled(new_spte)) {
1288                 new_spte &= ~shadow_accessed_mask;
1289         } else {
1290                 /*
1291                  * Capture the dirty status of the page, so that it doesn't get
1292                  * lost when the SPTE is marked for access tracking.
1293                  */
1294                 if (is_writable_pte(new_spte))
1295                         kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1296
1297                 new_spte = mark_spte_for_access_track(new_spte);
1298         }
1299
1300         tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1301
1302         return true;
1303 }
1304
1305 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1306 {
1307         return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1308 }
1309
1310 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1311                          struct kvm_gfn_range *range)
1312 {
1313         return is_accessed_spte(iter->old_spte);
1314 }
1315
1316 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1317 {
1318         return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1319 }
1320
1321 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1322                          struct kvm_gfn_range *range)
1323 {
1324         u64 new_spte;
1325
1326         /* Huge pages aren't expected to be modified without first being zapped. */
1327         WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1328
1329         if (iter->level != PG_LEVEL_4K ||
1330             !is_shadow_present_pte(iter->old_spte))
1331                 return false;
1332
1333         /*
1334          * Note, when changing a read-only SPTE, it's not strictly necessary to
1335          * zero the SPTE before setting the new PFN, but doing so preserves the
1336          * invariant that the PFN of a present * leaf SPTE can never change.
1337          * See __handle_changed_spte().
1338          */
1339         tdp_mmu_set_spte(kvm, iter, 0);
1340
1341         if (!pte_write(range->pte)) {
1342                 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1343                                                                   pte_pfn(range->pte));
1344
1345                 tdp_mmu_set_spte(kvm, iter, new_spte);
1346         }
1347
1348         return true;
1349 }
1350
1351 /*
1352  * Handle the changed_pte MMU notifier for the TDP MMU.
1353  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1354  * notifier.
1355  * Returns non-zero if a flush is needed before releasing the MMU lock.
1356  */
1357 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1358 {
1359         /*
1360          * No need to handle the remote TLB flush under RCU protection, the
1361          * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1362          * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
1363          */
1364         return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1365 }
1366
1367 /*
1368  * Remove write access from all SPTEs at or above min_level that map GFNs
1369  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1370  * be flushed.
1371  */
1372 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1373                              gfn_t start, gfn_t end, int min_level)
1374 {
1375         struct tdp_iter iter;
1376         u64 new_spte;
1377         bool spte_set = false;
1378
1379         rcu_read_lock();
1380
1381         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1382
1383         for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1384 retry:
1385                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1386                         continue;
1387
1388                 if (!is_shadow_present_pte(iter.old_spte) ||
1389                     !is_last_spte(iter.old_spte, iter.level) ||
1390                     !(iter.old_spte & PT_WRITABLE_MASK))
1391                         continue;
1392
1393                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1394
1395                 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1396                         goto retry;
1397
1398                 spte_set = true;
1399         }
1400
1401         rcu_read_unlock();
1402         return spte_set;
1403 }
1404
1405 /*
1406  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1407  * only affect leaf SPTEs down to min_level.
1408  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1409  */
1410 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1411                              const struct kvm_memory_slot *slot, int min_level)
1412 {
1413         struct kvm_mmu_page *root;
1414         bool spte_set = false;
1415
1416         lockdep_assert_held_read(&kvm->mmu_lock);
1417
1418         for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1419                 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1420                              slot->base_gfn + slot->npages, min_level);
1421
1422         return spte_set;
1423 }
1424
1425 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1426 {
1427         struct kvm_mmu_page *sp;
1428
1429         gfp |= __GFP_ZERO;
1430
1431         sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1432         if (!sp)
1433                 return NULL;
1434
1435         sp->spt = (void *)__get_free_page(gfp);
1436         if (!sp->spt) {
1437                 kmem_cache_free(mmu_page_header_cache, sp);
1438                 return NULL;
1439         }
1440
1441         return sp;
1442 }
1443
1444 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1445                                                        struct tdp_iter *iter,
1446                                                        bool shared)
1447 {
1448         struct kvm_mmu_page *sp;
1449
1450         /*
1451          * Since we are allocating while under the MMU lock we have to be
1452          * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1453          * reclaim and to avoid making any filesystem callbacks (which can end
1454          * up invoking KVM MMU notifiers, resulting in a deadlock).
1455          *
1456          * If this allocation fails we drop the lock and retry with reclaim
1457          * allowed.
1458          */
1459         sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1460         if (sp)
1461                 return sp;
1462
1463         rcu_read_unlock();
1464
1465         if (shared)
1466                 read_unlock(&kvm->mmu_lock);
1467         else
1468                 write_unlock(&kvm->mmu_lock);
1469
1470         iter->yielded = true;
1471         sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1472
1473         if (shared)
1474                 read_lock(&kvm->mmu_lock);
1475         else
1476                 write_lock(&kvm->mmu_lock);
1477
1478         rcu_read_lock();
1479
1480         return sp;
1481 }
1482
1483 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1484                                    struct kvm_mmu_page *sp, bool shared)
1485 {
1486         const u64 huge_spte = iter->old_spte;
1487         const int level = iter->level;
1488         int ret, i;
1489
1490         tdp_mmu_init_child_sp(sp, iter);
1491
1492         /*
1493          * No need for atomics when writing to sp->spt since the page table has
1494          * not been linked in yet and thus is not reachable from any other CPU.
1495          */
1496         for (i = 0; i < PT64_ENT_PER_PAGE; i++)
1497                 sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
1498
1499         /*
1500          * Replace the huge spte with a pointer to the populated lower level
1501          * page table. Since we are making this change without a TLB flush vCPUs
1502          * will see a mix of the split mappings and the original huge mapping,
1503          * depending on what's currently in their TLB. This is fine from a
1504          * correctness standpoint since the translation will be the same either
1505          * way.
1506          */
1507         ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
1508         if (ret)
1509                 goto out;
1510
1511         /*
1512          * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
1513          * are overwriting from the page stats. But we have to manually update
1514          * the page stats with the new present child pages.
1515          */
1516         kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
1517
1518 out:
1519         trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1520         return ret;
1521 }
1522
1523 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1524                                          struct kvm_mmu_page *root,
1525                                          gfn_t start, gfn_t end,
1526                                          int target_level, bool shared)
1527 {
1528         struct kvm_mmu_page *sp = NULL;
1529         struct tdp_iter iter;
1530         int ret = 0;
1531
1532         rcu_read_lock();
1533
1534         /*
1535          * Traverse the page table splitting all huge pages above the target
1536          * level into one lower level. For example, if we encounter a 1GB page
1537          * we split it into 512 2MB pages.
1538          *
1539          * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1540          * to visit an SPTE before ever visiting its children, which means we
1541          * will correctly recursively split huge pages that are more than one
1542          * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1543          * and then splitting each of those to 512 4KB pages).
1544          */
1545         for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1546 retry:
1547                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1548                         continue;
1549
1550                 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1551                         continue;
1552
1553                 if (!sp) {
1554                         sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1555                         if (!sp) {
1556                                 ret = -ENOMEM;
1557                                 trace_kvm_mmu_split_huge_page(iter.gfn,
1558                                                               iter.old_spte,
1559                                                               iter.level, ret);
1560                                 break;
1561                         }
1562
1563                         if (iter.yielded)
1564                                 continue;
1565                 }
1566
1567                 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1568                         goto retry;
1569
1570                 sp = NULL;
1571         }
1572
1573         rcu_read_unlock();
1574
1575         /*
1576          * It's possible to exit the loop having never used the last sp if, for
1577          * example, a vCPU doing HugePage NX splitting wins the race and
1578          * installs its own sp in place of the last sp we tried to split.
1579          */
1580         if (sp)
1581                 tdp_mmu_free_sp(sp);
1582
1583         return ret;
1584 }
1585
1586
1587 /*
1588  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1589  */
1590 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1591                                       const struct kvm_memory_slot *slot,
1592                                       gfn_t start, gfn_t end,
1593                                       int target_level, bool shared)
1594 {
1595         struct kvm_mmu_page *root;
1596         int r = 0;
1597
1598         kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1599
1600         for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1601                 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1602                 if (r) {
1603                         kvm_tdp_mmu_put_root(kvm, root, shared);
1604                         break;
1605                 }
1606         }
1607 }
1608
1609 /*
1610  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1611  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1612  * If AD bits are not enabled, this will require clearing the writable bit on
1613  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1614  * be flushed.
1615  */
1616 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1617                            gfn_t start, gfn_t end)
1618 {
1619         struct tdp_iter iter;
1620         u64 new_spte;
1621         bool spte_set = false;
1622
1623         rcu_read_lock();
1624
1625         tdp_root_for_each_leaf_pte(iter, root, start, end) {
1626 retry:
1627                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1628                         continue;
1629
1630                 if (!is_shadow_present_pte(iter.old_spte))
1631                         continue;
1632
1633                 if (spte_ad_need_write_protect(iter.old_spte)) {
1634                         if (is_writable_pte(iter.old_spte))
1635                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1636                         else
1637                                 continue;
1638                 } else {
1639                         if (iter.old_spte & shadow_dirty_mask)
1640                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1641                         else
1642                                 continue;
1643                 }
1644
1645                 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1646                         goto retry;
1647
1648                 spte_set = true;
1649         }
1650
1651         rcu_read_unlock();
1652         return spte_set;
1653 }
1654
1655 /*
1656  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1657  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1658  * If AD bits are not enabled, this will require clearing the writable bit on
1659  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1660  * be flushed.
1661  */
1662 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1663                                   const struct kvm_memory_slot *slot)
1664 {
1665         struct kvm_mmu_page *root;
1666         bool spte_set = false;
1667
1668         lockdep_assert_held_read(&kvm->mmu_lock);
1669
1670         for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1671                 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1672                                 slot->base_gfn + slot->npages);
1673
1674         return spte_set;
1675 }
1676
1677 /*
1678  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1679  * set in mask, starting at gfn. The given memslot is expected to contain all
1680  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1681  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1682  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1683  */
1684 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1685                                   gfn_t gfn, unsigned long mask, bool wrprot)
1686 {
1687         struct tdp_iter iter;
1688         u64 new_spte;
1689
1690         rcu_read_lock();
1691
1692         tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1693                                     gfn + BITS_PER_LONG) {
1694                 if (!mask)
1695                         break;
1696
1697                 if (iter.level > PG_LEVEL_4K ||
1698                     !(mask & (1UL << (iter.gfn - gfn))))
1699                         continue;
1700
1701                 mask &= ~(1UL << (iter.gfn - gfn));
1702
1703                 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1704                         if (is_writable_pte(iter.old_spte))
1705                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1706                         else
1707                                 continue;
1708                 } else {
1709                         if (iter.old_spte & shadow_dirty_mask)
1710                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1711                         else
1712                                 continue;
1713                 }
1714
1715                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1716         }
1717
1718         rcu_read_unlock();
1719 }
1720
1721 /*
1722  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1723  * set in mask, starting at gfn. The given memslot is expected to contain all
1724  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1725  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1726  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1727  */
1728 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1729                                        struct kvm_memory_slot *slot,
1730                                        gfn_t gfn, unsigned long mask,
1731                                        bool wrprot)
1732 {
1733         struct kvm_mmu_page *root;
1734
1735         lockdep_assert_held_write(&kvm->mmu_lock);
1736         for_each_tdp_mmu_root(kvm, root, slot->as_id)
1737                 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1738 }
1739
1740 /*
1741  * Clear leaf entries which could be replaced by large mappings, for
1742  * GFNs within the slot.
1743  */
1744 static void zap_collapsible_spte_range(struct kvm *kvm,
1745                                        struct kvm_mmu_page *root,
1746                                        const struct kvm_memory_slot *slot)
1747 {
1748         gfn_t start = slot->base_gfn;
1749         gfn_t end = start + slot->npages;
1750         struct tdp_iter iter;
1751         kvm_pfn_t pfn;
1752
1753         rcu_read_lock();
1754
1755         tdp_root_for_each_pte(iter, root, start, end) {
1756 retry:
1757                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1758                         continue;
1759
1760                 if (!is_shadow_present_pte(iter.old_spte) ||
1761                     !is_last_spte(iter.old_spte, iter.level))
1762                         continue;
1763
1764                 pfn = spte_to_pfn(iter.old_spte);
1765                 if (kvm_is_reserved_pfn(pfn) ||
1766                     iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1767                                                             pfn, PG_LEVEL_NUM))
1768                         continue;
1769
1770                 /* Note, a successful atomic zap also does a remote TLB flush. */
1771                 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1772                         goto retry;
1773         }
1774
1775         rcu_read_unlock();
1776 }
1777
1778 /*
1779  * Clear non-leaf entries (and free associated page tables) which could
1780  * be replaced by large mappings, for GFNs within the slot.
1781  */
1782 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1783                                        const struct kvm_memory_slot *slot)
1784 {
1785         struct kvm_mmu_page *root;
1786
1787         lockdep_assert_held_read(&kvm->mmu_lock);
1788
1789         for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1790                 zap_collapsible_spte_range(kvm, root, slot);
1791 }
1792
1793 /*
1794  * Removes write access on the last level SPTE mapping this GFN and unsets the
1795  * MMU-writable bit to ensure future writes continue to be intercepted.
1796  * Returns true if an SPTE was set and a TLB flush is needed.
1797  */
1798 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1799                               gfn_t gfn, int min_level)
1800 {
1801         struct tdp_iter iter;
1802         u64 new_spte;
1803         bool spte_set = false;
1804
1805         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1806
1807         rcu_read_lock();
1808
1809         for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1810                 if (!is_shadow_present_pte(iter.old_spte) ||
1811                     !is_last_spte(iter.old_spte, iter.level))
1812                         continue;
1813
1814                 new_spte = iter.old_spte &
1815                         ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1816
1817                 if (new_spte == iter.old_spte)
1818                         break;
1819
1820                 tdp_mmu_set_spte(kvm, &iter, new_spte);
1821                 spte_set = true;
1822         }
1823
1824         rcu_read_unlock();
1825
1826         return spte_set;
1827 }
1828
1829 /*
1830  * Removes write access on the last level SPTE mapping this GFN and unsets the
1831  * MMU-writable bit to ensure future writes continue to be intercepted.
1832  * Returns true if an SPTE was set and a TLB flush is needed.
1833  */
1834 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1835                                    struct kvm_memory_slot *slot, gfn_t gfn,
1836                                    int min_level)
1837 {
1838         struct kvm_mmu_page *root;
1839         bool spte_set = false;
1840
1841         lockdep_assert_held_write(&kvm->mmu_lock);
1842         for_each_tdp_mmu_root(kvm, root, slot->as_id)
1843                 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1844
1845         return spte_set;
1846 }
1847
1848 /*
1849  * Return the level of the lowest level SPTE added to sptes.
1850  * That SPTE may be non-present.
1851  *
1852  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1853  */
1854 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1855                          int *root_level)
1856 {
1857         struct tdp_iter iter;
1858         struct kvm_mmu *mmu = vcpu->arch.mmu;
1859         gfn_t gfn = addr >> PAGE_SHIFT;
1860         int leaf = -1;
1861
1862         *root_level = vcpu->arch.mmu->shadow_root_level;
1863
1864         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1865                 leaf = iter.level;
1866                 sptes[leaf] = iter.old_spte;
1867         }
1868
1869         return leaf;
1870 }
1871
1872 /*
1873  * Returns the last level spte pointer of the shadow page walk for the given
1874  * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
1875  * walk could be performed, returns NULL and *spte does not contain valid data.
1876  *
1877  * Contract:
1878  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1879  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1880  *
1881  * WARNING: This function is only intended to be called during fast_page_fault.
1882  */
1883 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1884                                         u64 *spte)
1885 {
1886         struct tdp_iter iter;
1887         struct kvm_mmu *mmu = vcpu->arch.mmu;
1888         gfn_t gfn = addr >> PAGE_SHIFT;
1889         tdp_ptep_t sptep = NULL;
1890
1891         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1892                 *spte = iter.old_spte;
1893                 sptep = iter.sptep;
1894         }
1895
1896         /*
1897          * Perform the rcu_dereference to get the raw spte pointer value since
1898          * we are passing it up to fast_page_fault, which is shared with the
1899          * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1900          * annotation.
1901          *
1902          * This is safe since fast_page_fault obeys the contracts of this
1903          * function as well as all TDP MMU contracts around modifying SPTEs
1904          * outside of mmu_lock.
1905          */
1906         return rcu_dereference(sptep);
1907 }