arch/x86/kvm/mmu/tdp_mmu.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include "mmu.h"
   4 #include "mmu_internal.h"
   5 #include "mmutrace.h"
   6 #include "tdp_iter.h"
   7 #include "tdp_mmu.h"
   8 #include "spte.h"
   9
  10 #include <asm/cmpxchg.h>
  11 #include <trace/events/kvm.h>
  12
/*
 * Backing variable for the "tdp_mmu" module parameter (mode 0644); defaults
 * to true.  kvm_mmu_init_tdp_mmu() samples it with READ_ONCE() when deciding
 * whether to enable the TDP MMU for a VM.
 */
static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
  15
  16 /* Initializes the TDP MMU for the VM, if enabled. */
  17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
  18 {
  19         if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
  20                 return false;
  21
  22         /* This should not be changed for the lifetime of the VM. */
  23         kvm->arch.tdp_mmu_enabled = true;
  24
  25         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
  26         spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
  27         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
  28
  29         return true;
  30 }
  31
  32 /* Arbitrarily returns true so that this may be used in if statements. */
  33 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
  34                                                              bool shared)
  35 {
  36         if (shared)
  37                 lockdep_assert_held_read(&kvm->mmu_lock);
  38         else
  39                 lockdep_assert_held_write(&kvm->mmu_lock);
  40
  41         return true;
  42 }
  43
  44 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
  45 {
  46         if (!kvm->arch.tdp_mmu_enabled)
  47                 return;
  48
  49         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
  50         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
  51
  52         /*
  53          * Ensure that all the outstanding RCU callbacks to free shadow pages
  54          * can run before the VM is torn down.
  55          */
  56         rcu_barrier();
  57 }
  58
/*
 * Forward declaration: kvm_tdp_mmu_put_root() below calls zap_gfn_range()
 * before its definition appears in this file.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield, bool flush,
                          bool shared);
  62
  63 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
  64 {
  65         free_page((unsigned long)sp->spt);
  66         kmem_cache_free(mmu_page_header_cache, sp);
  67 }
  68
  69 /*
  70  * This is called through call_rcu in order to free TDP page table memory
  71  * safely with respect to other kernel threads that may be operating on
  72  * the memory.
  73  * By only accessing TDP MMU page table memory in an RCU read critical
  74  * section, and freeing it after a grace period, lockless access to that
  75  * memory won't use it after it is freed.
  76  */
  77 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
  78 {
  79         struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
  80                                                rcu_head);
  81
  82         tdp_mmu_free_sp(sp);
  83 }
  84
  85 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
  86                           bool shared)
  87 {
  88         kvm_lockdep_assert_mmu_lock_held(kvm, shared);
  89
  90         if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
  91                 return;
  92
  93         WARN_ON(!root->tdp_mmu_page);
  94
  95         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
  96         list_del_rcu(&root->link);
  97         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
  98
  99         /*
 100          * A TLB flush is not necessary as KVM performs a local TLB flush when
 101          * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
 102          * to a different pCPU.  Note, the local TLB flush on reuse also
 103          * invalidates any paging-structure-cache entries, i.e. TLB entries for
 104          * intermediate paging structures, that may be zapped, as such entries
 105          * are associated with the ASID on both VMX and SVM.
 106          */
 107         (void)zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
 108
 109         call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 110 }
 111
 112 /*
 113  * Returns the next root after @prev_root (or the first root if @prev_root is
 114  * NULL).  A reference to the returned root is acquired, and the reference to
 115  * @prev_root is released (the caller obviously must hold a reference to
 116  * @prev_root if it's non-NULL).
 117  *
 118  * If @only_valid is true, invalid roots are skipped.
 119  *
 120  * Returns NULL if the end of tdp_mmu_roots was reached.
 121  */
 122 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
 123                                               struct kvm_mmu_page *prev_root,
 124                                               bool shared, bool only_valid)
 125 {
 126         struct kvm_mmu_page *next_root;
 127
 128         rcu_read_lock();
 129
 130         if (prev_root)
 131                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
 132                                                   &prev_root->link,
 133                                                   typeof(*prev_root), link);
 134         else
 135                 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
 136                                                    typeof(*next_root), link);
 137
 138         while (next_root) {
 139                 if ((!only_valid || !next_root->role.invalid) &&
 140                     kvm_tdp_mmu_get_root(next_root))
 141                         break;
 142
 143                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
 144                                 &next_root->link, typeof(*next_root), link);
 145         }
 146
 147         rcu_read_unlock();
 148
 149         if (prev_root)
 150                 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
 151
 152         return next_root;
 153 }
 154
 155 /*
 156  * Note: this iterator gets and puts references to the roots it iterates over.
 157  * This makes it safe to release the MMU lock and yield within the loop, but
 158  * if exiting the loop early, the caller must drop the reference to the most
 159  * recent root. (Unless keeping a live reference is desirable.)
 160  *
 161  * If shared is set, this function is operating under the MMU lock in read
 162  * mode. In the unlikely event that this thread must free a root, the lock
 163  * will be temporarily dropped and reacquired in write mode.
 164  */
 165 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
 166         for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);       \
 167              _root;                                                             \
 168              _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))      \
 169                 if (kvm_mmu_page_as_id(_root) != _as_id) {                      \
 170                 } else
 171
 172 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)    \
 173         __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
 174
 175 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)          \
 176         __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false)
 177
 178 /*
 179  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 180  * the implication being that any flow that holds mmu_lock for read is
 181  * inherently yield-friendly and should use the yield-safe variant above.
 182  * Holding mmu_lock for write obviates the need for RCU protection as the list
 183  * is guaranteed to be stable.
 184  */
 185 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)                      \
 186         list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)     \
 187                 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&    \
 188                     kvm_mmu_page_as_id(_root) != _as_id) {              \
 189                 } else
 190
 191 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
 192 {
 193         struct kvm_mmu_page *sp;
 194
 195         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
 196         sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
 197
 198         return sp;
 199 }
 200
 201 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, gfn_t gfn,
 202                               union kvm_mmu_page_role role)
 203 {
 204         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 205
 206         sp->role = role;
 207         sp->gfn = gfn;
 208         sp->tdp_mmu_page = true;
 209
 210         trace_kvm_mmu_get_page(sp, true);
 211 }
 212
 213 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
 214                                   struct tdp_iter *iter)
 215 {
 216         struct kvm_mmu_page *parent_sp;
 217         union kvm_mmu_page_role role;
 218
 219         parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
 220
 221         role = parent_sp->role;
 222         role.level--;
 223
 224         tdp_mmu_init_sp(child_sp, iter->gfn, role);
 225 }
 226
 227 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 228 {
 229         union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
 230         struct kvm *kvm = vcpu->kvm;
 231         struct kvm_mmu_page *root;
 232
 233         lockdep_assert_held_write(&kvm->mmu_lock);
 234
 235         /*
 236          * Check for an existing root before allocating a new one.  Note, the
 237          * role check prevents consuming an invalid root.
 238          */
 239         for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
 240                 if (root->role.word == role.word &&
 241                     kvm_tdp_mmu_get_root(root))
 242                         goto out;
 243         }
 244
 245         root = tdp_mmu_alloc_sp(vcpu);
 246         tdp_mmu_init_sp(root, 0, role);
 247
 248         refcount_set(&root->tdp_mmu_root_count, 1);
 249
 250         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 251         list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
 252         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 253
 254 out:
 255         return __pa(root->spt);
 256 }
 257
/*
 * Forward declaration: handle_removed_pt() below calls handle_changed_spte()
 * before its definition appears in this file.
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level,
                                bool shared);
 261
 262 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
 263 {
 264         if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
 265                 return;
 266
 267         if (is_accessed_spte(old_spte) &&
 268             (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
 269              spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
 270                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 271 }
 272
 273 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
 274                                           u64 old_spte, u64 new_spte, int level)
 275 {
 276         bool pfn_changed;
 277         struct kvm_memory_slot *slot;
 278
 279         if (level > PG_LEVEL_4K)
 280                 return;
 281
 282         pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
 283
 284         if ((!is_writable_pte(old_spte) || pfn_changed) &&
 285             is_writable_pte(new_spte)) {
 286                 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
 287                 mark_page_dirty_in_slot(kvm, slot, gfn);
 288         }
 289 }
 290
 291 /**
 292  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 293  *
 294  * @kvm: kvm instance
 295  * @sp: the page to be removed
 296  * @shared: This operation may not be running under the exclusive use of
 297  *          the MMU lock and the operation must synchronize with other
 298  *          threads that might be adding or removing pages.
 299  */
 300 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
 301                               bool shared)
 302 {
 303         if (shared)
 304                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 305         else
 306                 lockdep_assert_held_write(&kvm->mmu_lock);
 307
 308         list_del(&sp->link);
 309         if (sp->lpage_disallowed)
 310                 unaccount_huge_nx_page(kvm, sp);
 311
 312         if (shared)
 313                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 314 }
 315
/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *          of the MMU lock and the operation must synchronize with other
 *          threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 * Each entry is replaced with REMOVED_SPTE and its previous value is handed
 * to handle_changed_spte(), which recurses into child page tables.  Once all
 * entries are processed, remote TLBs are flushed for the table's GFN range
 * and the table itself is queued to be freed after an RCU grace period.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
        struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
        int level = sp->role.level;
        gfn_t base_gfn = sp->gfn;
        int i;

        trace_kvm_mmu_prepare_zap_page(sp);

        tdp_mmu_unlink_sp(kvm, sp, shared);

        for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
                u64 *sptep = rcu_dereference(pt) + i;
                /* Entry i maps KVM_PAGES_PER_HPAGE(level) pages from here. */
                gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
                u64 old_child_spte;

                if (shared) {
                        /*
                         * Set the SPTE to a nonpresent value that other
                         * threads will not overwrite.  If the value read
                         * back is already REMOVED_SPTE, another thread has
                         * frozen this SPTE and will still write its final
                         * value (e.g. the clearing write at the end of
                         * tdp_mmu_zap_spte_atomic()), which would overwrite
                         * the value stored here.  Keep retrying until the
                         * exchange replaces something other than
                         * REMOVED_SPTE, i.e. until this thread is the one
                         * that froze the SPTE.
                         */
                        for (;;) {
                                old_child_spte = xchg(sptep, REMOVED_SPTE);
                                if (!is_removed_spte(old_child_spte))
                                        break;
                                cpu_relax();
                        }
                } else {
                        /*
                         * If the SPTE is not MMU-present, there is no backing
                         * page associated with the SPTE and so no side effects
                         * that need to be recorded, and exclusive ownership of
                         * mmu_lock ensures the SPTE can't be made present.
                         * Note, zapping MMIO SPTEs is also unnecessary as they
                         * are guarded by the memslots generation, not by being
                         * unreachable.
                         */
                        old_child_spte = READ_ONCE(*sptep);
                        if (!is_shadow_present_pte(old_child_spte))
                                continue;

                        /*
                         * Marking the SPTE as a removed SPTE is not
                         * strictly necessary here as the MMU lock will
                         * stop other threads from concurrently modifying
                         * this SPTE. Using the removed SPTE value keeps
                         * the two branches consistent and simplifies
                         * the function.
                         */
                        WRITE_ONCE(*sptep, REMOVED_SPTE);
                }
                /* Full bookkeeping for the old value; may recurse here. */
                handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
                                    old_child_spte, REMOVED_SPTE, level,
                                    shared);
        }

        /*
         * Flush the range spanned by one entry at level + 1, i.e. presumably
         * the whole range this table mapped, before the table is freed.
         */
        kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
                                           KVM_PAGES_PER_HPAGE(level + 1));

        call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
 398
 399 /**
 400  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 401  * @kvm: kvm instance
 402  * @as_id: the address space of the paging structure the SPTE was a part of
 403  * @gfn: the base GFN that was mapped by the SPTE
 404  * @old_spte: The value of the SPTE before the change
 405  * @new_spte: The value of the SPTE after the change
 406  * @level: the level of the PT the SPTE is part of in the paging structure
 407  * @shared: This operation may not be running under the exclusive use of
 408  *          the MMU lock and the operation must synchronize with other
 409  *          threads that might be modifying SPTEs.
 410  *
 411  * Handle bookkeeping that might result from the modification of a SPTE.
 412  * This function must be called for all TDP SPTE modifications.
 413  */
 414 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 415                                   u64 old_spte, u64 new_spte, int level,
 416                                   bool shared)
 417 {
 418         bool was_present = is_shadow_present_pte(old_spte);
 419         bool is_present = is_shadow_present_pte(new_spte);
 420         bool was_leaf = was_present && is_last_spte(old_spte, level);
 421         bool is_leaf = is_present && is_last_spte(new_spte, level);
 422         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
 423
 424         WARN_ON(level > PT64_ROOT_MAX_LEVEL);
 425         WARN_ON(level < PG_LEVEL_4K);
 426         WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
 427
 428         /*
 429          * If this warning were to trigger it would indicate that there was a
 430          * missing MMU notifier or a race with some notifier handler.
 431          * A present, leaf SPTE should never be directly replaced with another
 432          * present leaf SPTE pointing to a different PFN. A notifier handler
 433          * should be zapping the SPTE before the main MM's page table is
 434          * changed, or the SPTE should be zeroed, and the TLBs flushed by the
 435          * thread before replacement.
 436          */
 437         if (was_leaf && is_leaf && pfn_changed) {
 438                 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
 439                        "SPTE with another present leaf SPTE mapping a\n"
 440                        "different PFN!\n"
 441                        "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
 442                        as_id, gfn, old_spte, new_spte, level);
 443
 444                 /*
 445                  * Crash the host to prevent error propagation and guest data
 446                  * corruption.
 447                  */
 448                 BUG();
 449         }
 450
 451         if (old_spte == new_spte)
 452                 return;
 453
 454         trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
 455
 456         if (is_leaf)
 457                 check_spte_writable_invariants(new_spte);
 458
 459         /*
 460          * The only times a SPTE should be changed from a non-present to
 461          * non-present state is when an MMIO entry is installed/modified/
 462          * removed. In that case, there is nothing to do here.
 463          */
 464         if (!was_present && !is_present) {
 465                 /*
 466                  * If this change does not involve a MMIO SPTE or removed SPTE,
 467                  * it is unexpected. Log the change, though it should not
 468                  * impact the guest since both the former and current SPTEs
 469                  * are nonpresent.
 470                  */
 471                 if (WARN_ON(!is_mmio_spte(old_spte) &&
 472                             !is_mmio_spte(new_spte) &&
 473                             !is_removed_spte(new_spte)))
 474                         pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
 475                                "should not be replaced with another,\n"
 476                                "different nonpresent SPTE, unless one or both\n"
 477                                "are MMIO SPTEs, or the new SPTE is\n"
 478                                "a temporary removed SPTE.\n"
 479                                "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
 480                                as_id, gfn, old_spte, new_spte, level);
 481                 return;
 482         }
 483
 484         if (is_leaf != was_leaf)
 485                 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
 486
 487         if (was_leaf && is_dirty_spte(old_spte) &&
 488             (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
 489                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 490
 491         /*
 492          * Recursively handle child PTs if the change removed a subtree from
 493          * the paging structure.
 494          */
 495         if (was_present && !was_leaf && (pfn_changed || !is_present))
 496                 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
 497 }
 498
/*
 * Full bookkeeping for an SPTE change: runs the core handling in
 * __handle_changed_spte() first, then the accessed-state and the dirty-log
 * handling.  Parameters are as for __handle_changed_spte().
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level,
                                bool shared)
{
        __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
                              shared);
        handle_changed_spte_acc_track(old_spte, new_spte, level);
        handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
                                      new_spte, level);
}
 509
/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * The SPTE is only written if it still holds iter->old_spte (compare and
 * exchange).  mmu_lock must be held for read.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
                                          struct tdp_iter *iter,
                                          u64 new_spte)
{
        u64 *sptep = rcu_dereference(iter->sptep);
        u64 old_spte;

        /* @iter must not be used again in the iteration in which it yielded. */
        WARN_ON_ONCE(iter->yielded);

        lockdep_assert_held_read(&kvm->mmu_lock);

        /*
         * Do not change removed SPTEs. Only the thread that froze the SPTE
         * may modify it.
         */
        if (is_removed_spte(iter->old_spte))
                return -EBUSY;

        /*
         * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
         * does not hold the mmu_lock.
         */
        old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
        if (old_spte != iter->old_spte) {
                /*
                 * The page table entry was modified by a different logical
                 * CPU. Refresh iter->old_spte with the current value so the
                 * caller operates on fresh data, e.g. if it retries
                 * tdp_mmu_set_spte_atomic().
                 */
                iter->old_spte = old_spte;
                return -EBUSY;
        }

        /*
         * The exchange succeeded: do the bookkeeping, always in shared mode.
         * The dirty-log handler is deliberately not called here.
         */
        __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
                              new_spte, iter->level, true);
        handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

        return 0;
}
 567
/*
 * tdp_mmu_zap_spte_atomic - Atomically zap the SPTE @iter currently points at.
 *
 * The SPTE is first frozen with REMOVED_SPTE via tdp_mmu_set_spte_atomic()
 * (which requires mmu_lock to be held for read), remote TLBs are then flushed
 * for the GFN range the SPTE covers, and finally the SPTE is cleared to 0.
 *
 * Returns 0 on success.  If freezing the SPTE fails, the error from
 * tdp_mmu_set_spte_atomic() is returned and neither the flush nor the final
 * clearing write is performed.
 */
static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
                                          struct tdp_iter *iter)
{
        int ret;

        /*
         * Freeze the SPTE by setting it to a special,
         * non-present value. This will stop other threads from
         * immediately installing a present entry in its place
         * before the TLBs are flushed.
         */
        ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
        if (ret)
                return ret;

        kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
                                           KVM_PAGES_PER_HPAGE(iter->level));

        /*
         * No other thread can overwrite the removed SPTE as they
         * must either wait on the MMU lock or use
         * tdp_mmu_set_spte_atomic which will not overwrite the
         * special removed SPTE value. No bookkeeping is needed
         * here since the SPTE is going from non-present
         * to non-present.
         */
        WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

        return 0;
}
 598
 599
 600 /*
 601  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 602  * @kvm: kvm instance
 603  * @iter: a tdp_iter instance currently on the SPTE that should be set
 604  * @new_spte: The value the SPTE should be set to
 605  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 606  *                    of the page. Should be set unless handling an MMU
 607  *                    notifier for access tracking. Leaving record_acc_track
 608  *                    unset in that case prevents page accesses from being
 609  *                    double counted.
 610  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 611  *                    appropriate for the change being made. Should be set
 612  *                    unless performing certain dirty logging operations.
 613  *                    Leaving record_dirty_log unset in that case prevents page
 614  *                    writes from being double counted.
 615  */
 616 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 617                                       u64 new_spte, bool record_acc_track,
 618                                       bool record_dirty_log)
 619 {
 620         WARN_ON_ONCE(iter->yielded);
 621
 622         lockdep_assert_held_write(&kvm->mmu_lock);
 623
 624         /*
 625          * No thread should be using this function to set SPTEs to the
 626          * temporary removed SPTE value.
 627          * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
 628          * should be used. If operating under the MMU lock in write mode, the
 629          * use of the removed SPTE should not be necessary.
 630          */
 631         WARN_ON(is_removed_spte(iter->old_spte));
 632
 633         WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 634
 635         __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
 636                               new_spte, iter->level, false);
 637         if (record_acc_track)
 638                 handle_changed_spte_acc_track(iter->old_spte, new_spte,
 639                                               iter->level);
 640         if (record_dirty_log)
 641                 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
 642                                               iter->old_spte, new_spte,
 643                                               iter->level);
 644 }
 645
/*
 * Set the SPTE @iter points at to @new_spte with both accessed-state and
 * dirty-log bookkeeping enabled.
 */
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                    u64 new_spte)
{
        __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}
 651
/*
 * Set the SPTE @iter points at to @new_spte, skipping the accessed-state
 * bookkeeping but still recording dirty-log changes.
 */
static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
                                                 struct tdp_iter *iter,
                                                 u64 new_spte)
{
        __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}
 658
/*
 * Set the SPTE @iter points at to @new_spte, skipping the dirty-log
 * bookkeeping but still recording accessed-state changes.
 */
static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
                                                 struct tdp_iter *iter,
                                                 u64 new_spte)
{
        __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}
 665
 666 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
 667         for_each_tdp_pte(_iter, _root, _start, _end)
 668
 669 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)  \
 670         tdp_root_for_each_pte(_iter, _root, _start, _end)               \
 671                 if (!is_shadow_present_pte(_iter.old_spte) ||           \
 672                     !is_last_spte(_iter.old_spte, _iter.level))         \
 673                         continue;                                       \
 674                 else
 675
 676 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
 677         for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
 678
 679 /*
 680  * Yield if the MMU lock is contended or this thread needs to return control
 681  * to the scheduler.
 682  *
 683  * If this function should yield and flush is set, it will perform a remote
 684  * TLB flush before yielding.
 685  *
 686  * If this function yields, iter->yielded is set and the caller must skip to
 687  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 688  * over the paging structures to allow the iterator to continue its traversal
 689  * from the paging structure root.
 690  *
 691  * Returns true if this function yielded.
 692  */
 693 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
 694                                                           struct tdp_iter *iter,
 695                                                           bool flush, bool shared)
 696 {
 697         WARN_ON(iter->yielded);
 698
 699         /* Ensure forward progress has been made before yielding. */
 700         if (iter->next_last_level_gfn == iter->yielded_gfn)
 701                 return false;
 702
 703         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
 704                 rcu_read_unlock();
 705
 706                 if (flush)
 707                         kvm_flush_remote_tlbs(kvm);
 708
 709                 if (shared)
 710                         cond_resched_rwlock_read(&kvm->mmu_lock);
 711                 else
 712                         cond_resched_rwlock_write(&kvm->mmu_lock);
 713
 714                 rcu_read_lock();
 715
 716                 WARN_ON(iter->gfn > iter->next_last_level_gfn);
 717
 718                 iter->yielded = true;
 719         }
 720
 721         return iter->yielded;
 722 }
 723
 724 /*
 725  * Tears down the mappings for the range of gfns, [start, end), and frees the
 726  * non-root pages mapping GFNs strictly within that range. Returns true if
 727  * SPTEs have been cleared and a TLB flush is needed before releasing the
 728  * MMU lock.
 729  *
 730  * If can_yield is true, will release the MMU lock and reschedule if the
 731  * scheduler needs the CPU or there is contention on the MMU lock. If this
 732  * function cannot yield, it will not release the MMU lock or reschedule and
 733  * the caller must ensure it does not supply too large a GFN range, or the
 734  * operation can cause a soft lockup.
 735  *
 736  * If shared is true, this thread holds the MMU lock in read mode and must
 737  * account for the possibility that other threads are modifying the paging
 738  * structures concurrently. If shared is false, this thread should hold the
 739  * MMU lock in write mode.
 740  */
 741 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 742                           gfn_t start, gfn_t end, bool can_yield, bool flush,
 743                           bool shared)
 744 {
 745         gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
 746         bool zap_all = (start == 0 && end >= max_gfn_host);
 747         struct tdp_iter iter;
 748
 749         /*
 750          * No need to try to step down in the iterator when zapping all SPTEs,
 751          * zapping the top-level non-leaf SPTEs will recurse on their children.
 752          */
 753         int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
 754
 755         /*
 756          * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
 757          * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
 758          * and so KVM will never install a SPTE for such addresses.
 759          */
 760         end = min(end, max_gfn_host);
 761
 762         kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 763
 764         rcu_read_lock();
 765
 766         for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
 767 retry:
 768                 if (can_yield &&
 769                     tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
 770                         flush = false;
 771                         continue;
 772                 }
 773
 774                 if (!is_shadow_present_pte(iter.old_spte))
 775                         continue;
 776
 777                 /*
 778                  * If this is a non-last-level SPTE that covers a larger range
 779                  * than should be zapped, continue, and zap the mappings at a
 780                  * lower level, except when zapping all SPTEs.
 781                  */
 782                 if (!zap_all &&
 783                     (iter.gfn < start ||
 784                      iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
 785                     !is_last_spte(iter.old_spte, iter.level))
 786                         continue;
 787
 788                 if (!shared) {
 789                         tdp_mmu_set_spte(kvm, &iter, 0);
 790                         flush = true;
 791                 } else if (tdp_mmu_zap_spte_atomic(kvm, &iter)) {
 792                         goto retry;
 793                 }
 794         }
 795
 796         rcu_read_unlock();
 797         return flush;
 798 }
 799
 800 /*
 801  * Tears down the mappings for the range of gfns, [start, end), and frees the
 802  * non-root pages mapping GFNs strictly within that range. Returns true if
 803  * SPTEs have been cleared and a TLB flush is needed before releasing the
 804  * MMU lock.
 805  */
 806 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
 807                                  gfn_t end, bool can_yield, bool flush)
 808 {
 809         struct kvm_mmu_page *root;
 810
 811         for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
 812                 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
 813                                       false);
 814
 815         return flush;
 816 }
 817
 818 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
 819 {
 820         bool flush = false;
 821         int i;
 822
 823         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
 824                 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
 825
 826         if (flush)
 827                 kvm_flush_remote_tlbs(kvm);
 828 }
 829
 830 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
 831                                                   struct kvm_mmu_page *prev_root)
 832 {
 833         struct kvm_mmu_page *next_root;
 834
 835         if (prev_root)
 836                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
 837                                                   &prev_root->link,
 838                                                   typeof(*prev_root), link);
 839         else
 840                 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
 841                                                    typeof(*next_root), link);
 842
 843         while (next_root && !(next_root->role.invalid &&
 844                               refcount_read(&next_root->tdp_mmu_root_count)))
 845                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
 846                                                   &next_root->link,
 847                                                   typeof(*next_root), link);
 848
 849         return next_root;
 850 }
 851
 852 /*
 853  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 854  * zap" completes.  Since kvm_tdp_mmu_invalidate_all_roots() has acquired a
 855  * reference to each invalidated root, roots will not be freed until after this
 856  * function drops the gifted reference, e.g. so that vCPUs don't get stuck with
 857  * tearing down paging structures.
 858  */
 859 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
 860 {
 861         struct kvm_mmu_page *next_root;
 862         struct kvm_mmu_page *root;
 863
 864         lockdep_assert_held_read(&kvm->mmu_lock);
 865
 866         rcu_read_lock();
 867
 868         root = next_invalidated_root(kvm, NULL);
 869
 870         while (root) {
 871                 next_root = next_invalidated_root(kvm, root);
 872
 873                 rcu_read_unlock();
 874
 875                 /*
 876                  * A TLB flush is unnecessary, invalidated roots are guaranteed
 877                  * to be unreachable by the guest (see kvm_tdp_mmu_put_root()
 878                  * for more details), and unlike the legacy MMU, no vCPU kick
 879                  * is needed to play nice with lockless shadow walks as the TDP
 880                  * MMU protects its paging structures via RCU.  Note, zapping
 881                  * will still flush on yield, but that's a minor performance
 882                  * blip and not a functional issue.
 883                  */
 884                 (void)zap_gfn_range(kvm, root, 0, -1ull, true, false, true);
 885
 886                 /*
 887                  * Put the reference acquired in
 888                  * kvm_tdp_mmu_invalidate_roots
 889                  */
 890                 kvm_tdp_mmu_put_root(kvm, root, true);
 891
 892                 root = next_root;
 893
 894                 rcu_read_lock();
 895         }
 896
 897         rcu_read_unlock();
 898 }
 899
 900 /*
 901  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 902  * is about to be zapped, e.g. in response to a memslots update.  The caller is
 903  * responsible for invoking kvm_tdp_mmu_zap_invalidated_roots() to do the actual
 904  * zapping.
 905  *
 906  * Take a reference on all roots to prevent the root from being freed before it
 907  * is zapped by this thread.  Freeing a root is not a correctness issue, but if
 908  * a vCPU drops the last reference to a root prior to the root being zapped, it
 909  * will get stuck with tearing down the entire paging structure.
 910  *
 911  * Get a reference even if the root is already invalid,
 912  * kvm_tdp_mmu_zap_invalidated_roots() assumes it was gifted a reference to all
 913  * invalid roots, e.g. there's no epoch to identify roots that were invalidated
 914  * by a previous call.  Roots stay on the list until the last reference is
 915  * dropped, so even though all invalid roots are zapped, a root may not go away
 916  * for quite some time, e.g. if a vCPU blocks across multiple memslot updates.
 917  *
 918  * Because mmu_lock is held for write, it should be impossible to observe a
 919  * root with zero refcount, i.e. the list of roots cannot be stale.
 920  *
 921  * This has essentially the same effect for the TDP MMU
 922  * as updating mmu_valid_gen does for the shadow MMU.
 923  */
 924 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
 925 {
 926         struct kvm_mmu_page *root;
 927
 928         lockdep_assert_held_write(&kvm->mmu_lock);
 929         list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
 930                 if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
 931                         root->role.invalid = true;
 932         }
 933 }
 934
 935 /*
 936  * Installs a last-level SPTE to handle a TDP page fault.
 937  * (NPT/EPT violation/misconfiguration)
 938  */
 939 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 940                                           struct kvm_page_fault *fault,
 941                                           struct tdp_iter *iter)
 942 {
 943         struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
 944         u64 new_spte;
 945         int ret = RET_PF_FIXED;
 946         bool wrprot = false;
 947
 948         WARN_ON(sp->role.level != fault->goal_level);
 949         if (unlikely(!fault->slot))
 950                 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
 951         else
 952                 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
 953                                          fault->pfn, iter->old_spte, fault->prefetch, true,
 954                                          fault->map_writable, &new_spte);
 955
 956         if (new_spte == iter->old_spte)
 957                 ret = RET_PF_SPURIOUS;
 958         else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
 959                 return RET_PF_RETRY;
 960
 961         /*
 962          * If the page fault was caused by a write but the page is write
 963          * protected, emulation is needed. If the emulation was skipped,
 964          * the vCPU would have the same fault again.
 965          */
 966         if (wrprot) {
 967                 if (fault->write)
 968                         ret = RET_PF_EMULATE;
 969         }
 970
 971         /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
 972         if (unlikely(is_mmio_spte(new_spte))) {
 973                 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
 974                                      new_spte);
 975                 ret = RET_PF_EMULATE;
 976         } else {
 977                 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
 978                                        rcu_dereference(iter->sptep));
 979         }
 980
 981         /*
 982          * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
 983          * consistent with legacy MMU behavior.
 984          */
 985         if (ret != RET_PF_SPURIOUS)
 986                 vcpu->stat.pf_fixed++;
 987
 988         return ret;
 989 }
 990
 991 /*
 992  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 993  * provided page table.
 994  *
 995  * @kvm: kvm instance
 996  * @iter: a tdp_iter instance currently on the SPTE that should be set
 997  * @sp: The new TDP page table to install.
 998  * @account_nx: True if this page table is being installed to split a
 999  *              non-executable huge page.
1000  * @shared: This operation is running under the MMU lock in read mode.
1001  *
1002  * Returns: 0 if the new page table was installed. Non-0 if the page table
1003  *          could not be installed (e.g. the atomic compare-exchange failed).
1004  */
1005 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1006                            struct kvm_mmu_page *sp, bool account_nx,
1007                            bool shared)
1008 {
1009         u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
1010         int ret = 0;
1011
1012         if (shared) {
1013                 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1014                 if (ret)
1015                         return ret;
1016         } else {
1017                 tdp_mmu_set_spte(kvm, iter, spte);
1018         }
1019
1020         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1021         list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1022         if (account_nx)
1023                 account_huge_nx_page(kvm, sp);
1024         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1025
1026         return 0;
1027 }
1028
1029 /*
1030  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1031  * page tables and SPTEs to translate the faulting guest physical address.
1032  */
1033 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1034 {
1035         struct kvm_mmu *mmu = vcpu->arch.mmu;
1036         struct tdp_iter iter;
1037         struct kvm_mmu_page *sp;
1038         int ret;
1039
1040         kvm_mmu_hugepage_adjust(vcpu, fault);
1041
1042         trace_kvm_mmu_spte_requested(fault);
1043
1044         rcu_read_lock();
1045
1046         tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1047                 if (fault->nx_huge_page_workaround_enabled)
1048                         disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1049
1050                 if (iter.level == fault->goal_level)
1051                         break;
1052
1053                 /*
1054                  * If there is an SPTE mapping a large page at a higher level
1055                  * than the target, that SPTE must be cleared and replaced
1056                  * with a non-leaf SPTE.
1057                  */
1058                 if (is_shadow_present_pte(iter.old_spte) &&
1059                     is_large_pte(iter.old_spte)) {
1060                         if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1061                                 break;
1062
1063                         /*
1064                          * The iter must explicitly re-read the spte here
1065                          * because the new value informs the !present
1066                          * path below.
1067                          */
1068                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1069                 }
1070
1071                 if (!is_shadow_present_pte(iter.old_spte)) {
1072                         bool account_nx = fault->huge_page_disallowed &&
1073                                           fault->req_level >= iter.level;
1074
1075                         /*
1076                          * If SPTE has been frozen by another thread, just
1077                          * give up and retry, avoiding unnecessary page table
1078                          * allocation and free.
1079                          */
1080                         if (is_removed_spte(iter.old_spte))
1081                                 break;
1082
1083                         sp = tdp_mmu_alloc_sp(vcpu);
1084                         tdp_mmu_init_child_sp(sp, &iter);
1085
1086                         if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
1087                                 tdp_mmu_free_sp(sp);
1088                                 break;
1089                         }
1090                 }
1091         }
1092
1093         if (iter.level != fault->goal_level) {
1094                 rcu_read_unlock();
1095                 return RET_PF_RETRY;
1096         }
1097
1098         ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1099         rcu_read_unlock();
1100
1101         return ret;
1102 }
1103
1104 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1105                                  bool flush)
1106 {
1107         return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
1108                                            range->end, range->may_block, flush);
1109 }
1110
1111 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1112                               struct kvm_gfn_range *range);
1113
1114 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1115                                                    struct kvm_gfn_range *range,
1116                                                    tdp_handler_t handler)
1117 {
1118         struct kvm_mmu_page *root;
1119         struct tdp_iter iter;
1120         bool ret = false;
1121
1122         rcu_read_lock();
1123
1124         /*
1125          * Don't support rescheduling, none of the MMU notifiers that funnel
1126          * into this helper allow blocking; it'd be dead, wasteful code.
1127          */
1128         for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1129                 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1130                         ret |= handler(kvm, &iter, range);
1131         }
1132
1133         rcu_read_unlock();
1134
1135         return ret;
1136 }
1137
1138 /*
1139  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1140  * if any of the GFNs in the range have been accessed.
1141  */
1142 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1143                           struct kvm_gfn_range *range)
1144 {
1145         u64 new_spte = 0;
1146
1147         /* If we have a non-accessed entry we don't need to change the pte. */
1148         if (!is_accessed_spte(iter->old_spte))
1149                 return false;
1150
1151         new_spte = iter->old_spte;
1152
1153         if (spte_ad_enabled(new_spte)) {
1154                 new_spte &= ~shadow_accessed_mask;
1155         } else {
1156                 /*
1157                  * Capture the dirty status of the page, so that it doesn't get
1158                  * lost when the SPTE is marked for access tracking.
1159                  */
1160                 if (is_writable_pte(new_spte))
1161                         kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1162
1163                 new_spte = mark_spte_for_access_track(new_spte);
1164         }
1165
1166         tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1167
1168         return true;
1169 }
1170
1171 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1172 {
1173         return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1174 }
1175
1176 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1177                          struct kvm_gfn_range *range)
1178 {
1179         return is_accessed_spte(iter->old_spte);
1180 }
1181
1182 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1183 {
1184         return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1185 }
1186
1187 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1188                          struct kvm_gfn_range *range)
1189 {
1190         u64 new_spte;
1191
1192         /* Huge pages aren't expected to be modified without first being zapped. */
1193         WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1194
1195         if (iter->level != PG_LEVEL_4K ||
1196             !is_shadow_present_pte(iter->old_spte))
1197                 return false;
1198
1199         /*
1200          * Note, when changing a read-only SPTE, it's not strictly necessary to
1201          * zero the SPTE before setting the new PFN, but doing so preserves the
1202          * invariant that the PFN of a present * leaf SPTE can never change.
1203          * See __handle_changed_spte().
1204          */
1205         tdp_mmu_set_spte(kvm, iter, 0);
1206
1207         if (!pte_write(range->pte)) {
1208                 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1209                                                                   pte_pfn(range->pte));
1210
1211                 tdp_mmu_set_spte(kvm, iter, new_spte);
1212         }
1213
1214         return true;
1215 }
1216
1217 /*
1218  * Handle the changed_pte MMU notifier for the TDP MMU.
1219  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1220  * notifier.
1221  * Returns non-zero if a flush is needed before releasing the MMU lock.
1222  */
1223 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1224 {
1225         bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1226
1227         /* FIXME: return 'flush' instead of flushing here. */
1228         if (flush)
1229                 kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1230
1231         return false;
1232 }
1233
1234 /*
1235  * Remove write access from all SPTEs at or above min_level that map GFNs
1236  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1237  * be flushed.
1238  */
1239 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1240                              gfn_t start, gfn_t end, int min_level)
1241 {
1242         struct tdp_iter iter;
1243         u64 new_spte;
1244         bool spte_set = false;
1245
1246         rcu_read_lock();
1247
1248         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1249
1250         for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1251 retry:
1252                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1253                         continue;
1254
1255                 if (!is_shadow_present_pte(iter.old_spte) ||
1256                     !is_last_spte(iter.old_spte, iter.level) ||
1257                     !(iter.old_spte & PT_WRITABLE_MASK))
1258                         continue;
1259
1260                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1261
1262                 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1263                         goto retry;
1264
1265                 spte_set = true;
1266         }
1267
1268         rcu_read_unlock();
1269         return spte_set;
1270 }
1271
1272 /*
1273  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1274  * only affect leaf SPTEs down to min_level.
1275  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1276  */
1277 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1278                              const struct kvm_memory_slot *slot, int min_level)
1279 {
1280         struct kvm_mmu_page *root;
1281         bool spte_set = false;
1282
1283         lockdep_assert_held_read(&kvm->mmu_lock);
1284
1285         for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1286                 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1287                              slot->base_gfn + slot->npages, min_level);
1288
1289         return spte_set;
1290 }
1291
1292 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1293 {
1294         struct kvm_mmu_page *sp;
1295
1296         gfp |= __GFP_ZERO;
1297
1298         sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1299         if (!sp)
1300                 return NULL;
1301
1302         sp->spt = (void *)__get_free_page(gfp);
1303         if (!sp->spt) {
1304                 kmem_cache_free(mmu_page_header_cache, sp);
1305                 return NULL;
1306         }
1307
1308         return sp;
1309 }
1310
1311 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1312                                                        struct tdp_iter *iter,
1313                                                        bool shared)
1314 {
1315         struct kvm_mmu_page *sp;
1316
1317         /*
1318          * Since we are allocating while under the MMU lock we have to be
1319          * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1320          * reclaim and to avoid making any filesystem callbacks (which can end
1321          * up invoking KVM MMU notifiers, resulting in a deadlock).
1322          *
1323          * If this allocation fails we drop the lock and retry with reclaim
1324          * allowed.
1325          */
1326         sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1327         if (sp)
1328                 return sp;
1329
1330         rcu_read_unlock();
1331
1332         if (shared)
1333                 read_unlock(&kvm->mmu_lock);
1334         else
1335                 write_unlock(&kvm->mmu_lock);
1336
1337         iter->yielded = true;
1338         sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1339
1340         if (shared)
1341                 read_lock(&kvm->mmu_lock);
1342         else
1343                 write_lock(&kvm->mmu_lock);
1344
1345         rcu_read_lock();
1346
1347         return sp;
1348 }
1349
1350 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1351                                    struct kvm_mmu_page *sp, bool shared)
1352 {
1353         const u64 huge_spte = iter->old_spte;
1354         const int level = iter->level;
1355         int ret, i;
1356
1357         tdp_mmu_init_child_sp(sp, iter);
1358
1359         /*
1360          * No need for atomics when writing to sp->spt since the page table has
1361          * not been linked in yet and thus is not reachable from any other CPU.
1362          */
1363         for (i = 0; i < PT64_ENT_PER_PAGE; i++)
1364                 sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
1365
1366         /*
1367          * Replace the huge spte with a pointer to the populated lower level
1368          * page table. Since we are making this change without a TLB flush vCPUs
1369          * will see a mix of the split mappings and the original huge mapping,
1370          * depending on what's currently in their TLB. This is fine from a
1371          * correctness standpoint since the translation will be the same either
1372          * way.
1373          */
1374         ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
1375         if (ret)
1376                 goto out;
1377
1378         /*
1379          * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
1380          * are overwriting from the page stats. But we have to manually update
1381          * the page stats with the new present child pages.
1382          */
1383         kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
1384
1385 out:
1386         trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1387         return ret;
1388 }
1389
1390 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1391                                          struct kvm_mmu_page *root,
1392                                          gfn_t start, gfn_t end,
1393                                          int target_level, bool shared)
1394 {
1395         struct kvm_mmu_page *sp = NULL;
1396         struct tdp_iter iter;
1397         int ret = 0;
1398
1399         rcu_read_lock();
1400
1401         /*
1402          * Traverse the page table splitting all huge pages above the target
1403          * level into one lower level. For example, if we encounter a 1GB page
1404          * we split it into 512 2MB pages.
1405          *
1406          * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1407          * to visit an SPTE before ever visiting its children, which means we
1408          * will correctly recursively split huge pages that are more than one
1409          * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1410          * and then splitting each of those to 512 4KB pages).
1411          */
1412         for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1413 retry:
1414                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1415                         continue;
1416
1417                 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1418                         continue;
1419
1420                 if (!sp) {
1421                         sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1422                         if (!sp) {
1423                                 ret = -ENOMEM;
1424                                 trace_kvm_mmu_split_huge_page(iter.gfn,
1425                                                               iter.old_spte,
1426                                                               iter.level, ret);
1427                                 break;
1428                         }
1429
1430                         if (iter.yielded)
1431                                 continue;
1432                 }
1433
1434                 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1435                         goto retry;
1436
1437                 sp = NULL;
1438         }
1439
1440         rcu_read_unlock();
1441
1442         /*
1443          * It's possible to exit the loop having never used the last sp if, for
1444          * example, a vCPU doing HugePage NX splitting wins the race and
1445          * installs its own sp in place of the last sp we tried to split.
1446          */
1447         if (sp)
1448                 tdp_mmu_free_sp(sp);
1449
1450         return ret;
1451 }
1452
1453
1454 /*
1455  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1456  */
1457 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1458                                       const struct kvm_memory_slot *slot,
1459                                       gfn_t start, gfn_t end,
1460                                       int target_level, bool shared)
1461 {
1462         struct kvm_mmu_page *root;
1463         int r = 0;
1464
1465         kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1466
1467         for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1468                 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1469                 if (r) {
1470                         kvm_tdp_mmu_put_root(kvm, root, shared);
1471                         break;
1472                 }
1473         }
1474 }
1475
1476 /*
1477  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1478  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1479  * If AD bits are not enabled, this will require clearing the writable bit on
1480  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1481  * be flushed.
1482  */
1483 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1484                            gfn_t start, gfn_t end)
1485 {
1486         struct tdp_iter iter;
1487         u64 new_spte;
1488         bool spte_set = false;
1489
1490         rcu_read_lock();
1491
1492         tdp_root_for_each_leaf_pte(iter, root, start, end) {
1493 retry:
1494                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1495                         continue;
1496
1497                 if (!is_shadow_present_pte(iter.old_spte))
1498                         continue;
1499
1500                 if (spte_ad_need_write_protect(iter.old_spte)) {
1501                         if (is_writable_pte(iter.old_spte))
1502                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1503                         else
1504                                 continue;
1505                 } else {
1506                         if (iter.old_spte & shadow_dirty_mask)
1507                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1508                         else
1509                                 continue;
1510                 }
1511
1512                 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1513                         goto retry;
1514
1515                 spte_set = true;
1516         }
1517
1518         rcu_read_unlock();
1519         return spte_set;
1520 }
1521
1522 /*
1523  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1524  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1525  * If AD bits are not enabled, this will require clearing the writable bit on
1526  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1527  * be flushed.
1528  */
1529 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1530                                   const struct kvm_memory_slot *slot)
1531 {
1532         struct kvm_mmu_page *root;
1533         bool spte_set = false;
1534
1535         lockdep_assert_held_read(&kvm->mmu_lock);
1536
1537         for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1538                 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1539                                 slot->base_gfn + slot->npages);
1540
1541         return spte_set;
1542 }
1543
1544 /*
1545  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1546  * set in mask, starting at gfn. The given memslot is expected to contain all
1547  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1548  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1549  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1550  */
1551 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1552                                   gfn_t gfn, unsigned long mask, bool wrprot)
1553 {
1554         struct tdp_iter iter;
1555         u64 new_spte;
1556
1557         rcu_read_lock();
1558
1559         tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1560                                     gfn + BITS_PER_LONG) {
1561                 if (!mask)
1562                         break;
1563
1564                 if (iter.level > PG_LEVEL_4K ||
1565                     !(mask & (1UL << (iter.gfn - gfn))))
1566                         continue;
1567
1568                 mask &= ~(1UL << (iter.gfn - gfn));
1569
1570                 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1571                         if (is_writable_pte(iter.old_spte))
1572                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1573                         else
1574                                 continue;
1575                 } else {
1576                         if (iter.old_spte & shadow_dirty_mask)
1577                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1578                         else
1579                                 continue;
1580                 }
1581
1582                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1583         }
1584
1585         rcu_read_unlock();
1586 }
1587
1588 /*
1589  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1590  * set in mask, starting at gfn. The given memslot is expected to contain all
1591  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1592  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1593  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1594  */
1595 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1596                                        struct kvm_memory_slot *slot,
1597                                        gfn_t gfn, unsigned long mask,
1598                                        bool wrprot)
1599 {
1600         struct kvm_mmu_page *root;
1601
1602         lockdep_assert_held_write(&kvm->mmu_lock);
1603         for_each_tdp_mmu_root(kvm, root, slot->as_id)
1604                 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1605 }
1606
1607 /*
1608  * Clear leaf entries which could be replaced by large mappings, for
1609  * GFNs within the slot.
1610  */
1611 static void zap_collapsible_spte_range(struct kvm *kvm,
1612                                        struct kvm_mmu_page *root,
1613                                        const struct kvm_memory_slot *slot)
1614 {
1615         gfn_t start = slot->base_gfn;
1616         gfn_t end = start + slot->npages;
1617         struct tdp_iter iter;
1618         kvm_pfn_t pfn;
1619
1620         rcu_read_lock();
1621
1622         tdp_root_for_each_pte(iter, root, start, end) {
1623 retry:
1624                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1625                         continue;
1626
1627                 if (!is_shadow_present_pte(iter.old_spte) ||
1628                     !is_last_spte(iter.old_spte, iter.level))
1629                         continue;
1630
1631                 pfn = spte_to_pfn(iter.old_spte);
1632                 if (kvm_is_reserved_pfn(pfn) ||
1633                     iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1634                                                             pfn, PG_LEVEL_NUM))
1635                         continue;
1636
1637                 /* Note, a successful atomic zap also does a remote TLB flush. */
1638                 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1639                         goto retry;
1640         }
1641
1642         rcu_read_unlock();
1643 }
1644
1645 /*
1646  * Clear non-leaf entries (and free associated page tables) which could
1647  * be replaced by large mappings, for GFNs within the slot.
1648  */
1649 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1650                                        const struct kvm_memory_slot *slot)
1651 {
1652         struct kvm_mmu_page *root;
1653
1654         lockdep_assert_held_read(&kvm->mmu_lock);
1655
1656         for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1657                 zap_collapsible_spte_range(kvm, root, slot);
1658 }
1659
1660 /*
1661  * Removes write access on the last level SPTE mapping this GFN and unsets the
1662  * MMU-writable bit to ensure future writes continue to be intercepted.
1663  * Returns true if an SPTE was set and a TLB flush is needed.
1664  */
1665 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1666                               gfn_t gfn, int min_level)
1667 {
1668         struct tdp_iter iter;
1669         u64 new_spte;
1670         bool spte_set = false;
1671
1672         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1673
1674         rcu_read_lock();
1675
1676         for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1677                 if (!is_shadow_present_pte(iter.old_spte) ||
1678                     !is_last_spte(iter.old_spte, iter.level))
1679                         continue;
1680
1681                 new_spte = iter.old_spte &
1682                         ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1683
1684                 if (new_spte == iter.old_spte)
1685                         break;
1686
1687                 tdp_mmu_set_spte(kvm, &iter, new_spte);
1688                 spte_set = true;
1689         }
1690
1691         rcu_read_unlock();
1692
1693         return spte_set;
1694 }
1695
1696 /*
1697  * Removes write access on the last level SPTE mapping this GFN and unsets the
1698  * MMU-writable bit to ensure future writes continue to be intercepted.
1699  * Returns true if an SPTE was set and a TLB flush is needed.
1700  */
1701 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1702                                    struct kvm_memory_slot *slot, gfn_t gfn,
1703                                    int min_level)
1704 {
1705         struct kvm_mmu_page *root;
1706         bool spte_set = false;
1707
1708         lockdep_assert_held_write(&kvm->mmu_lock);
1709         for_each_tdp_mmu_root(kvm, root, slot->as_id)
1710                 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1711
1712         return spte_set;
1713 }
1714
1715 /*
1716  * Return the level of the lowest level SPTE added to sptes.
1717  * That SPTE may be non-present.
1718  *
1719  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1720  */
1721 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1722                          int *root_level)
1723 {
1724         struct tdp_iter iter;
1725         struct kvm_mmu *mmu = vcpu->arch.mmu;
1726         gfn_t gfn = addr >> PAGE_SHIFT;
1727         int leaf = -1;
1728
1729         *root_level = vcpu->arch.mmu->shadow_root_level;
1730
1731         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1732                 leaf = iter.level;
1733                 sptes[leaf] = iter.old_spte;
1734         }
1735
1736         return leaf;
1737 }
1738
1739 /*
1740  * Returns the last level spte pointer of the shadow page walk for the given
1741  * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
1742  * walk could be performed, returns NULL and *spte does not contain valid data.
1743  *
1744  * Contract:
1745  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1746  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1747  *
1748  * WARNING: This function is only intended to be called during fast_page_fault.
1749  */
1750 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1751                                         u64 *spte)
1752 {
1753         struct tdp_iter iter;
1754         struct kvm_mmu *mmu = vcpu->arch.mmu;
1755         gfn_t gfn = addr >> PAGE_SHIFT;
1756         tdp_ptep_t sptep = NULL;
1757
1758         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1759                 *spte = iter.old_spte;
1760                 sptep = iter.sptep;
1761         }
1762
1763         /*
1764          * Perform the rcu_dereference to get the raw spte pointer value since
1765          * we are passing it up to fast_page_fault, which is shared with the
1766          * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1767          * annotation.
1768          *
1769          * This is safe since fast_page_fault obeys the contracts of this
1770          * function as well as all TDP MMU contracts around modifying SPTEs
1771          * outside of mmu_lock.
1772          */
1773         return rcu_dereference(sptep);
1774 }