arch/x86/kvm/mmu/tdp_mmu.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12
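/*
 * Module parameter gating the TDP MMU. It is snapshotted with READ_ONCE()
 * when a VM is created, so flipping it at runtime only affects VMs created
 * afterwards; existing VMs keep the value captured at creation time.
 */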
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19         if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20                 return false;
21
22         /* This should not be changed for the lifetime of the VM. */
23         kvm->arch.tdp_mmu_enabled = true;
24
25         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26         spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28
29         return true;
30 }
31
32 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
33                                                              bool shared)
34 {
35         if (shared)
36                 lockdep_assert_held_read(&kvm->mmu_lock);
37         else
38                 lockdep_assert_held_write(&kvm->mmu_lock);
39 }
40
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42 {
43         if (!kvm->arch.tdp_mmu_enabled)
44                 return;
45
46         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
47         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
48
49         /*
50          * Ensure that all the outstanding RCU callbacks to free shadow pages
51          * can run before the VM is torn down.
52          */
53         rcu_barrier();
54 }
55
56 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
57                           gfn_t start, gfn_t end, bool can_yield, bool flush,
58                           bool shared);
59
60 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
61 {
62         free_page((unsigned long)sp->spt);
63         kmem_cache_free(mmu_page_header_cache, sp);
64 }
65
66 /*
67  * This is called through call_rcu in order to free TDP page table memory
68  * safely with respect to other kernel threads that may be operating on
69  * the memory.
 70  * Because TDP MMU page table memory is only accessed inside RCU read-side
 71  * critical sections and is only freed after an RCU grace period, lockless
 72  * walkers are guaranteed not to use the memory after it has been freed.
73  */
74 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
75 {
76         struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
77                                                rcu_head);
78
79         tdp_mmu_free_sp(sp);
80 }
81
82 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
83                           bool shared)
84 {
85         kvm_lockdep_assert_mmu_lock_held(kvm, shared);
86
87         if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
88                 return;
89
90         WARN_ON(!root->tdp_mmu_page);
91
92         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
93         list_del_rcu(&root->link);
94         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
95
96         zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
97
98         call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
99 }
100
101 /*
102  * Finds the next valid root after prev_root (or the first valid root if
103  * prev_root is NULL), takes a reference on it, and returns that next root.
104  * If prev_root is not NULL, this thread should have already taken a
105  * reference on it, and that reference will be dropped. If no valid root
106  * is found, this function will return NULL.
107  */
108 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
109                                               struct kvm_mmu_page *prev_root,
110                                               bool shared)
111 {
112         struct kvm_mmu_page *next_root;
113
114         rcu_read_lock();
115
116         if (prev_root)
117                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
118                                                   &prev_root->link,
119                                                   typeof(*prev_root), link);
120         else
121                 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
122                                                    typeof(*next_root), link);
123
124         while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
125                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
126                                 &next_root->link, typeof(*next_root), link);
127
128         rcu_read_unlock();
129
130         if (prev_root)
131                 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
132
133         return next_root;
134 }
135
136 /*
137  * Note: this iterator gets and puts references to the roots it iterates over.
138  * This makes it safe to release the MMU lock and yield within the loop, but
139  * if exiting the loop early, the caller must drop the reference to the most
140  * recent root. (Unless keeping a live reference is desirable.)
141  *
142  * If shared is set, this function is operating under the MMU lock in read
143  * mode. In the unlikely event that this thread must free a root, the lock
144  * will be temporarily dropped and reacquired in write mode.
145  */
146 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)  \
147         for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);            \
148              _root;                                                     \
149              _root = tdp_mmu_next_root(_kvm, _root, _shared))           \
150                 if (kvm_mmu_page_as_id(_root) != _as_id) {              \
151                 } else
152
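/*
 * Iterate over all TDP MMU roots in the given address space. No references
 * are taken, so the caller must ensure the roots are not freed out from
 * under it, e.g. by holding mmu_lock for write or by walking the list under
 * RCU protection (see the lockdep expression below).
 */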
153 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)                              \
154         list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,         \
155                                 lockdep_is_held_type(&_kvm->mmu_lock, 0) ||     \
156                                 lockdep_is_held(&_kvm->arch.tdp_mmu_pages_lock)) \
157                 if (kvm_mmu_page_as_id(_root) != _as_id) {              \
158                 } else
159
160 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
161                                                    int level)
162 {
163         union kvm_mmu_page_role role;
164
165         role = vcpu->arch.mmu->mmu_role.base;
166         role.level = level;
167         role.direct = true;
168         role.gpte_is_8_bytes = true;
169         role.access = ACC_ALL;
170
171         return role;
172 }
173
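/*
 * Allocate a new TDP MMU shadow page and its backing page table from the
 * vCPU's memory caches and initialize its role for the given gfn and level.
 * The caller is responsible for linking the page into the paging structure.
 */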
174 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
175                                                int level)
176 {
177         struct kvm_mmu_page *sp;
178
179         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
180         sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
181         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
182
183         sp->role.word = page_role_for_level(vcpu, level).word;
184         sp->gfn = gfn;
185         sp->tdp_mmu_page = true;
186
187         trace_kvm_mmu_get_page(sp, true);
188
189         return sp;
190 }
191
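/*
 * Return the physical address of the vCPU's TDP MMU root page, reusing an
 * existing root with a matching role if one is found, otherwise allocating
 * a new root and adding it to the per-VM list. Called with mmu_lock held
 * for write.
 */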
192 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
193 {
194         union kvm_mmu_page_role role;
195         struct kvm *kvm = vcpu->kvm;
196         struct kvm_mmu_page *root;
197
198         lockdep_assert_held_write(&kvm->mmu_lock);
199
200         role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
201
202         /* Check for an existing root before allocating a new one. */
203         for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
204                 if (root->role.word == role.word &&
205                     kvm_tdp_mmu_get_root(kvm, root))
206                         goto out;
207         }
208
209         root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
210         refcount_set(&root->tdp_mmu_root_count, 1);
211
212         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
213         list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
214         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
215
216 out:
217         return __pa(root->spt);
218 }
219
220 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
221                                 u64 old_spte, u64 new_spte, int level,
222                                 bool shared);
223
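/*
 * Propagate accessed-bit information to the primary MM: if a leaf SPTE that
 * was marked accessed is zapped, loses its accessed bit, or changes PFN,
 * report the old PFN as accessed via kvm_set_pfn_accessed().
 */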
224 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
225 {
226         if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
227                 return;
228
229         if (is_accessed_spte(old_spte) &&
230             (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
231              spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
232                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
233 }
234
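/*
 * Mark the GFN dirty in its memslot's dirty bitmap when a 4K SPTE is made
 * writable, i.e. when the new SPTE is writable and the old SPTE either was
 * not writable or mapped a different PFN.
 */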
235 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
236                                           u64 old_spte, u64 new_spte, int level)
237 {
238         bool pfn_changed;
239         struct kvm_memory_slot *slot;
240
241         if (level > PG_LEVEL_4K)
242                 return;
243
244         pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
245
246         if ((!is_writable_pte(old_spte) || pfn_changed) &&
247             is_writable_pte(new_spte)) {
248                 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
249                 mark_page_dirty_in_slot(kvm, slot, gfn);
250         }
251 }
252
253 /**
254  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
255  *
256  * @kvm: kvm instance
257  * @sp: the new page
258  * @shared: This operation may not be running under the exclusive use of
259  *          the MMU lock and the operation must synchronize with other
260  *          threads that might be adding or removing pages.
261  * @account_nx: This page replaces a NX large page and should be marked for
262  *              eventual reclaim.
263  */
264 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
265                               bool shared, bool account_nx)
266 {
267         if (shared)
268                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
269         else
270                 lockdep_assert_held_write(&kvm->mmu_lock);
271
272         list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
273         if (account_nx)
274                 account_huge_nx_page(kvm, sp);
275
276         if (shared)
277                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
278 }
279
280 /**
281  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
282  *
283  * @kvm: kvm instance
284  * @sp: the page to be removed
285  * @shared: This operation may not be running under the exclusive use of
286  *          the MMU lock and the operation must synchronize with other
287  *          threads that might be adding or removing pages.
288  */
289 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
290                                 bool shared)
291 {
292         if (shared)
293                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
294         else
295                 lockdep_assert_held_write(&kvm->mmu_lock);
296
297         list_del(&sp->link);
298         if (sp->lpage_disallowed)
299                 unaccount_huge_nx_page(kvm, sp);
300
301         if (shared)
302                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
303 }
304
305 /**
306  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
307  *
308  * @kvm: kvm instance
309  * @pt: the page removed from the paging structure
310  * @shared: This operation may not be running under the exclusive use
311  *          of the MMU lock and the operation must synchronize with other
312  *          threads that might be modifying SPTEs.
313  *
314  * Given a page table that has been removed from the TDP paging structure,
315  * iterates through the page table to clear SPTEs and free child page tables.
316  *
317  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
318  * protection. Since this thread removed it from the paging structure,
319  * this thread will be responsible for ensuring the page is freed. Hence the
320  * early rcu_dereferences in the function.
321  */
322 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
323                                         bool shared)
324 {
325         struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
326         int level = sp->role.level;
327         gfn_t base_gfn = sp->gfn;
328         u64 old_child_spte;
329         u64 *sptep;
330         gfn_t gfn;
331         int i;
332
333         trace_kvm_mmu_prepare_zap_page(sp);
334
335         tdp_mmu_unlink_page(kvm, sp, shared);
336
337         for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
338                 sptep = rcu_dereference(pt) + i;
339                 gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
340
341                 if (shared) {
342                         /*
343                          * Set the SPTE to a nonpresent value that other
344                          * threads will not overwrite. If the SPTE was
345                          * already marked as removed, another thread
346                          * handling a page fault could overwrite it, so
347                          * retry the exchange until the value read back
348                          * is something other than the removed SPTE value.
349                          */
350                         for (;;) {
351                                 old_child_spte = xchg(sptep, REMOVED_SPTE);
352                                 if (!is_removed_spte(old_child_spte))
353                                         break;
354                                 cpu_relax();
355                         }
356                 } else {
357                         /*
358                          * If the SPTE is not MMU-present, there is no backing
359                          * page associated with the SPTE and so no side effects
360                          * that need to be recorded, and exclusive ownership of
361                          * mmu_lock ensures the SPTE can't be made present.
362                          * Note, zapping MMIO SPTEs is also unnecessary as they
363                          * are guarded by the memslots generation, not by being
364                          * unreachable.
365                          */
366                         old_child_spte = READ_ONCE(*sptep);
367                         if (!is_shadow_present_pte(old_child_spte))
368                                 continue;
369
370                         /*
371                          * Marking the SPTE as a removed SPTE is not
372                          * strictly necessary here as the MMU lock will
373                          * stop other threads from concurrently modifying
374                          * this SPTE. Using the removed SPTE value keeps
375                          * the two branches consistent and simplifies
376                          * the function.
377                          */
378                         WRITE_ONCE(*sptep, REMOVED_SPTE);
379                 }
380                 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
381                                     old_child_spte, REMOVED_SPTE, level,
382                                     shared);
383         }
384
385         kvm_flush_remote_tlbs_with_address(kvm, gfn,
386                                            KVM_PAGES_PER_HPAGE(level + 1));
387
388         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
389 }
390
391 /**
392  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
393  * @kvm: kvm instance
394  * @as_id: the address space of the paging structure the SPTE was a part of
395  * @gfn: the base GFN that was mapped by the SPTE
396  * @old_spte: The value of the SPTE before the change
397  * @new_spte: The value of the SPTE after the change
398  * @level: the level of the PT the SPTE is part of in the paging structure
399  * @shared: This operation may not be running under the exclusive use of
400  *          the MMU lock and the operation must synchronize with other
401  *          threads that might be modifying SPTEs.
402  *
403  * Handle bookkeeping that might result from the modification of a SPTE.
404  * This function must be called for all TDP SPTE modifications.
405  */
406 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
407                                   u64 old_spte, u64 new_spte, int level,
408                                   bool shared)
409 {
410         bool was_present = is_shadow_present_pte(old_spte);
411         bool is_present = is_shadow_present_pte(new_spte);
412         bool was_leaf = was_present && is_last_spte(old_spte, level);
413         bool is_leaf = is_present && is_last_spte(new_spte, level);
414         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
415
416         WARN_ON(level > PT64_ROOT_MAX_LEVEL);
417         WARN_ON(level < PG_LEVEL_4K);
418         WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
419
420         /*
421          * If this warning were to trigger it would indicate that there was a
422          * missing MMU notifier or a race with some notifier handler.
423          * A present, leaf SPTE should never be directly replaced with another
424          * present leaf SPTE pointing to a different PFN. A notifier handler
425          * should be zapping the SPTE before the main MM's page table is
426          * changed, or the SPTE should be zeroed, and the TLBs flushed by the
427          * thread before replacement.
428          */
429         if (was_leaf && is_leaf && pfn_changed) {
430                 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
431                        "SPTE with another present leaf SPTE mapping a\n"
432                        "different PFN!\n"
433                        "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
434                        as_id, gfn, old_spte, new_spte, level);
435
436                 /*
437                  * Crash the host to prevent error propagation and guest data
438                  * corruption.
439                  */
440                 BUG();
441         }
442
443         if (old_spte == new_spte)
444                 return;
445
446         trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
447
448         if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
449                 if (is_large_pte(old_spte))
450                         atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
451                 else
452                         atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
453         }
454
455         /*
456          * The only time a SPTE should be changed from a non-present to a
457          * non-present state is when an MMIO entry is installed, modified, or
458          * removed. In that case, there is nothing to do here.
459          */
460         if (!was_present && !is_present) {
461                 /*
462                  * If this change does not involve a MMIO SPTE or removed SPTE,
463                  * it is unexpected. Log the change, though it should not
464                  * impact the guest since both the former and current SPTEs
465                  * are nonpresent.
466                  */
467                 if (WARN_ON(!is_mmio_spte(old_spte) &&
468                             !is_mmio_spte(new_spte) &&
469                             !is_removed_spte(new_spte)))
470                         pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
471                                "should not be replaced with another,\n"
472                                "different nonpresent SPTE, unless one or both\n"
473                                "are MMIO SPTEs, or the new SPTE is\n"
474                                "a temporary removed SPTE.\n"
475                                "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
476                                as_id, gfn, old_spte, new_spte, level);
477                 return;
478         }
479
480
481         if (was_leaf && is_dirty_spte(old_spte) &&
482             (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
483                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
484
485         /*
486          * Recursively handle child PTs if the change removed a subtree from
487          * the paging structure.
488          */
489         if (was_present && !was_leaf && (pfn_changed || !is_present))
490                 handle_removed_tdp_mmu_page(kvm,
491                                 spte_to_child_pt(old_spte, level), shared);
492 }
493
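/*
 * Wrapper around __handle_changed_spte() that also performs the accessed
 * tracking and dirty logging bookkeeping for the SPTE change.
 */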
494 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
495                                 u64 old_spte, u64 new_spte, int level,
496                                 bool shared)
497 {
498         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
499                               shared);
500         handle_changed_spte_acc_track(old_spte, new_spte, level);
501         handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
502                                       new_spte, level);
503 }
504
505 /*
506  * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
507  * and handle the associated bookkeeping, but do not mark the page dirty
508  * in KVM's dirty bitmaps.
509  *
510  * @kvm: kvm instance
511  * @iter: a tdp_iter instance currently on the SPTE that should be set
512  * @new_spte: The value the SPTE should be set to
513  * Returns: true if the SPTE was set, false if it was not. If false is returned,
514  *          this function will have no side-effects.
515  */
516 static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
517                                                         struct tdp_iter *iter,
518                                                         u64 new_spte)
519 {
520         lockdep_assert_held_read(&kvm->mmu_lock);
521
522         /*
523          * Do not change removed SPTEs. Only the thread that froze the SPTE
524          * may modify it.
525          */
526         if (is_removed_spte(iter->old_spte))
527                 return false;
528
529         if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
530                       new_spte) != iter->old_spte)
531                 return false;
532
533         __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
534                               new_spte, iter->level, true);
535         handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
536
537         return true;
538 }
539
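/*
 * tdp_mmu_set_spte_atomic - like tdp_mmu_set_spte_atomic_no_dirty_log(), but
 * also updates KVM's dirty bitmap if the change makes the SPTE writable.
 */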
540 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
541                                            struct tdp_iter *iter,
542                                            u64 new_spte)
543 {
544         if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
545                 return false;
546
547         handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
548                                       iter->old_spte, new_spte, iter->level);
549         return true;
550 }
551
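/*
 * Zap an SPTE while holding mmu_lock for read: freeze the SPTE with the
 * special removed value so concurrent faults cannot reuse it, flush remote
 * TLBs for the range it covered, then clear it to zero.
 */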
552 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
553                                            struct tdp_iter *iter)
554 {
555         /*
556          * Freeze the SPTE by setting it to a special,
557          * non-present value. This will stop other threads from
558          * immediately installing a present entry in its place
559          * before the TLBs are flushed.
560          */
561         if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
562                 return false;
563
564         kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
565                                            KVM_PAGES_PER_HPAGE(iter->level));
566
567         /*
568          * No other thread can overwrite the removed SPTE as they
569          * must either wait on the MMU lock or use
570          * tdp_mmu_set_spte_atomic which will not overwrite the
571          * special removed SPTE value. No bookkeeping is needed
572          * here since the SPTE is going from non-present
573          * to non-present.
574          */
575         WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
576
577         return true;
578 }
579
580
581 /*
582  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
583  * @kvm: kvm instance
584  * @iter: a tdp_iter instance currently on the SPTE that should be set
585  * @new_spte: The value the SPTE should be set to
586  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
587  *                    of the page. Should be set unless handling an MMU
588  *                    notifier for access tracking. Leaving record_acc_track
589  *                    unset in that case prevents page accesses from being
590  *                    double counted.
591  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
592  *                    appropriate for the change being made. Should be set
593  *                    unless performing certain dirty logging operations.
594  *                    Leaving record_dirty_log unset in that case prevents page
595  *                    writes from being double counted.
596  */
597 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
598                                       u64 new_spte, bool record_acc_track,
599                                       bool record_dirty_log)
600 {
601         lockdep_assert_held_write(&kvm->mmu_lock);
602
603         /*
604          * No thread should be using this function to set SPTEs to the
605          * temporary removed SPTE value.
606          * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
607          * should be used. If operating under the MMU lock in write mode, the
608          * use of the removed SPTE should not be necessary.
609          */
610         WARN_ON(is_removed_spte(iter->old_spte));
611
612         WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
613
614         __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
615                               new_spte, iter->level, false);
616         if (record_acc_track)
617                 handle_changed_spte_acc_track(iter->old_spte, new_spte,
618                                               iter->level);
619         if (record_dirty_log)
620                 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
621                                               iter->old_spte, new_spte,
622                                               iter->level);
623 }
624
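/*
 * Convenience wrappers around __tdp_mmu_set_spte() for callers that hold
 * mmu_lock for write, selecting whether accessed-tracking and/or dirty-log
 * updates should accompany the SPTE change.
 */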
625 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
626                                     u64 new_spte)
627 {
628         __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
629 }
630
631 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
632                                                  struct tdp_iter *iter,
633                                                  u64 new_spte)
634 {
635         __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
636 }
637
638 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
639                                                  struct tdp_iter *iter,
640                                                  u64 new_spte)
641 {
642         __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
643 }
644
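/*
 * Iterators over the SPTEs of a single root: every SPTE in the GFN range,
 * only the present leaf SPTEs, or (for tdp_mmu_for_each_pte) the SPTEs
 * reachable from the given MMU's current root.
 */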
645 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
646         for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
647
648 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)  \
649         tdp_root_for_each_pte(_iter, _root, _start, _end)               \
650                 if (!is_shadow_present_pte(_iter.old_spte) ||           \
651                     !is_last_spte(_iter.old_spte, _iter.level))         \
652                         continue;                                       \
653                 else
654
655 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
656         for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
657                          _mmu->shadow_root_level, _start, _end)
658
659 /*
660  * Yield if the MMU lock is contended or this thread needs to return control
661  * to the scheduler.
662  *
663  * If this function should yield and flush is set, it will perform a remote
664  * TLB flush before yielding.
665  *
666  * If this function yields, it will also reset the tdp_iter's walk over the
667  * paging structure and the calling function should skip to the next
668  * iteration to allow the iterator to continue its traversal from the
669  * paging structure root.
670  *
671  * Return true if this function yielded and the iterator's traversal was reset.
672  * Return false if a yield was not needed.
673  */
674 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
675                                              struct tdp_iter *iter, bool flush,
676                                              bool shared)
677 {
678         /* Ensure forward progress has been made before yielding. */
679         if (iter->next_last_level_gfn == iter->yielded_gfn)
680                 return false;
681
682         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
683                 rcu_read_unlock();
684
685                 if (flush)
686                         kvm_flush_remote_tlbs(kvm);
687
688                 if (shared)
689                         cond_resched_rwlock_read(&kvm->mmu_lock);
690                 else
691                         cond_resched_rwlock_write(&kvm->mmu_lock);
692
693                 rcu_read_lock();
694
695                 WARN_ON(iter->gfn > iter->next_last_level_gfn);
696
697                 tdp_iter_restart(iter);
698
699                 return true;
700         }
701
702         return false;
703 }
704
705 /*
706  * Tears down the mappings for the range of gfns, [start, end), and frees the
707  * non-root pages mapping GFNs strictly within that range. Returns true if
708  * SPTEs have been cleared and a TLB flush is needed before releasing the
709  * MMU lock.
710  *
711  * If can_yield is true, will release the MMU lock and reschedule if the
712  * scheduler needs the CPU or there is contention on the MMU lock. If this
713  * function cannot yield, it will not release the MMU lock or reschedule and
714  * the caller must ensure it does not supply too large a GFN range, or the
715  * operation can cause a soft lockup.
716  *
717  * If shared is true, this thread holds the MMU lock in read mode and must
718  * account for the possibility that other threads are modifying the paging
719  * structures concurrently. If shared is false, this thread should hold the
720  * MMU lock in write mode.
721  */
722 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
723                           gfn_t start, gfn_t end, bool can_yield, bool flush,
724                           bool shared)
725 {
726         gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
727         bool zap_all = (start == 0 && end >= max_gfn_host);
728         struct tdp_iter iter;
729
730         /*
731          * No need to try to step down in the iterator when zapping all SPTEs;
732          * zapping the top-level non-leaf SPTEs will recurse on their children.
733          */
734         int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
735
736         /*
737          * Bound the walk at host.MAXPHYADDR; guest accesses beyond that will
738          * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
739          * and so KVM will never install a SPTE for such addresses.
740          */
741         end = min(end, max_gfn_host);
742
743         kvm_lockdep_assert_mmu_lock_held(kvm, shared);
744
745         rcu_read_lock();
746
747         for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
748                                    min_level, start, end) {
749 retry:
750                 if (can_yield &&
751                     tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
752                         flush = false;
753                         continue;
754                 }
755
756                 if (!is_shadow_present_pte(iter.old_spte))
757                         continue;
758
759                 /*
760                  * If this is a non-last-level SPTE that covers a larger range
761                  * than should be zapped, continue, and zap the mappings at a
762                  * lower level, except when zapping all SPTEs.
763                  */
764                 if (!zap_all &&
765                     (iter.gfn < start ||
766                      iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
767                     !is_last_spte(iter.old_spte, iter.level))
768                         continue;
769
770                 if (!shared) {
771                         tdp_mmu_set_spte(kvm, &iter, 0);
772                         flush = true;
773                 } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
774                         /*
775                          * The iter must explicitly re-read the SPTE because
776                          * the atomic cmpxchg failed.
777                          */
778                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
779                         goto retry;
780                 }
781         }
782
783         rcu_read_unlock();
784         return flush;
785 }
786
787 /*
788  * Tears down the mappings for the range of gfns, [start, end), and frees the
789  * non-root pages mapping GFNs strictly within that range. Returns true if
790  * SPTEs have been cleared and a TLB flush is needed before releasing the
791  * MMU lock.
792  *
793  * If shared is true, this thread holds the MMU lock in read mode and must
794  * account for the possibility that other threads are modifying the paging
795  * structures concurrently. If shared is false, this thread should hold the
796  * MMU in write mode.
797  */
798 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
799                                  gfn_t end, bool can_yield, bool flush,
800                                  bool shared)
801 {
802         struct kvm_mmu_page *root;
803
804         for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
805                 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
806                                       shared);
807
808         return flush;
809 }
810
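/*
 * Zap all SPTEs under every root in every address space. The -1ull end GFN
 * is clamped to the host's maximum possible GFN by zap_gfn_range(), and a
 * single TLB flush is issued at the end if anything was zapped.
 */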
811 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
812 {
813         bool flush = false;
814         int i;
815
816         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
817                 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
818                                                   flush, false);
819
820         if (flush)
821                 kvm_flush_remote_tlbs(kvm);
822 }
823
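/*
 * Return the next root on the list that has been marked invalid but still
 * has a non-zero refcount, i.e. a root that kvm_tdp_mmu_invalidate_all_roots()
 * took a reference on. No new reference is taken here; the caller relies on
 * the reference acquired when the root was invalidated, and is expected to
 * hold rcu_read_lock() while walking the list.
 */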
824 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
825                                                   struct kvm_mmu_page *prev_root)
826 {
827         struct kvm_mmu_page *next_root;
828
829         if (prev_root)
830                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
831                                                   &prev_root->link,
832                                                   typeof(*prev_root), link);
833         else
834                 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
835                                                    typeof(*next_root), link);
836
837         while (next_root && !(next_root->role.invalid &&
838                               refcount_read(&next_root->tdp_mmu_root_count)))
839                 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
840                                                   &next_root->link,
841                                                   typeof(*next_root), link);
842
843         return next_root;
844 }
845
846 /*
847  * Since kvm_tdp_mmu_invalidate_all_roots() has acquired a reference to each
848  * invalidated root, they will not be freed until this function drops the
849  * reference. Before dropping that reference, tear down the paging
850  * structure so that whichever thread does drop the last reference
851  * only has to do a trivial amount of work. Since the roots are invalid,
852  * no new SPTEs should be created under them.
853  */
854 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
855 {
856         struct kvm_mmu_page *next_root;
857         struct kvm_mmu_page *root;
858         bool flush = false;
859
860         lockdep_assert_held_read(&kvm->mmu_lock);
861
862         rcu_read_lock();
863
864         root = next_invalidated_root(kvm, NULL);
865
866         while (root) {
867                 next_root = next_invalidated_root(kvm, root);
868
869                 rcu_read_unlock();
870
871                 flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
872
873                 /*
874                  * Put the reference acquired in
875                  * kvm_tdp_mmu_invalidate_all_roots()
876                  */
877                 kvm_tdp_mmu_put_root(kvm, root, true);
878
879                 root = next_root;
880
881                 rcu_read_lock();
882         }
883
884         rcu_read_unlock();
885
886         if (flush)
887                 kvm_flush_remote_tlbs(kvm);
888 }
889
890 /*
891  * Mark each TDP MMU root as invalid so that other threads
892  * will drop their references and allow the root count to
893  * go to 0.
894  *
895  * Also take a reference on all roots so that this thread
896  * can do the bulk of the work required to free the roots
897  * once they are invalidated. Without this reference, a
898  * vCPU thread might drop the last reference to a root and
899  * get stuck with tearing down the entire paging structure.
900  *
901  * Roots which have a zero refcount should be skipped as
902  * they're already being torn down.
903  * Already invalid roots should be referenced again so that
904  * they aren't freed before kvm_tdp_mmu_zap_invalidated_roots is
905  * done with them.
906  *
907  * This has essentially the same effect for the TDP MMU
908  * as updating mmu_valid_gen does for the shadow MMU.
909  */
910 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
911 {
912         struct kvm_mmu_page *root;
913
914         lockdep_assert_held_write(&kvm->mmu_lock);
915         list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
916                 if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
917                         root->role.invalid = true;
918 }
919
920 /*
921  * Installs a last-level SPTE to handle a TDP page fault.
922  * (NPT/EPT violation/misconfiguration)
923  */
924 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
925                                           int map_writable,
926                                           struct tdp_iter *iter,
927                                           kvm_pfn_t pfn, bool prefault)
928 {
929         u64 new_spte;
930         int ret = RET_PF_FIXED;
931         int make_spte_ret = 0;
932
933         if (unlikely(is_noslot_pfn(pfn)))
934                 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
935         else
936                 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
937                                          pfn, iter->old_spte, prefault, true,
938                                          map_writable, !shadow_accessed_mask,
939                                          &new_spte);
940
941         if (new_spte == iter->old_spte)
942                 ret = RET_PF_SPURIOUS;
943         else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
944                 return RET_PF_RETRY;
945
946         /*
947          * If the page fault was caused by a write but the page is write
948          * protected, emulation is needed. If emulation were skipped,
949          * the vCPU would take the same fault again.
950          */
951         if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
952                 if (write)
953                         ret = RET_PF_EMULATE;
954                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
955         }
956
957         /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
958         if (unlikely(is_mmio_spte(new_spte))) {
959                 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
960                                      new_spte);
961                 ret = RET_PF_EMULATE;
962         } else {
963                 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
964                                        rcu_dereference(iter->sptep));
965         }
966
967         /*
968          * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
969          * consistent with legacy MMU behavior.
970          */
971         if (ret != RET_PF_SPURIOUS)
972                 vcpu->stat.pf_fixed++;
973
974         return ret;
975 }
976
977 /*
978  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
979  * page tables and SPTEs to translate the faulting guest physical address.
980  */
981 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
982                     int map_writable, int max_level, kvm_pfn_t pfn,
983                     bool prefault)
984 {
985         bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
986         bool write = error_code & PFERR_WRITE_MASK;
987         bool exec = error_code & PFERR_FETCH_MASK;
988         bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
989         struct kvm_mmu *mmu = vcpu->arch.mmu;
990         struct tdp_iter iter;
991         struct kvm_mmu_page *sp;
992         u64 *child_pt;
993         u64 new_spte;
994         int ret;
995         gfn_t gfn = gpa >> PAGE_SHIFT;
996         int level;
997         int req_level;
998
999         level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
1000                                         huge_page_disallowed, &req_level);
1001
1002         trace_kvm_mmu_spte_requested(gpa, level, pfn);
1003
1004         rcu_read_lock();
1005
1006         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1007                 if (nx_huge_page_workaround_enabled)
1008                         disallowed_hugepage_adjust(iter.old_spte, gfn,
1009                                                    iter.level, &pfn, &level);
1010
1011                 if (iter.level == level)
1012                         break;
1013
1014                 /*
1015                  * If there is an SPTE mapping a large page at a higher level
1016                  * than the target, that SPTE must be cleared and replaced
1017                  * with a non-leaf SPTE.
1018                  */
1019                 if (is_shadow_present_pte(iter.old_spte) &&
1020                     is_large_pte(iter.old_spte)) {
1021                         if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1022                                 break;
1023
1024                         /*
1025                          * The iter must explicitly re-read the spte here
1026                          * because the new value informs the !present
1027                          * path below.
1028                          */
1029                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1030                 }
1031
1032                 if (!is_shadow_present_pte(iter.old_spte)) {
1033                         /*
1034                          * If SPTE has been frozen by another thread, just
1035                          * give up and retry, avoiding unnecessary page table
1036                          * allocation and free.
1037                          */
1038                         if (is_removed_spte(iter.old_spte))
1039                                 break;
1040
1041                         sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
1042                         child_pt = sp->spt;
1043
1044                         new_spte = make_nonleaf_spte(child_pt,
1045                                                      !shadow_accessed_mask);
1046
1047                         if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
1048                                                     new_spte)) {
1049                                 tdp_mmu_link_page(vcpu->kvm, sp, true,
1050                                                   huge_page_disallowed &&
1051                                                   req_level >= iter.level);
1052
1053                                 trace_kvm_mmu_get_page(sp, true);
1054                         } else {
1055                                 tdp_mmu_free_sp(sp);
1056                                 break;
1057                         }
1058                 }
1059         }
1060
1061         if (iter.level != level) {
1062                 rcu_read_unlock();
1063                 return RET_PF_RETRY;
1064         }
1065
1066         ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
1067                                               pfn, prefault);
1068         rcu_read_unlock();
1069
1070         return ret;
1071 }
1072
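/*
 * MMU notifier unmap hook: zap the notifier range under every root in the
 * affected address space. Runs with mmu_lock held for write (shared=false)
 * and only yields if the notifier range allows blocking.
 */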
1073 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1074                                  bool flush)
1075 {
1076         struct kvm_mmu_page *root;
1077
1078         for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
1079                 flush |= zap_gfn_range(kvm, root, range->start, range->end,
1080                                        range->may_block, flush, false);
1081
1082         return flush;
1083 }
1084
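/*
 * Callback type and common walker for the range-based MMU notifiers: the
 * handler is invoked on each present leaf SPTE in the notifier range, and
 * its return values are OR'd together to form the result reported back to
 * the caller (e.g. "flush needed" or "page was young").
 */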
1085 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1086                               struct kvm_gfn_range *range);
1087
1088 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1089                                                    struct kvm_gfn_range *range,
1090                                                    tdp_handler_t handler)
1091 {
1092         struct kvm_mmu_page *root;
1093         struct tdp_iter iter;
1094         bool ret = false;
1095
1096         rcu_read_lock();
1097
1098         /*
1099          * Don't support rescheduling; none of the MMU notifiers that funnel
1100          * into this helper allow blocking; it'd be dead, wasteful code.
1101          */
1102         for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1103                 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1104                         ret |= handler(kvm, &iter, range);
1105         }
1106
1107         rcu_read_unlock();
1108
1109         return ret;
1110 }
1111
1112 /*
1113  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1114  * if any of the GFNs in the range have been accessed.
1115  */
1116 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1117                           struct kvm_gfn_range *range)
1118 {
1119         u64 new_spte = 0;
1120
1121         /* If we have a non-accessed entry we don't need to change the pte. */
1122         if (!is_accessed_spte(iter->old_spte))
1123                 return false;
1124
1125         new_spte = iter->old_spte;
1126
1127         if (spte_ad_enabled(new_spte)) {
1128                 new_spte &= ~shadow_accessed_mask;
1129         } else {
1130                 /*
1131                  * Capture the dirty status of the page, so that it doesn't get
1132                  * lost when the SPTE is marked for access tracking.
1133                  */
1134                 if (is_writable_pte(new_spte))
1135                         kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1136
1137                 new_spte = mark_spte_for_access_track(new_spte);
1138         }
1139
1140         tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1141
1142         return true;
1143 }
1144
1145 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1146 {
1147         return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1148 }
1149
1150 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1151                          struct kvm_gfn_range *range)
1152 {
1153         return is_accessed_spte(iter->old_spte);
1154 }
1155
1156 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1157 {
1158         return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1159 }
1160
1161 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1162                          struct kvm_gfn_range *range)
1163 {
1164         u64 new_spte;
1165
1166         /* Huge pages aren't expected to be modified without first being zapped. */
1167         WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1168
1169         if (iter->level != PG_LEVEL_4K ||
1170             !is_shadow_present_pte(iter->old_spte))
1171                 return false;
1172
1173         /*
1174          * Note, when changing a read-only SPTE, it's not strictly necessary to
1175          * zero the SPTE before setting the new PFN, but doing so preserves the
1176          * invariant that the PFN of a present leaf SPTE can never change.
1177          * See __handle_changed_spte().
1178          */
1179         tdp_mmu_set_spte(kvm, iter, 0);
1180
1181         if (!pte_write(range->pte)) {
1182                 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1183                                                                   pte_pfn(range->pte));
1184
1185                 tdp_mmu_set_spte(kvm, iter, new_spte);
1186         }
1187
1188         return true;
1189 }
1190
1191 /*
1192  * Handle the changed_pte MMU notifier for the TDP MMU.
1193  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1194  * notifier.
1195  * Returns non-zero if a flush is needed before releasing the MMU lock.
1196  */
1197 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1198 {
1199         bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1200
1201         /* FIXME: return 'flush' instead of flushing here. */
1202         if (flush)
1203                 kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1204
1205         return false;
1206 }
1207
1208 /*
1209  * Remove write access from all SPTEs at or above min_level that map GFNs
1210  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1211  * be flushed.
1212  */
1213 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1214                              gfn_t start, gfn_t end, int min_level)
1215 {
1216         struct tdp_iter iter;
1217         u64 new_spte;
1218         bool spte_set = false;
1219
1220         rcu_read_lock();
1221
1222         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1223
1224         for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1225                                    min_level, start, end) {
1226 retry:
1227                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1228                         continue;
1229
1230                 if (!is_shadow_present_pte(iter.old_spte) ||
1231                     !is_last_spte(iter.old_spte, iter.level) ||
1232                     !(iter.old_spte & PT_WRITABLE_MASK))
1233                         continue;
1234
1235                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1236
1237                 if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1238                                                           new_spte)) {
1239                         /*
1240                          * The iter must explicitly re-read the SPTE because
1241                          * the atomic cmpxchg failed.
1242                          */
1243                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1244                         goto retry;
1245                 }
1246                 spte_set = true;
1247         }
1248
1249         rcu_read_unlock();
1250         return spte_set;
1251 }
1252
1253 /*
1254  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1255  * only affect leaf SPTEs down to min_level.
1256  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1257  */
1258 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1259                              int min_level)
1260 {
1261         struct kvm_mmu_page *root;
1262         bool spte_set = false;
1263
1264         lockdep_assert_held_read(&kvm->mmu_lock);
1265
1266         for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1267                 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1268                              slot->base_gfn + slot->npages, min_level);
1269
1270         return spte_set;
1271 }
1272
1273 /*
1274  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1275  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1276  * If AD bits are not enabled, this will require clearing the writable bit on
1277  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1278  * be flushed.
1279  */
1280 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1281                            gfn_t start, gfn_t end)
1282 {
1283         struct tdp_iter iter;
1284         u64 new_spte;
1285         bool spte_set = false;
1286
1287         rcu_read_lock();
1288
1289         tdp_root_for_each_leaf_pte(iter, root, start, end) {
1290 retry:
1291                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1292                         continue;
1293
1294                 if (spte_ad_need_write_protect(iter.old_spte)) {
1295                         if (is_writable_pte(iter.old_spte))
1296                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1297                         else
1298                                 continue;
1299                 } else {
1300                         if (iter.old_spte & shadow_dirty_mask)
1301                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1302                         else
1303                                 continue;
1304                 }
1305
1306                 if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1307                                                           new_spte)) {
1308                         /*
1309                          * The iter must explicitly re-read the SPTE because
1310                          * the atomic cmpxchg failed.
1311                          */
1312                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1313                         goto retry;
1314                 }
1315                 spte_set = true;
1316         }
1317
1318         rcu_read_unlock();
1319         return spte_set;
1320 }
1321
1322 /*
1323  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1324  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1325  * If AD bits are not enabled, this will require clearing the writable bit on
1326  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1327  * be flushed.
1328  */
1329 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1330 {
1331         struct kvm_mmu_page *root;
1332         bool spte_set = false;
1333
1334         lockdep_assert_held_read(&kvm->mmu_lock);
1335
1336         for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1337                 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1338                                 slot->base_gfn + slot->npages);
1339
1340         return spte_set;
1341 }
1342
1343 /*
1344  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1345  * set in mask, starting at gfn. The given memslot is expected to contain all
1346  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1347  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1348  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1349  */
1350 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1351                                   gfn_t gfn, unsigned long mask, bool wrprot)
1352 {
1353         struct tdp_iter iter;
1354         u64 new_spte;
1355
1356         rcu_read_lock();
1357
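        /*
         * Bit N in mask corresponds to gfn + N, so the walk can start at the
         * first set bit and the loop below bails out once every set bit has
         * been consumed. E.g. a mask of 0x5 touches only gfn + 0 and gfn + 2.
         */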
1358         tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1359                                     gfn + BITS_PER_LONG) {
1360                 if (!mask)
1361                         break;
1362
1363                 if (iter.level > PG_LEVEL_4K ||
1364                     !(mask & (1UL << (iter.gfn - gfn))))
1365                         continue;
1366
1367                 mask &= ~(1UL << (iter.gfn - gfn));
1368
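                /*
                 * If the caller asked for write protection (wrprot), or the
                 * SPTE tracks dirty state via write protection rather than
                 * A/D bits, clear the writable bit; otherwise clear the
                 * hardware dirty bit.
                 */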
1369                 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1370                         if (is_writable_pte(iter.old_spte))
1371                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1372                         else
1373                                 continue;
1374                 } else {
1375                         if (iter.old_spte & shadow_dirty_mask)
1376                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1377                         else
1378                                 continue;
1379                 }
1380
1381                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1382         }
1383
1384         rcu_read_unlock();
1385 }
1386
1387 /*
1388  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1389  * set in mask, starting at gfn. The given memslot is expected to contain all
1390  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1391  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1392  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1393  */
1394 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1395                                        struct kvm_memory_slot *slot,
1396                                        gfn_t gfn, unsigned long mask,
1397                                        bool wrprot)
1398 {
1399         struct kvm_mmu_page *root;
1400
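        /*
         * This path runs under the MMU write lock, so clear_dirty_pt_masked()
         * can use the non-atomic tdp_mmu_set_spte_no_dirty_log() helper.
         */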
1401         lockdep_assert_held_write(&kvm->mmu_lock);
1402         for_each_tdp_mmu_root(kvm, root, slot->as_id)
1403                 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1404 }
1405
1406 /*
1407  * Clear leaf entries which could be replaced by large mappings, for
1408  * GFNs within the slot.
1409  */
1410 static bool zap_collapsible_spte_range(struct kvm *kvm,
1411                                        struct kvm_mmu_page *root,
1412                                        const struct kvm_memory_slot *slot,
1413                                        bool flush)
1414 {
1415         gfn_t start = slot->base_gfn;
1416         gfn_t end = start + slot->npages;
1417         struct tdp_iter iter;
1418         kvm_pfn_t pfn;
1419
1420         rcu_read_lock();
1421
1422         tdp_root_for_each_pte(iter, root, start, end) {
1423 retry:
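                /*
                 * flush is cleared below because a yield with flush == true
                 * is expected to perform the pending TLB flush before
                 * dropping the lock.
                 */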
1424                 if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1425                         flush = false;
1426                         continue;
1427                 }
1428
1429                 if (!is_shadow_present_pte(iter.old_spte) ||
1430                     !is_last_spte(iter.old_spte, iter.level))
1431                         continue;
1432
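                /*
                 * Leave SPTEs that map reserved PFNs, or that are already at
                 * the largest level the slot and backing page allow, in
                 * place; only zap leaf SPTEs that could be part of a larger
                 * mapping so the next fault can recreate them as huge pages.
                 */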
1433                 pfn = spte_to_pfn(iter.old_spte);
1434                 if (kvm_is_reserved_pfn(pfn) ||
1435                     iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1436                                                             pfn, PG_LEVEL_NUM))
1437                         continue;
1438
1439                 if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
1440                         /*
1441                          * The iter must explicitly re-read the SPTE because
1442                          * the atomic cmpxchg failed.
1443                          */
1444                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1445                         goto retry;
1446                 }
1447                 flush = true;
1448         }
1449
1450         rcu_read_unlock();
1451
1452         return flush;
1453 }
1454
1455 /*
1456  * Clear leaf entries which could be replaced by large mappings, for GFNs
1457  * within the slot. The page table pages themselves are left in place.
1458  */
1459 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1460                                        const struct kvm_memory_slot *slot,
1461                                        bool flush)
1462 {
1463         struct kvm_mmu_page *root;
1464
1465         lockdep_assert_held_read(&kvm->mmu_lock);
1466
1467         for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1468                 flush = zap_collapsible_spte_range(kvm, root, slot, flush);
1469
1470         return flush;
1471 }
1472
1473 /*
1474  * Removes write access on the last level SPTE mapping this GFN and unsets the
1475  * MMU-writable bit to ensure future writes continue to be intercepted.
1476  * Returns true if an SPTE was set and a TLB flush is needed.
1477  */
1478 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1479                               gfn_t gfn, int min_level)
1480 {
1481         struct tdp_iter iter;
1482         u64 new_spte;
1483         bool spte_set = false;
1484
1485         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1486
1487         rcu_read_lock();
1488
1489         for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1490                                    min_level, gfn, gfn + 1) {
1491                 if (!is_shadow_present_pte(iter.old_spte) ||
1492                     !is_last_spte(iter.old_spte, iter.level))
1493                         continue;
1494
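                /*
                 * The walk follows a single translation path for this GFN, so
                 * at most one last-level SPTE is encountered; if it is
                 * already read-only there is nothing left to do.
                 */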
1495                 if (!is_writable_pte(iter.old_spte))
1496                         break;
1497
1498                 new_spte = iter.old_spte &
1499                         ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1500
1501                 tdp_mmu_set_spte(kvm, &iter, new_spte);
1502                 spte_set = true;
1503         }
1504
1505         rcu_read_unlock();
1506
1507         return spte_set;
1508 }
1509
1510 /*
1511  * Removes write access on the last level SPTE mapping this GFN and unsets the
1512  * MMU-writable bit to ensure future writes continue to be intercepted.
1513  * Returns true if an SPTE was set and a TLB flush is needed.
1514  */
1515 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1516                                    struct kvm_memory_slot *slot, gfn_t gfn,
1517                                    int min_level)
1518 {
1519         struct kvm_mmu_page *root;
1520         bool spte_set = false;
1521
1522         lockdep_assert_held_write(&kvm->mmu_lock);
1523         for_each_tdp_mmu_root(kvm, root, slot->as_id)
1524                 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1525
1526         return spte_set;
1527 }
1528
1529 /*
1530  * Return the level of the lowest level SPTE added to sptes, or -1 if no
1531  * SPTE was walked. That SPTE may be non-present.
1532  */
1533 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1534                          int *root_level)
1535 {
1536         struct tdp_iter iter;
1537         struct kvm_mmu *mmu = vcpu->arch.mmu;
1538         gfn_t gfn = addr >> PAGE_SHIFT;
1539         int leaf = -1;
1540
1541         *root_level = vcpu->arch.mmu->shadow_root_level;
1542
1543         rcu_read_lock();
1544
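        /*
         * Each visited level records its SPTE in sptes[level], walking from
         * *root_level down toward the leaf, so on return the entries from the
         * returned level up to *root_level are valid.
         */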
1545         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1546                 leaf = iter.level;
1547                 sptes[leaf] = iter.old_spte;
1548         }
1549
1550         rcu_read_unlock();
1551
1552         return leaf;
1553 }