arch/x86/kvm/mmu/tdp_mmu.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19         if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20                 return;
21
22         /* This should not be changed for the lifetime of the VM. */
23         kvm->arch.tdp_mmu_enabled = true;
24
25         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26         spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 }
29
30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
31 {
32         if (!kvm->arch.tdp_mmu_enabled)
33                 return;
34
35         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
36
37         /*
38          * Ensure that all the outstanding RCU callbacks to free shadow pages
39          * can run before the VM is torn down.
40          */
41         rcu_barrier();
42 }
43
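/* Drop a reference to @root, freeing it if this was the last reference. */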
44 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
45 {
46         if (kvm_mmu_put_root(kvm, root))
47                 kvm_tdp_mmu_free_root(kvm, root);
48 }
49
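/*
 * Returns true and takes a reference on @root if it is a valid entry on the
 * list of TDP MMU roots; returns false once iteration reaches the list head.
 */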
50 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
51                                            struct kvm_mmu_page *root)
52 {
53         lockdep_assert_held_write(&kvm->mmu_lock);
54
55         if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
56                 return false;
57
58         kvm_mmu_get_root(kvm, root);
59         return true;
60
61 }
62
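/*
 * Advance to the next root on the list and drop the reference the yield-safe
 * iterator held on the current root.
 */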
63 static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
64                                                      struct kvm_mmu_page *root)
65 {
66         struct kvm_mmu_page *next_root;
67
68         next_root = list_next_entry(root, link);
69         tdp_mmu_put_root(kvm, root);
70         return next_root;
71 }
72
73 /*
74  * Note: this iterator gets and puts references to the roots it iterates over.
75  * This makes it safe to release the MMU lock and yield within the loop, but
76  * if exiting the loop early, the caller must drop the reference to the most
77  * recent root. (Unless keeping a live reference is desirable.)
78  */
79 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                           \
80         for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,        \
81                                       typeof(*_root), link);            \
82              tdp_mmu_next_root_valid(_kvm, _root);                      \
83              _root = tdp_mmu_next_root(_kvm, _root))
84
85 #define for_each_tdp_mmu_root(_kvm, _root)                              \
86         list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
87
88 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
89                           gfn_t start, gfn_t end, bool can_yield);
90
91 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
92 {
93         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
94
95         lockdep_assert_held_write(&kvm->mmu_lock);
96
97         WARN_ON(root->root_count);
98         WARN_ON(!root->tdp_mmu_page);
99
100         list_del(&root->link);
101
102         zap_gfn_range(kvm, root, 0, max_gfn, false);
103
104         free_page((unsigned long)root->spt);
105         kmem_cache_free(mmu_page_header_cache, root);
106 }
107
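/* Compute the page role used for TDP MMU pages at the given level. */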
108 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
109                                                    int level)
110 {
111         union kvm_mmu_page_role role;
112
113         role = vcpu->arch.mmu->mmu_role.base;
114         role.level = level;
115         role.direct = true;
116         role.gpte_is_8_bytes = true;
117         role.access = ACC_ALL;
118
119         return role;
120 }
121
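/*
 * Allocate a TDP MMU page table page and its struct kvm_mmu_page from the
 * vCPU's memory caches and initialize the shadow-page metadata.
 */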
122 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
123                                                int level)
124 {
125         struct kvm_mmu_page *sp;
126
127         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
128         sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
129         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
130
131         sp->role.word = page_role_for_level(vcpu, level).word;
132         sp->gfn = gfn;
133         sp->tdp_mmu_page = true;
134
135         trace_kvm_mmu_get_page(sp, true);
136
137         return sp;
138 }
139
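/*
 * Return a root matching the vCPU's current MMU role, taking a reference on
 * an existing root or allocating and linking a new one if none exists.
 */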
140 static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
141 {
142         union kvm_mmu_page_role role;
143         struct kvm *kvm = vcpu->kvm;
144         struct kvm_mmu_page *root;
145
146         role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
147
148         write_lock(&kvm->mmu_lock);
149
150         /* Check for an existing root before allocating a new one. */
151         for_each_tdp_mmu_root(kvm, root) {
152                 if (root->role.word == role.word) {
153                         kvm_mmu_get_root(kvm, root);
154                         write_unlock(&kvm->mmu_lock);
155                         return root;
156                 }
157         }
158
159         root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
160         root->root_count = 1;
161
162         list_add(&root->link, &kvm->arch.tdp_mmu_roots);
163
164         write_unlock(&kvm->mmu_lock);
165
166         return root;
167 }
168
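/*
 * Return the physical address of the vCPU's TDP MMU root, for use as the root
 * of the hardware page tables.
 */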
169 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
170 {
171         struct kvm_mmu_page *root;
172
173         root = get_tdp_mmu_vcpu_root(vcpu);
174         if (!root)
175                 return INVALID_PAGE;
176
177         return __pa(root->spt);
178 }
179
180 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
181 {
182         free_page((unsigned long)sp->spt);
183         kmem_cache_free(mmu_page_header_cache, sp);
184 }
185
186 /*
187  * This is called through call_rcu in order to free TDP page table memory
188  * safely with respect to other kernel threads that may be operating on
189  * the memory.
 190  * Because TDP MMU page table memory is only accessed in an RCU read-side
 191  * critical section and is freed only after an RCU grace period, lockless
 192  * walkers are guaranteed not to use the memory after it has been freed.
193  */
194 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
195 {
196         struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
197                                                rcu_head);
198
199         tdp_mmu_free_sp(sp);
200 }
201
202 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
203                                 u64 old_spte, u64 new_spte, int level,
204                                 bool shared);
205
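/* Return the address space ID (SMM vs. non-SMM) of the given shadow page. */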
206 static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
207 {
208         return sp->role.smm ? 1 : 0;
209 }
210
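/*
 * Mark the old PFN accessed if a present leaf SPTE's accessed state was lost,
 * either because the accessed bit was cleared or the SPTE now maps a new PFN.
 */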
211 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
212 {
213         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
214
215         if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
216                 return;
217
218         if (is_accessed_spte(old_spte) &&
219             (!is_accessed_spte(new_spte) || pfn_changed))
220                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
221 }
222
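/*
 * Mark the GFN dirty in its memslot's dirty bitmap when a 4K SPTE becomes
 * writable, since the guest may then dirty the page without faulting.
 */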
223 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
224                                           u64 old_spte, u64 new_spte, int level)
225 {
226         bool pfn_changed;
227         struct kvm_memory_slot *slot;
228
229         if (level > PG_LEVEL_4K)
230                 return;
231
232         pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
233
234         if ((!is_writable_pte(old_spte) || pfn_changed) &&
235             is_writable_pte(new_spte)) {
236                 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
237                 mark_page_dirty_in_slot(kvm, slot, gfn);
238         }
239 }
240
241 /**
242  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
243  *
244  * @kvm: kvm instance
245  * @sp: the new page
246  * @shared: This operation may not be running under the exclusive use of
247  *          the MMU lock and the operation must synchronize with other
248  *          threads that might be adding or removing pages.
249  * @account_nx: This page replaces a NX large page and should be marked for
250  *              eventual reclaim.
251  */
252 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
253                               bool shared, bool account_nx)
254 {
255         if (shared)
256                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
257         else
258                 lockdep_assert_held_write(&kvm->mmu_lock);
259
260         list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
261         if (account_nx)
262                 account_huge_nx_page(kvm, sp);
263
264         if (shared)
265                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
266 }
267
268 /**
269  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
270  *
271  * @kvm: kvm instance
272  * @sp: the page to be removed
273  * @shared: This operation may not be running under the exclusive use of
274  *          the MMU lock and the operation must synchronize with other
275  *          threads that might be adding or removing pages.
276  */
277 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
278                                 bool shared)
279 {
280         if (shared)
281                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
282         else
283                 lockdep_assert_held_write(&kvm->mmu_lock);
284
285         list_del(&sp->link);
286         if (sp->lpage_disallowed)
287                 unaccount_huge_nx_page(kvm, sp);
288
289         if (shared)
290                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
291 }
292
293 /**
294  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
295  *
296  * @kvm: kvm instance
297  * @pt: the page removed from the paging structure
298  * @shared: This operation may not be running under the exclusive use
299  *          of the MMU lock and the operation must synchronize with other
300  *          threads that might be modifying SPTEs.
301  *
302  * Given a page table that has been removed from the TDP paging structure,
303  * iterates through the page table to clear SPTEs and free child page tables.
304  */
305 static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
306                                         bool shared)
307 {
308         struct kvm_mmu_page *sp = sptep_to_sp(pt);
309         int level = sp->role.level;
310         gfn_t base_gfn = sp->gfn;
311         u64 old_child_spte;
312         u64 *sptep;
313         gfn_t gfn;
314         int i;
315
316         trace_kvm_mmu_prepare_zap_page(sp);
317
318         tdp_mmu_unlink_page(kvm, sp, shared);
319
320         for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
321                 sptep = pt + i;
322                 gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
323
324                 if (shared) {
 325                         /*
 326                          * Set the SPTE to a nonpresent value that other
 327                          * threads will not overwrite. If the SPTE was
 328                          * already marked as removed then another thread
 329                          * handling a page fault could overwrite it, so
 330                          * retry the exchange until the SPTE is changed
 331                          * from some other value to the removed value.
 332                          */
333                         for (;;) {
334                                 old_child_spte = xchg(sptep, REMOVED_SPTE);
335                                 if (!is_removed_spte(old_child_spte))
336                                         break;
337                                 cpu_relax();
338                         }
339                 } else {
340                         /*
341                          * If the SPTE is not MMU-present, there is no backing
342                          * page associated with the SPTE and so no side effects
343                          * that need to be recorded, and exclusive ownership of
344                          * mmu_lock ensures the SPTE can't be made present.
345                          * Note, zapping MMIO SPTEs is also unnecessary as they
346                          * are guarded by the memslots generation, not by being
347                          * unreachable.
348                          */
349                         old_child_spte = READ_ONCE(*sptep);
350                         if (!is_shadow_present_pte(old_child_spte))
351                                 continue;
352
353                         /*
354                          * Marking the SPTE as a removed SPTE is not
355                          * strictly necessary here as the MMU lock will
356                          * stop other threads from concurrently modifying
357                          * this SPTE. Using the removed SPTE value keeps
358                          * the two branches consistent and simplifies
359                          * the function.
360                          */
361                         WRITE_ONCE(*sptep, REMOVED_SPTE);
362                 }
363                 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
364                                     old_child_spte, REMOVED_SPTE, level - 1,
365                                     shared);
366         }
367
368         kvm_flush_remote_tlbs_with_address(kvm, gfn,
369                                            KVM_PAGES_PER_HPAGE(level));
370
371         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
372 }
373
374 /**
375  * handle_changed_spte - handle bookkeeping associated with an SPTE change
376  * @kvm: kvm instance
377  * @as_id: the address space of the paging structure the SPTE was a part of
378  * @gfn: the base GFN that was mapped by the SPTE
379  * @old_spte: The value of the SPTE before the change
380  * @new_spte: The value of the SPTE after the change
381  * @level: the level of the PT the SPTE is part of in the paging structure
382  * @shared: This operation may not be running under the exclusive use of
383  *          the MMU lock and the operation must synchronize with other
384  *          threads that might be modifying SPTEs.
385  *
386  * Handle bookkeeping that might result from the modification of a SPTE.
387  * This function must be called for all TDP SPTE modifications.
388  */
389 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
390                                   u64 old_spte, u64 new_spte, int level,
391                                   bool shared)
392 {
393         bool was_present = is_shadow_present_pte(old_spte);
394         bool is_present = is_shadow_present_pte(new_spte);
395         bool was_leaf = was_present && is_last_spte(old_spte, level);
396         bool is_leaf = is_present && is_last_spte(new_spte, level);
397         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
398
399         WARN_ON(level > PT64_ROOT_MAX_LEVEL);
400         WARN_ON(level < PG_LEVEL_4K);
401         WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
402
403         /*
404          * If this warning were to trigger it would indicate that there was a
405          * missing MMU notifier or a race with some notifier handler.
406          * A present, leaf SPTE should never be directly replaced with another
 407          * present leaf SPTE pointing to a different PFN. A notifier handler
408          * should be zapping the SPTE before the main MM's page table is
409          * changed, or the SPTE should be zeroed, and the TLBs flushed by the
410          * thread before replacement.
411          */
412         if (was_leaf && is_leaf && pfn_changed) {
413                 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
414                        "SPTE with another present leaf SPTE mapping a\n"
415                        "different PFN!\n"
416                        "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
417                        as_id, gfn, old_spte, new_spte, level);
418
419                 /*
420                  * Crash the host to prevent error propagation and guest data
 421                  * corruption.
422                  */
423                 BUG();
424         }
425
426         if (old_spte == new_spte)
427                 return;
428
429         trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
430
431         /*
432          * The only times a SPTE should be changed from a non-present to
433          * non-present state is when an MMIO entry is installed/modified/
434          * removed. In that case, there is nothing to do here.
435          */
436         if (!was_present && !is_present) {
437                 /*
438                  * If this change does not involve a MMIO SPTE or removed SPTE,
439                  * it is unexpected. Log the change, though it should not
440                  * impact the guest since both the former and current SPTEs
441                  * are nonpresent.
442                  */
443                 if (WARN_ON(!is_mmio_spte(old_spte) &&
444                             !is_mmio_spte(new_spte) &&
445                             !is_removed_spte(new_spte)))
446                         pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
447                                "should not be replaced with another,\n"
448                                "different nonpresent SPTE, unless one or both\n"
449                                "are MMIO SPTEs, or the new SPTE is\n"
450                                "a temporary removed SPTE.\n"
451                                "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
452                                as_id, gfn, old_spte, new_spte, level);
453                 return;
454         }
455
456
457         if (was_leaf && is_dirty_spte(old_spte) &&
458             (!is_dirty_spte(new_spte) || pfn_changed))
459                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
460
461         /*
462          * Recursively handle child PTs if the change removed a subtree from
463          * the paging structure.
464          */
465         if (was_present && !was_leaf && (pfn_changed || !is_present))
466                 handle_removed_tdp_mmu_page(kvm,
467                                 spte_to_child_pt(old_spte, level), shared);
468 }
469
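/*
 * Wrapper around __handle_changed_spte() that also updates accessed-state
 * tracking and the dirty log for the change.
 */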
470 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
471                                 u64 old_spte, u64 new_spte, int level,
472                                 bool shared)
473 {
474         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
475                               shared);
476         handle_changed_spte_acc_track(old_spte, new_spte, level);
477         handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
478                                       new_spte, level);
479 }
480
481 /*
482  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
483  * associated bookkeeping
484  *
485  * @kvm: kvm instance
486  * @iter: a tdp_iter instance currently on the SPTE that should be set
487  * @new_spte: The value the SPTE should be set to
488  * Returns: true if the SPTE was set, false if it was not. If false is returned,
489  *          this function will have no side-effects.
490  */
491 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
492                                            struct tdp_iter *iter,
493                                            u64 new_spte)
494 {
495         u64 *root_pt = tdp_iter_root_pt(iter);
496         struct kvm_mmu_page *root = sptep_to_sp(root_pt);
497         int as_id = kvm_mmu_page_as_id(root);
498
499         lockdep_assert_held_read(&kvm->mmu_lock);
500
501         /*
502          * Do not change removed SPTEs. Only the thread that froze the SPTE
503          * may modify it.
504          */
505         if (iter->old_spte == REMOVED_SPTE)
506                 return false;
507
508         if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
509                       new_spte) != iter->old_spte)
510                 return false;
511
512         handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
513                             iter->level, true);
514
515         return true;
516 }
517
518 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
519                                            struct tdp_iter *iter)
520 {
521         /*
522          * Freeze the SPTE by setting it to a special,
523          * non-present value. This will stop other threads from
524          * immediately installing a present entry in its place
525          * before the TLBs are flushed.
526          */
527         if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
528                 return false;
529
530         kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
531                                            KVM_PAGES_PER_HPAGE(iter->level));
532
533         /*
534          * No other thread can overwrite the removed SPTE as they
535          * must either wait on the MMU lock or use
 536          * tdp_mmu_set_spte_atomic which will not overwrite the
537          * special removed SPTE value. No bookkeeping is needed
538          * here since the SPTE is going from non-present
539          * to non-present.
540          */
 541         WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
542
543         return true;
544 }
545
546
547 /*
548  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
549  * @kvm: kvm instance
550  * @iter: a tdp_iter instance currently on the SPTE that should be set
551  * @new_spte: The value the SPTE should be set to
552  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
553  *                    of the page. Should be set unless handling an MMU
554  *                    notifier for access tracking. Leaving record_acc_track
555  *                    unset in that case prevents page accesses from being
556  *                    double counted.
557  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
558  *                    appropriate for the change being made. Should be set
559  *                    unless performing certain dirty logging operations.
560  *                    Leaving record_dirty_log unset in that case prevents page
561  *                    writes from being double counted.
562  */
563 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
564                                       u64 new_spte, bool record_acc_track,
565                                       bool record_dirty_log)
566 {
567         tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
568         struct kvm_mmu_page *root = sptep_to_sp(root_pt);
569         int as_id = kvm_mmu_page_as_id(root);
570
571         lockdep_assert_held_write(&kvm->mmu_lock);
572
573         /*
574          * No thread should be using this function to set SPTEs to the
575          * temporary removed SPTE value.
576          * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
577          * should be used. If operating under the MMU lock in write mode, the
578          * use of the removed SPTE should not be necessary.
579          */
580         WARN_ON(iter->old_spte == REMOVED_SPTE);
581
582         WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
583
584         __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
585                               iter->level, false);
586         if (record_acc_track)
587                 handle_changed_spte_acc_track(iter->old_spte, new_spte,
588                                               iter->level);
589         if (record_dirty_log)
590                 handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
591                                               iter->old_spte, new_spte,
592                                               iter->level);
593 }
594
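/*
 * Wrappers around __tdp_mmu_set_spte() that record both access tracking and
 * dirty logging, skip only access tracking, or skip only dirty logging.
 */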
595 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
596                                     u64 new_spte)
597 {
598         __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
599 }
600
601 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
602                                                  struct tdp_iter *iter,
603                                                  u64 new_spte)
604 {
605         __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
606 }
607
608 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
609                                                  struct tdp_iter *iter,
610                                                  u64 new_spte)
611 {
612         __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
613 }
614
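/*
 * Iterators over the SPTEs mapping GFNs [_start, _end): all SPTEs under a
 * root, only present leaf SPTEs under a root, or all SPTEs under the vCPU's
 * current root.
 */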
615 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
616         for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
617
618 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)  \
619         tdp_root_for_each_pte(_iter, _root, _start, _end)               \
620                 if (!is_shadow_present_pte(_iter.old_spte) ||           \
621                     !is_last_spte(_iter.old_spte, _iter.level))         \
622                         continue;                                       \
623                 else
624
625 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
626         for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
627                          _mmu->shadow_root_level, _start, _end)
628
629 /*
630  * Yield if the MMU lock is contended or this thread needs to return control
631  * to the scheduler.
632  *
633  * If this function should yield and flush is set, it will perform a remote
634  * TLB flush before yielding.
635  *
636  * If this function yields, it will also reset the tdp_iter's walk over the
637  * paging structure and the calling function should skip to the next
638  * iteration to allow the iterator to continue its traversal from the
639  * paging structure root.
640  *
641  * Return true if this function yielded and the iterator's traversal was reset.
642  * Return false if a yield was not needed.
643  */
644 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
645                                              struct tdp_iter *iter, bool flush)
646 {
647         /* Ensure forward progress has been made before yielding. */
648         if (iter->next_last_level_gfn == iter->yielded_gfn)
649                 return false;
650
651         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
652                 rcu_read_unlock();
653
654                 if (flush)
655                         kvm_flush_remote_tlbs(kvm);
656
657                 cond_resched_rwlock_write(&kvm->mmu_lock);
658                 rcu_read_lock();
659
660                 WARN_ON(iter->gfn > iter->next_last_level_gfn);
661
662                 tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
663                                iter->root_level, iter->min_level,
664                                iter->next_last_level_gfn);
665
666                 return true;
667         }
668
669         return false;
670 }
671
672 /*
673  * Tears down the mappings for the range of gfns, [start, end), and frees the
674  * non-root pages mapping GFNs strictly within that range. Returns true if
675  * SPTEs have been cleared and a TLB flush is needed before releasing the
676  * MMU lock.
677  * If can_yield is true, will release the MMU lock and reschedule if the
678  * scheduler needs the CPU or there is contention on the MMU lock. If this
679  * function cannot yield, it will not release the MMU lock or reschedule and
680  * the caller must ensure it does not supply too large a GFN range, or the
681  * operation can cause a soft lockup.
682  */
683 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
684                           gfn_t start, gfn_t end, bool can_yield)
685 {
686         struct tdp_iter iter;
687         bool flush_needed = false;
688
689         rcu_read_lock();
690
691         tdp_root_for_each_pte(iter, root, start, end) {
692                 if (can_yield &&
693                     tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
694                         flush_needed = false;
695                         continue;
696                 }
697
698                 if (!is_shadow_present_pte(iter.old_spte))
699                         continue;
700
701                 /*
702                  * If this is a non-last-level SPTE that covers a larger range
703                  * than should be zapped, continue, and zap the mappings at a
704                  * lower level.
705                  */
706                 if ((iter.gfn < start ||
707                      iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
708                     !is_last_spte(iter.old_spte, iter.level))
709                         continue;
710
711                 tdp_mmu_set_spte(kvm, &iter, 0);
712                 flush_needed = true;
713         }
714
715         rcu_read_unlock();
716         return flush_needed;
717 }
718
719 /*
720  * Tears down the mappings for the range of gfns, [start, end), and frees the
721  * non-root pages mapping GFNs strictly within that range. Returns true if
722  * SPTEs have been cleared and a TLB flush is needed before releasing the
723  * MMU lock.
724  */
725 bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
726 {
727         struct kvm_mmu_page *root;
728         bool flush = false;
729
730         for_each_tdp_mmu_root_yield_safe(kvm, root)
731                 flush |= zap_gfn_range(kvm, root, start, end, true);
732
733         return flush;
734 }
735
736 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
737 {
738         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
739         bool flush;
740
741         flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
742         if (flush)
743                 kvm_flush_remote_tlbs(kvm);
744 }
745
746 /*
747  * Installs a last-level SPTE to handle a TDP page fault.
748  * (NPT/EPT violation/misconfiguration)
749  */
750 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
751                                           int map_writable,
752                                           struct tdp_iter *iter,
753                                           kvm_pfn_t pfn, bool prefault)
754 {
755         u64 new_spte;
756         int ret = 0;
757         int make_spte_ret = 0;
758
759         if (unlikely(is_noslot_pfn(pfn)))
760                 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
761         else
762                 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
763                                          pfn, iter->old_spte, prefault, true,
764                                          map_writable, !shadow_accessed_mask,
765                                          &new_spte);
766
767         if (new_spte == iter->old_spte)
768                 ret = RET_PF_SPURIOUS;
769         else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
770                 return RET_PF_RETRY;
771
772         /*
773          * If the page fault was caused by a write but the page is write
774          * protected, emulation is needed. If the emulation was skipped,
775          * the vCPU would have the same fault again.
776          */
777         if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
778                 if (write)
779                         ret = RET_PF_EMULATE;
780                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
781         }
782
783         /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
784         if (unlikely(is_mmio_spte(new_spte))) {
785                 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
786                                      new_spte);
787                 ret = RET_PF_EMULATE;
788         } else
789                 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
790                                        rcu_dereference(iter->sptep));
791
794         if (!prefault)
795                 vcpu->stat.pf_fixed++;
796
797         return ret;
798 }
799
800 /*
801  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
802  * page tables and SPTEs to translate the faulting guest physical address.
803  */
804 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
805                     int map_writable, int max_level, kvm_pfn_t pfn,
806                     bool prefault)
807 {
808         bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
809         bool write = error_code & PFERR_WRITE_MASK;
810         bool exec = error_code & PFERR_FETCH_MASK;
811         bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
812         struct kvm_mmu *mmu = vcpu->arch.mmu;
813         struct tdp_iter iter;
814         struct kvm_mmu_page *sp;
815         u64 *child_pt;
816         u64 new_spte;
817         int ret;
818         gfn_t gfn = gpa >> PAGE_SHIFT;
819         int level;
820         int req_level;
821
822         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
823                 return RET_PF_RETRY;
824         if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
825                 return RET_PF_RETRY;
826
827         level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
828                                         huge_page_disallowed, &req_level);
829
830         trace_kvm_mmu_spte_requested(gpa, level, pfn);
831
832         rcu_read_lock();
833
834         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
835                 if (nx_huge_page_workaround_enabled)
836                         disallowed_hugepage_adjust(iter.old_spte, gfn,
837                                                    iter.level, &pfn, &level);
838
839                 if (iter.level == level)
840                         break;
841
842                 /*
843                  * If there is an SPTE mapping a large page at a higher level
844                  * than the target, that SPTE must be cleared and replaced
845                  * with a non-leaf SPTE.
846                  */
847                 if (is_shadow_present_pte(iter.old_spte) &&
848                     is_large_pte(iter.old_spte)) {
849                         if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
850                                 break;
851
852                         /*
853                          * The iter must explicitly re-read the spte here
854                          * because the new value informs the !present
855                          * path below.
856                          */
857                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
858                 }
859
860                 if (!is_shadow_present_pte(iter.old_spte)) {
861                         sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
862                         child_pt = sp->spt;
863
864                         new_spte = make_nonleaf_spte(child_pt,
865                                                      !shadow_accessed_mask);
866
867                         if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
868                                                     new_spte)) {
869                                 tdp_mmu_link_page(vcpu->kvm, sp, true,
870                                                   huge_page_disallowed &&
871                                                   req_level >= iter.level);
872
873                                 trace_kvm_mmu_get_page(sp, true);
874                         } else {
875                                 tdp_mmu_free_sp(sp);
876                                 break;
877                         }
878                 }
879         }
880
881         if (iter.level != level) {
882                 rcu_read_unlock();
883                 return RET_PF_RETRY;
884         }
885
886         ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
887                                               pfn, prefault);
888         rcu_read_unlock();
889
890         return ret;
891 }
892
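/*
 * For each TDP MMU root, call @handler on the GFN range of every memslot that
 * overlaps the HVA range [start, end). Used to implement the MMU notifier
 * handlers for the TDP MMU. Returns the OR of the handler return values.
 */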
893 static __always_inline int
894 kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
895                              unsigned long start,
896                              unsigned long end,
897                              unsigned long data,
898                              int (*handler)(struct kvm *kvm,
899                                             struct kvm_memory_slot *slot,
900                                             struct kvm_mmu_page *root,
901                                             gfn_t start,
902                                             gfn_t end,
903                                             unsigned long data))
904 {
905         struct kvm_memslots *slots;
906         struct kvm_memory_slot *memslot;
907         struct kvm_mmu_page *root;
908         int ret = 0;
909         int as_id;
910
911         for_each_tdp_mmu_root_yield_safe(kvm, root) {
912                 as_id = kvm_mmu_page_as_id(root);
913                 slots = __kvm_memslots(kvm, as_id);
914                 kvm_for_each_memslot(memslot, slots) {
915                         unsigned long hva_start, hva_end;
916                         gfn_t gfn_start, gfn_end;
917
918                         hva_start = max(start, memslot->userspace_addr);
919                         hva_end = min(end, memslot->userspace_addr +
920                                       (memslot->npages << PAGE_SHIFT));
921                         if (hva_start >= hva_end)
922                                 continue;
923                         /*
924                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
925                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
926                          */
927                         gfn_start = hva_to_gfn_memslot(hva_start, memslot);
928                         gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
929
930                         ret |= handler(kvm, memslot, root, gfn_start,
931                                        gfn_end, data);
932                 }
933         }
934
935         return ret;
936 }
937
938 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
939                                      struct kvm_memory_slot *slot,
940                                      struct kvm_mmu_page *root, gfn_t start,
941                                      gfn_t end, unsigned long unused)
942 {
943         return zap_gfn_range(kvm, root, start, end, false);
944 }
945
946 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
947                               unsigned long end)
948 {
949         return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
950                                             zap_gfn_range_hva_wrapper);
951 }
952
953 /*
 954  * Mark the SPTEs mapping the range of GFNs [start, end) unaccessed and return
 955  * non-zero if any of the GFNs in the range have been accessed.
956  */
957 static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
958                          struct kvm_mmu_page *root, gfn_t start, gfn_t end,
959                          unsigned long unused)
960 {
961         struct tdp_iter iter;
962         int young = 0;
963         u64 new_spte = 0;
964
965         rcu_read_lock();
966
967         tdp_root_for_each_leaf_pte(iter, root, start, end) {
968                 /*
969                  * If we have a non-accessed entry we don't need to change the
970                  * pte.
971                  */
972                 if (!is_accessed_spte(iter.old_spte))
973                         continue;
974
975                 new_spte = iter.old_spte;
976
977                 if (spte_ad_enabled(new_spte)) {
978                         clear_bit((ffs(shadow_accessed_mask) - 1),
979                                   (unsigned long *)&new_spte);
980                 } else {
981                         /*
982                          * Capture the dirty status of the page, so that it doesn't get
983                          * lost when the SPTE is marked for access tracking.
984                          */
985                         if (is_writable_pte(new_spte))
986                                 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
987
988                         new_spte = mark_spte_for_access_track(new_spte);
989                 }
990                 new_spte &= ~shadow_dirty_mask;
991
992                 tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
993                 young = 1;
994
995                 trace_kvm_age_page(iter.gfn, iter.level, slot, young);
996         }
997
998         rcu_read_unlock();
999
1000         return young;
1001 }
1002
1003 int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
1004                               unsigned long end)
1005 {
1006         return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
1007                                             age_gfn_range);
1008 }
1009
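/* Return 1 if a leaf SPTE mapping the GFN has been accessed, 0 otherwise. */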
1010 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
1011                         struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1012                         unsigned long unused2)
1013 {
1014         struct tdp_iter iter;
1015
1016         tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
1017                 if (is_accessed_spte(iter.old_spte))
1018                         return 1;
1019
1020         return 0;
1021 }
1022
1023 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
1024 {
1025         return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
1026                                             test_age_gfn);
1027 }
1028
1029 /*
1030  * Handle the changed_pte MMU notifier for the TDP MMU.
1031  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1032  * notifier.
 1033  * The function flushes the TLBs itself when needed and always returns 0.
1034  */
1035 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
1036                         struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1037                         unsigned long data)
1038 {
1039         struct tdp_iter iter;
1040         pte_t *ptep = (pte_t *)data;
1041         kvm_pfn_t new_pfn;
1042         u64 new_spte;
1043         int need_flush = 0;
1044
1045         rcu_read_lock();
1046
1047         WARN_ON(pte_huge(*ptep));
1048
1049         new_pfn = pte_pfn(*ptep);
1050
1051         tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
1052                 if (iter.level != PG_LEVEL_4K)
1053                         continue;
1054
1055                 if (!is_shadow_present_pte(iter.old_spte))
1056                         break;
1057
1058                 tdp_mmu_set_spte(kvm, &iter, 0);
1059
1060                 kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
1061
1062                 if (!pte_write(*ptep)) {
1063                         new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1064                                         iter.old_spte, new_pfn);
1065
1066                         tdp_mmu_set_spte(kvm, &iter, new_spte);
1067                 }
1068
1069                 need_flush = 1;
1070         }
1071
1072         if (need_flush)
1073                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1074
1075         rcu_read_unlock();
1076
1077         return 0;
1078 }
1079
1080 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
1081                              pte_t *host_ptep)
1082 {
1083         return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
1084                                             (unsigned long)host_ptep,
1085                                             set_tdp_spte);
1086 }
1087
1088 /*
 1089  * Remove write access from all the SPTEs mapping GFNs [start, end). Only
 1090  * affects leaf SPTEs at or above min_level.
1091  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1092  */
1093 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1094                              gfn_t start, gfn_t end, int min_level)
1095 {
1096         struct tdp_iter iter;
1097         u64 new_spte;
1098         bool spte_set = false;
1099
1100         rcu_read_lock();
1101
1102         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1103
1104         for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1105                                    min_level, start, end) {
1106                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1107                         continue;
1108
1109                 if (!is_shadow_present_pte(iter.old_spte) ||
1110                     !is_last_spte(iter.old_spte, iter.level) ||
1111                     !(iter.old_spte & PT_WRITABLE_MASK))
1112                         continue;
1113
1114                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1115
1116                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1117                 spte_set = true;
1118         }
1119
1120         rcu_read_unlock();
1121         return spte_set;
1122 }
1123
1124 /*
1125  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1126  * only affect leaf SPTEs down to min_level.
1127  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1128  */
1129 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1130                              int min_level)
1131 {
1132         struct kvm_mmu_page *root;
1133         int root_as_id;
1134         bool spte_set = false;
1135
1136         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1137                 root_as_id = kvm_mmu_page_as_id(root);
1138                 if (root_as_id != slot->as_id)
1139                         continue;
1140
1141                 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1142                              slot->base_gfn + slot->npages, min_level);
1143         }
1144
1145         return spte_set;
1146 }
1147
1148 /*
1149  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1150  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1151  * If AD bits are not enabled, this will require clearing the writable bit on
1152  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1153  * be flushed.
1154  */
1155 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1156                            gfn_t start, gfn_t end)
1157 {
1158         struct tdp_iter iter;
1159         u64 new_spte;
1160         bool spte_set = false;
1161
1162         rcu_read_lock();
1163
1164         tdp_root_for_each_leaf_pte(iter, root, start, end) {
1165                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1166                         continue;
1167
1168                 if (spte_ad_need_write_protect(iter.old_spte)) {
1169                         if (is_writable_pte(iter.old_spte))
1170                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1171                         else
1172                                 continue;
1173                 } else {
1174                         if (iter.old_spte & shadow_dirty_mask)
1175                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1176                         else
1177                                 continue;
1178                 }
1179
1180                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1181                 spte_set = true;
1182         }
1183
1184         rcu_read_unlock();
1185         return spte_set;
1186 }
1187
1188 /*
1189  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1190  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1191  * If AD bits are not enabled, this will require clearing the writable bit on
1192  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1193  * be flushed.
1194  */
1195 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1196 {
1197         struct kvm_mmu_page *root;
1198         int root_as_id;
1199         bool spte_set = false;
1200
1201         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1202                 root_as_id = kvm_mmu_page_as_id(root);
1203                 if (root_as_id != slot->as_id)
1204                         continue;
1205
1206                 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1207                                 slot->base_gfn + slot->npages);
1208         }
1209
1210         return spte_set;
1211 }
1212
1213 /*
1214  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1215  * set in mask, starting at gfn. The given memslot is expected to contain all
1216  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1217  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1218  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1219  */
1220 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1221                                   gfn_t gfn, unsigned long mask, bool wrprot)
1222 {
1223         struct tdp_iter iter;
1224         u64 new_spte;
1225
1226         rcu_read_lock();
1227
1228         tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1229                                     gfn + BITS_PER_LONG) {
1230                 if (!mask)
1231                         break;
1232
1233                 if (iter.level > PG_LEVEL_4K ||
1234                     !(mask & (1UL << (iter.gfn - gfn))))
1235                         continue;
1236
1237                 mask &= ~(1UL << (iter.gfn - gfn));
1238
1239                 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1240                         if (is_writable_pte(iter.old_spte))
1241                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1242                         else
1243                                 continue;
1244                 } else {
1245                         if (iter.old_spte & shadow_dirty_mask)
1246                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1247                         else
1248                                 continue;
1249                 }
1250
1251                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1252         }
1253
1254         rcu_read_unlock();
1255 }
1256
1257 /*
1258  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1259  * set in mask, starting at gfn. The given memslot is expected to contain all
1260  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1261  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1262  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1263  */
1264 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1265                                        struct kvm_memory_slot *slot,
1266                                        gfn_t gfn, unsigned long mask,
1267                                        bool wrprot)
1268 {
1269         struct kvm_mmu_page *root;
1270         int root_as_id;
1271
1272         lockdep_assert_held_write(&kvm->mmu_lock);
1273         for_each_tdp_mmu_root(kvm, root) {
1274                 root_as_id = kvm_mmu_page_as_id(root);
1275                 if (root_as_id != slot->as_id)
1276                         continue;
1277
1278                 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1279         }
1280 }
1281
1282 /*
1283  * Clear leaf entries which could be replaced by large mappings, for
1284  * GFNs within the slot.
1285  */
1286 static void zap_collapsible_spte_range(struct kvm *kvm,
1287                                        struct kvm_mmu_page *root,
1288                                        struct kvm_memory_slot *slot)
1289 {
1290         gfn_t start = slot->base_gfn;
1291         gfn_t end = start + slot->npages;
1292         struct tdp_iter iter;
1293         kvm_pfn_t pfn;
1294         bool spte_set = false;
1295
1296         rcu_read_lock();
1297
1298         tdp_root_for_each_pte(iter, root, start, end) {
1299                 if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
1300                         spte_set = false;
1301                         continue;
1302                 }
1303
1304                 if (!is_shadow_present_pte(iter.old_spte) ||
1305                     !is_last_spte(iter.old_spte, iter.level))
1306                         continue;
1307
1308                 pfn = spte_to_pfn(iter.old_spte);
1309                 if (kvm_is_reserved_pfn(pfn) ||
1310                     iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1311                                                             pfn, PG_LEVEL_NUM))
1312                         continue;
1313
1314                 tdp_mmu_set_spte(kvm, &iter, 0);
1315
1316                 spte_set = true;
1317         }
1318
1319         rcu_read_unlock();
1320         if (spte_set)
1321                 kvm_flush_remote_tlbs(kvm);
1322 }
1323
1324 /*
1325  * Clear non-leaf entries (and free associated page tables) which could
1326  * be replaced by large mappings, for GFNs within the slot.
1327  */
1328 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1329                                        struct kvm_memory_slot *slot)
1330 {
1331         struct kvm_mmu_page *root;
1332         int root_as_id;
1333
1334         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1335                 root_as_id = kvm_mmu_page_as_id(root);
1336                 if (root_as_id != slot->as_id)
1337                         continue;
1338
1339                 zap_collapsible_spte_range(kvm, root, slot);
1340         }
1341 }
1342
1343 /*
1344  * Removes write access on the last level SPTE mapping this GFN and unsets the
 1345  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1346  * Returns true if an SPTE was set and a TLB flush is needed.
1347  */
1348 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1349                               gfn_t gfn)
1350 {
1351         struct tdp_iter iter;
1352         u64 new_spte;
1353         bool spte_set = false;
1354
1355         rcu_read_lock();
1356
1357         tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1358                 if (!is_writable_pte(iter.old_spte))
1359                         break;
1360
1361                 new_spte = iter.old_spte &
1362                         ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
1363
1364                 tdp_mmu_set_spte(kvm, &iter, new_spte);
1365                 spte_set = true;
1366         }
1367
1368         rcu_read_unlock();
1369
1370         return spte_set;
1371 }
1372
1373 /*
1374  * Removes write access on the last level SPTE mapping this GFN and unsets the
 1375  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1376  * Returns true if an SPTE was set and a TLB flush is needed.
1377  */
1378 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1379                                    struct kvm_memory_slot *slot, gfn_t gfn)
1380 {
1381         struct kvm_mmu_page *root;
1382         int root_as_id;
1383         bool spte_set = false;
1384
1385         lockdep_assert_held_write(&kvm->mmu_lock);
1386         for_each_tdp_mmu_root(kvm, root) {
1387                 root_as_id = kvm_mmu_page_as_id(root);
1388                 if (root_as_id != slot->as_id)
1389                         continue;
1390
1391                 spte_set |= write_protect_gfn(kvm, root, gfn);
1392         }
1393         return spte_set;
1394 }
1395
1396 /*
1397  * Return the level of the lowest level SPTE added to sptes.
1398  * That SPTE may be non-present.
1399  */
1400 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1401                          int *root_level)
1402 {
1403         struct tdp_iter iter;
1404         struct kvm_mmu *mmu = vcpu->arch.mmu;
1405         gfn_t gfn = addr >> PAGE_SHIFT;
1406         int leaf = -1;
1407
1408         *root_level = vcpu->arch.mmu->shadow_root_level;
1409
1410         rcu_read_lock();
1411
1412         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1413                 leaf = iter.level;
1414                 sptes[leaf] = iter.old_spte;
1415         }
1416
1417         rcu_read_unlock();
1418
1419         return leaf;
1420 }