// SPDX-License-Identifier: GPL-2.0
#include "mmu_internal.h"
#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
/* This should not be changed for the lifetime of the VM. */
kvm->arch.tdp_mmu_enabled = true;
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
if (!kvm->arch.tdp_mmu_enabled)
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
* Ensure that all the outstanding RCU callbacks to free shadow pages
* can run before the VM is torn down.
static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
if (kvm_mmu_put_root(kvm, root))
kvm_tdp_mmu_free_root(kvm, root);
static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
struct kvm_mmu_page *root)
lockdep_assert_held_write(&kvm->mmu_lock);
if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
kvm_mmu_get_root(kvm, root);
static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
struct kvm_mmu_page *root)
struct kvm_mmu_page *next_root;
next_root = list_next_entry(root, link);
tdp_mmu_put_root(kvm, root);
* Note: this iterator gets and puts references to the roots it iterates over.
* This makes it safe to release the MMU lock and yield within the loop, but
* if exiting the loop early, the caller must drop the reference to the most
* recent root. (Unless keeping a live reference is desirable.)
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \
typeof(*_root), link); \
tdp_mmu_next_root_valid(_kvm, _root); \
_root = tdp_mmu_next_root(_kvm, _root))
#define for_each_tdp_mmu_root(_kvm, _root) \
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
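/*
 * Illustrative usage (a sketch of the pattern used by
 * kvm_tdp_mmu_zap_gfn_range() below): the yield-safe iterator lets the loop
 * body drop the MMU lock, and drops the reference to each root as it
 * advances:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root)
 *		flush |= zap_gfn_range(kvm, root, start, end, true);
 */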
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, bool can_yield);
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
lockdep_assert_held_write(&kvm->mmu_lock);
WARN_ON(root->root_count);
WARN_ON(!root->tdp_mmu_page);
list_del(&root->link);
zap_gfn_range(kvm, root, 0, max_gfn, false);
free_page((unsigned long)root->spt);
kmem_cache_free(mmu_page_header_cache, root);
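/*
 * Compute the page role used for TDP MMU pages at the given level. All TDP
 * MMU pages use 8-byte PTEs and grant full access.
 */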
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
union kvm_mmu_page_role role;
role = vcpu->arch.mmu->mmu_role.base;
role.gpte_is_8_bytes = true;
role.access = ACC_ALL;
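/*
 * Allocate a TDP MMU page table page and its struct kvm_mmu_page from the
 * vCPU's memory caches and initialize its role.
 */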
static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
struct kvm_mmu_page *sp;
sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
sp->role.word = page_role_for_level(vcpu, level).word;
sp->tdp_mmu_page = true;
trace_kvm_mmu_get_page(sp, true);
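/*
 * Get a reference to the vCPU's TDP MMU root. Reuse an existing root if one
 * with a matching role is already present, otherwise allocate a new one and
 * add it to the list of TDP MMU roots for the VM.
 */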
static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
union kvm_mmu_page_role role;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
write_lock(&kvm->mmu_lock);
/* Check for an existing root before allocating a new one. */
for_each_tdp_mmu_root(kvm, root) {
if (root->role.word == role.word) {
kvm_mmu_get_root(kvm, root);
write_unlock(&kvm->mmu_lock);
root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
root->root_count = 1;
list_add(&root->link, &kvm->arch.tdp_mmu_roots);
write_unlock(&kvm->mmu_lock);
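/*
 * Get (or create) the vCPU's TDP MMU root and return the physical address of
 * its top-level page table, suitable for use as the TDP root pointer.
 */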
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *root;
root = get_tdp_mmu_vcpu_root(vcpu);
return __pa(root->spt);
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
free_page((unsigned long)sp->spt);
kmem_cache_free(mmu_page_header_cache, sp);
* This is called through call_rcu in order to free TDP page table memory
* safely with respect to other kernel threads that may be operating on
* the memory.
* By only accessing TDP MMU page table memory in an RCU read critical
* section, and freeing it after a grace period, lockless access to that
* memory won't use it after it is freed.
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
return sp->role.smm ? 1 : 0;
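/*
 * Propagate the accessed state of a leaf SPTE to the underlying struct page
 * when the SPTE loses its accessed bit or changes PFN.
 */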
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
if (is_accessed_spte(old_spte) &&
(!is_accessed_spte(new_spte) || pfn_changed))
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
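/*
 * Mark the GFN dirty in the memslot's dirty bitmap when a 4K SPTE becomes
 * writable, so that dirty logging accounts for writes made through the new
 * mapping, which will not fault.
 */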
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level)
struct kvm_memory_slot *slot;
if (level > PG_LEVEL_4K)
pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
if ((!is_writable_pte(old_spte) || pfn_changed) &&
is_writable_pte(new_spte)) {
slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
mark_page_dirty_in_slot(kvm, slot, gfn);
* tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
* @shared: This operation may not be running under the exclusive use of
* the MMU lock and the operation must synchronize with other
* threads that might be adding or removing pages.
* @account_nx: This page replaces a NX large page and should be marked for
* eventual reclaim.
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
bool shared, bool account_nx)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
lockdep_assert_held_write(&kvm->mmu_lock);
list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
account_huge_nx_page(kvm, sp);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
* tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
* @sp: the page to be removed
* @shared: This operation may not be running under the exclusive use of
* the MMU lock and the operation must synchronize with other
* threads that might be adding or removing pages.
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
lockdep_assert_held_write(&kvm->mmu_lock);
if (sp->lpage_disallowed)
unaccount_huge_nx_page(kvm, sp);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
* handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
* @pt: the page removed from the paging structure
* @shared: This operation may not be running under the exclusive use
* of the MMU lock and the operation must synchronize with other
* threads that might be modifying SPTEs.
* Given a page table that has been removed from the TDP paging structure,
* iterates through the page table to clear SPTEs and free child page tables.
static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
struct kvm_mmu_page *sp = sptep_to_sp(pt);
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
trace_kvm_mmu_prepare_zap_page(sp);
tdp_mmu_unlink_page(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
* Set the SPTE to a nonpresent value that other
* threads will not overwrite. If the SPTE was
* already marked as removed then another thread
* handling a page fault could overwrite it, so
* keep setting the SPTE until the value read back
* is something other than the removed SPTE value.
old_child_spte = xchg(sptep, REMOVED_SPTE);
if (!is_removed_spte(old_child_spte))
old_child_spte = READ_ONCE(*sptep);
* Marking the SPTE as a removed SPTE is not
* strictly necessary here as the MMU lock will
* stop other threads from concurrently modifying
* this SPTE. Using the removed SPTE value keeps
* the two branches consistent and simplifies
* the function.
WRITE_ONCE(*sptep, REMOVED_SPTE);
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
old_child_spte, REMOVED_SPTE, level - 1,
kvm_flush_remote_tlbs_with_address(kvm, gfn,
KVM_PAGES_PER_HPAGE(level));
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
* handle_changed_spte - handle bookkeeping associated with an SPTE change
* @as_id: the address space of the paging structure the SPTE was a part of
* @gfn: the base GFN that was mapped by the SPTE
* @old_spte: The value of the SPTE before the change
* @new_spte: The value of the SPTE after the change
* @level: the level of the PT the SPTE is part of in the paging structure
* @shared: This operation may not be running under the exclusive use of
* the MMU lock and the operation must synchronize with other
* threads that might be modifying SPTEs.
* Handle bookkeeping that might result from the modification of a SPTE.
* This function must be called for all TDP SPTE modifications.
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
bool was_leaf = was_present && is_last_spte(old_spte, level);
bool is_leaf = is_present && is_last_spte(new_spte, level);
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
WARN_ON(level > PT64_ROOT_MAX_LEVEL);
WARN_ON(level < PG_LEVEL_4K);
WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
* If this warning were to trigger it would indicate that there was a
* missing MMU notifier or a race with some notifier handler.
* A present, leaf SPTE should never be directly replaced with another
* present leaf SPTE pointing to a different PFN. A notifier handler
* should be zapping the SPTE before the main MM's page table is
* changed, or the SPTE should be zeroed, and the TLBs flushed by the
* thread before replacement.
if (was_leaf && is_leaf && pfn_changed) {
pr_err("Invalid SPTE change: cannot replace a present leaf\n"
"SPTE with another present leaf SPTE mapping a\n"
"different PFN!\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
* Crash the host to prevent error propagation and guest data
* corruption.
if (old_spte == new_spte)
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
* removed. In that case, there is nothing to do here.
if (!was_present && !is_present) {
* If this change does not involve an MMIO SPTE or removed SPTE,
* it is unexpected. Log the change, though it should not
* impact the guest since both the former and current SPTEs
* are nonpresent.
if (WARN_ON(!is_mmio_spte(old_spte) &&
!is_mmio_spte(new_spte) &&
!is_removed_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
"different nonpresent SPTE, unless one or both\n"
"are MMIO SPTEs, or the new SPTE is\n"
"a temporary removed SPTE.\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
if (was_leaf && is_dirty_spte(old_spte) &&
(!is_dirty_spte(new_spte) || pfn_changed))
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
* Recursively handle child PTs if the change removed a subtree from
* the paging structure.
if (was_present && !was_leaf && (pfn_changed || !is_present))
handle_removed_tdp_mmu_page(kvm,
spte_to_child_pt(old_spte, level), shared);
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
handle_changed_spte_acc_track(old_spte, new_spte, level);
handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
* tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
* associated bookkeeping
* @iter: a tdp_iter instance currently on the SPTE that should be set
* @new_spte: The value the SPTE should be set to
* Returns: true if the SPTE was set, false if it was not. If false is returned,
* this function will have no side-effects.
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter,
u64 *root_pt = tdp_iter_root_pt(iter);
struct kvm_mmu_page *root = sptep_to_sp(root_pt);
int as_id = kvm_mmu_page_as_id(root);
lockdep_assert_held_read(&kvm->mmu_lock);
* Do not change removed SPTEs. Only the thread that froze the SPTE
* may modify it.
if (iter->old_spte == REMOVED_SPTE)
if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
new_spte) != iter->old_spte)
handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
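/*
 * Note: callers of tdp_mmu_set_spte_atomic() run with the MMU lock held for
 * read and must be prepared for the cmpxchg to fail, typically by retrying
 * or abandoning the current walk; see kvm_tdp_mmu_map() below for an example
 * of handling the failure case.
 */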
static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter)
* Freeze the SPTE by setting it to a special,
* non-present value. This will stop other threads from
* immediately installing a present entry in its place
* before the TLBs are flushed.
if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
KVM_PAGES_PER_HPAGE(iter->level));
* No other thread can overwrite the removed SPTE as they
* must either wait on the MMU lock or use
* tdp_mmu_set_spte_atomic which will not overwrite the
* special removed SPTE value. No bookkeeping is needed
* here since the SPTE is going from non-present
* to non-present.
WRITE_ONCE(*iter->sptep, 0);
* __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
* @iter: a tdp_iter instance currently on the SPTE that should be set
* @new_spte: The value the SPTE should be set to
* @record_acc_track: Notify the MM subsystem of changes to the accessed state
* of the page. Should be set unless handling an MMU
* notifier for access tracking. Leaving record_acc_track
* unset in that case prevents page accesses from being
* double counted.
* @record_dirty_log: Record the page as dirty in the dirty bitmap if
* appropriate for the change being made. Should be set
* unless performing certain dirty logging operations.
* Leaving record_dirty_log unset in that case prevents page
* writes from being double counted.
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte, bool record_acc_track,
bool record_dirty_log)
tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
struct kvm_mmu_page *root = sptep_to_sp(root_pt);
int as_id = kvm_mmu_page_as_id(root);
lockdep_assert_held_write(&kvm->mmu_lock);
* No thread should be using this function to set SPTEs to the
* temporary removed SPTE value.
* If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
* should be used. If operating under the MMU lock in write mode, the
* use of the removed SPTE should not be necessary.
WARN_ON(iter->old_spte == REMOVED_SPTE);
WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
if (record_acc_track)
handle_changed_spte_acc_track(iter->old_spte, new_spte,
if (record_dirty_log)
handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
iter->old_spte, new_spte,
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
struct tdp_iter *iter,
__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
struct tdp_iter *iter,
__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
tdp_root_for_each_pte(_iter, _root, _start, _end) \
if (!is_shadow_present_pte(_iter.old_spte) || \
!is_last_spte(_iter.old_spte, _iter.level)) \
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \
_mmu->shadow_root_level, _start, _end)
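/*
 * tdp_root_for_each_pte() walks the SPTEs of an explicit root,
 * tdp_root_for_each_leaf_pte() additionally skips non-present and non-leaf
 * entries, and tdp_mmu_for_each_pte() walks from the vCPU's current root
 * (mmu->root_hpa).
 */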
* Yield if the MMU lock is contended or this thread needs to return control
* to the scheduler.
* If this function should yield and flush is set, it will perform a remote
* TLB flush before yielding.
* If this function yields, it will also reset the tdp_iter's walk over the
* paging structure and the calling function should skip to the next
* iteration to allow the iterator to continue its traversal from the
* paging structure root.
* Return true if this function yielded and the iterator's traversal was reset.
* Return false if a yield was not needed.
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
struct tdp_iter *iter, bool flush)
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_flush_remote_tlbs(kvm);
cond_resched_rwlock_write(&kvm->mmu_lock);
WARN_ON(iter->gfn > iter->next_last_level_gfn);
tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
iter->root_level, iter->min_level,
iter->next_last_level_gfn);
* Tears down the mappings for the range of gfns, [start, end), and frees the
* non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
* If can_yield is true, will release the MMU lock and reschedule if the
* scheduler needs the CPU or there is contention on the MMU lock. If this
* function cannot yield, it will not release the MMU lock or reschedule and
* the caller must ensure it does not supply too large a GFN range, or the
* operation can cause a soft lockup.
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, bool can_yield)
struct tdp_iter iter;
bool flush_needed = false;
tdp_root_for_each_pte(iter, root, start, end) {
tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
flush_needed = false;
if (!is_shadow_present_pte(iter.old_spte))
* If this is a non-last-level SPTE that covers a larger range
* than should be zapped, continue, and zap the mappings at a
* lower level.
if ((iter.gfn < start ||
iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
!is_last_spte(iter.old_spte, iter.level))
tdp_mmu_set_spte(kvm, &iter, 0);
* Tears down the mappings for the range of gfns, [start, end), and frees the
* non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
struct kvm_mmu_page *root;
for_each_tdp_mmu_root_yield_safe(kvm, root)
flush |= zap_gfn_range(kvm, root, start, end, true);
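/* Zap all SPTEs in every TDP MMU root and flush the TLBs if needed. */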
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
kvm_flush_remote_tlbs(kvm);
* Installs a last-level SPTE to handle a TDP page fault.
* (NPT/EPT violation/misconfiguration)
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
struct tdp_iter *iter,
kvm_pfn_t pfn, bool prefault)
int make_spte_ret = 0;
if (unlikely(is_noslot_pfn(pfn)))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
pfn, iter->old_spte, prefault, true,
map_writable, !shadow_accessed_mask,
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
* If the page fault was caused by a write but the page is write
* protected, emulation is needed. If the emulation was skipped,
* the vCPU would have the same fault again.
if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
ret = RET_PF_EMULATE;
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
if (unlikely(is_mmio_spte(new_spte))) {
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
ret = RET_PF_EMULATE;
trace_kvm_mmu_set_spte(iter->level, iter->gfn,
rcu_dereference(iter->sptep));
trace_kvm_mmu_set_spte(iter->level, iter->gfn,
rcu_dereference(iter->sptep));
vcpu->stat.pf_fixed++;
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
* page tables and SPTEs to translate the faulting guest physical address.
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
int map_writable, int max_level, kvm_pfn_t pfn,
bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
bool write = error_code & PFERR_WRITE_MASK;
bool exec = error_code & PFERR_FETCH_MASK;
bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu *mmu = vcpu->arch.mmu;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
gfn_t gfn = gpa >> PAGE_SHIFT;
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
huge_page_disallowed, &req_level);
trace_kvm_mmu_spte_requested(gpa, level, pfn);
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
if (nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(iter.old_spte, gfn,
iter.level, &pfn, &level);
if (iter.level == level)
* If there is an SPTE mapping a large page at a higher level
* than the target, that SPTE must be cleared and replaced
* with a non-leaf SPTE.
if (is_shadow_present_pte(iter.old_spte) &&
is_large_pte(iter.old_spte)) {
if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
* The iter must explicitly re-read the spte here
* because the new value informs the !present
* path below.
iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
if (!is_shadow_present_pte(iter.old_spte)) {
sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
new_spte = make_nonleaf_spte(child_pt,
!shadow_accessed_mask);
if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
tdp_mmu_link_page(vcpu->kvm, sp, true,
huge_page_disallowed &&
req_level >= iter.level);
trace_kvm_mmu_get_page(sp, true);
if (iter.level != level) {
ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
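/*
 * Iterate over every TDP MMU root and every memslot that overlaps the given
 * HVA range, and invoke the handler on the corresponding GFN range in each
 * root. This is the common implementation behind the TDP MMU's MMU notifier
 * callbacks.
 */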
static __always_inline int
kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
int (*handler)(struct kvm *kvm,
struct kvm_memory_slot *slot,
struct kvm_mmu_page *root,
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
struct kvm_mmu_page *root;
for_each_tdp_mmu_root_yield_safe(kvm, root) {
as_id = kvm_mmu_page_as_id(root);
slots = __kvm_memslots(kvm, as_id);
kvm_for_each_memslot(memslot, slots) {
unsigned long hva_start, hva_end;
gfn_t gfn_start, gfn_end;
hva_start = max(start, memslot->userspace_addr);
hva_end = min(end, memslot->userspace_addr +
(memslot->npages << PAGE_SHIFT));
if (hva_start >= hva_end)
* {gfn(page) | page intersects with [hva_start, hva_end)} =
* {gfn_start, gfn_start+1, ..., gfn_end-1}.
gfn_start = hva_to_gfn_memslot(hva_start, memslot);
gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
ret |= handler(kvm, memslot, root, gfn_start,
static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
struct kvm_memory_slot *slot,
struct kvm_mmu_page *root, gfn_t start,
gfn_t end, unsigned long unused)
return zap_gfn_range(kvm, root, start, end, false);
int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
zap_gfn_range_hva_wrapper);
* Mark the SPTEs mapping the range of GFNs [start, end) unaccessed and return
* non-zero if any of the GFNs in the range have been accessed.
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
struct kvm_mmu_page *root, gfn_t start, gfn_t end,
unsigned long unused)
struct tdp_iter iter;
tdp_root_for_each_leaf_pte(iter, root, start, end) {
* If we have a non-accessed entry we don't need to change the
* pte.
if (!is_accessed_spte(iter.old_spte))
new_spte = iter.old_spte;
if (spte_ad_enabled(new_spte)) {
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)&new_spte);
* Capture the dirty status of the page, so that it doesn't get
* lost when the SPTE is marked for access tracking.
if (is_writable_pte(new_spte))
kvm_set_pfn_dirty(spte_to_pfn(new_spte));
new_spte = mark_spte_for_access_track(new_spte);
new_spte &= ~shadow_dirty_mask;
tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
trace_kvm_age_page(iter.gfn, iter.level, slot, young);
int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
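/*
 * Check whether any SPTE mapping the GFN is marked accessed, without
 * clearing the accessed state.
 */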
static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
unsigned long unused2)
struct tdp_iter iter;
tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
if (is_accessed_spte(iter.old_spte))
int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
* Handle the changed_pte MMU notifier for the TDP MMU.
* data is a pointer to the new pte_t mapping the HVA specified by the MMU
* notifier.
* Returns non-zero if a flush is needed before releasing the MMU lock.
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
struct tdp_iter iter;
pte_t *ptep = (pte_t *)data;
WARN_ON(pte_huge(*ptep));
new_pfn = pte_pfn(*ptep);
tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
if (iter.level != PG_LEVEL_4K)
if (!is_shadow_present_pte(iter.old_spte))
tdp_mmu_set_spte(kvm, &iter, 0);
kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
if (!pte_write(*ptep)) {
new_spte = kvm_mmu_changed_pte_notifier_make_spte(
iter.old_spte, new_pfn);
tdp_mmu_set_spte(kvm, &iter, new_spte);
kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
(unsigned long)host_ptep,
* Remove write access from all the SPTEs mapping GFNs [start, end). Only
* leaf SPTEs at or above min_level are write-protected.
* Returns true if an SPTE has been changed and the TLBs need to be flushed.
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, int min_level)
struct tdp_iter iter;
bool spte_set = false;
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
min_level, start, end) {
if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level) ||
!(iter.old_spte & PT_WRITABLE_MASK))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
* Remove write access from all the SPTEs mapping GFNs in the memslot. Will
* only affect leaf SPTEs down to min_level.
* Returns true if an SPTE has been changed and the TLBs need to be flushed.
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
struct kvm_mmu_page *root;
bool spte_set = false;
for_each_tdp_mmu_root_yield_safe(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level);
* Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
* AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
* If AD bits are not enabled, this will require clearing the writable bit on
* each SPTE. Returns true if an SPTE has been changed and the TLBs need to
* be flushed.
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
struct tdp_iter iter;
bool spte_set = false;
tdp_root_for_each_leaf_pte(iter, root, start, end) {
if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
if (spte_ad_need_write_protect(iter.old_spte)) {
if (is_writable_pte(iter.old_spte))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
if (iter.old_spte & shadow_dirty_mask)
new_spte = iter.old_spte & ~shadow_dirty_mask;
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
* Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
* AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
* If AD bits are not enabled, this will require clearing the writable bit on
* each SPTE. Returns true if an SPTE has been changed and the TLBs need to
* be flushed.
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
struct kvm_mmu_page *root;
bool spte_set = false;
for_each_tdp_mmu_root_yield_safe(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
* Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
* set in mask, starting at gfn. The given memslot is expected to contain all
* the GFNs represented by set bits in the mask. If AD bits are enabled,
* clearing the dirty status will involve clearing the dirty bit on each SPTE
* or, if AD bits are not enabled, clearing the writable bit on each SPTE.
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn, unsigned long mask, bool wrprot)
struct tdp_iter iter;
tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
gfn + BITS_PER_LONG) {
if (iter.level > PG_LEVEL_4K ||
!(mask & (1UL << (iter.gfn - gfn))))
mask &= ~(1UL << (iter.gfn - gfn));
if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
if (is_writable_pte(iter.old_spte))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
if (iter.old_spte & shadow_dirty_mask)
new_spte = iter.old_spte & ~shadow_dirty_mask;
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
* Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
* set in mask, starting at gfn. The given memslot is expected to contain all
* the GFNs represented by set bits in the mask. If AD bits are enabled,
* clearing the dirty status will involve clearing the dirty bit on each SPTE
* or, if AD bits are not enabled, clearing the writable bit on each SPTE.
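/*
 * For example (illustrative values), with gfn == 0x1000 and mask == 0x5
 * (bits 0 and 2 set), the dirty status of GFNs 0x1000 and 0x1002 is cleared.
 */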
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
* Clear leaf entries which could be replaced by large mappings, for
* GFNs within the slot.
static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
struct kvm_memory_slot *slot)
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
struct tdp_iter iter;
bool spte_set = false;
tdp_root_for_each_pte(iter, root, start, end) {
if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
pfn = spte_to_pfn(iter.old_spte);
if (kvm_is_reserved_pfn(pfn) ||
iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
tdp_mmu_set_spte(kvm, &iter, 0);
kvm_flush_remote_tlbs(kvm);
* Clear non-leaf entries (and free associated page tables) which could
* be replaced by large mappings, for GFNs within the slot.
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
struct kvm_memory_slot *slot)
struct kvm_mmu_page *root;
for_each_tdp_mmu_root_yield_safe(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
zap_collapsible_spte_range(kvm, root, slot);
* Removes write access on the last level SPTE mapping this GFN and unsets the
* SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
* Returns true if an SPTE was set and a TLB flush is needed.
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
struct tdp_iter iter;
bool spte_set = false;
tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
if (!is_writable_pte(iter.old_spte))
new_spte = iter.old_spte &
~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
tdp_mmu_set_spte(kvm, &iter, new_spte);
* Removes write access on the last level SPTE mapping this GFN and unsets the
* SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
* Returns true if an SPTE was set and a TLB flush is needed.
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
struct kvm_mmu_page *root;
bool spte_set = false;
lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
spte_set |= write_protect_gfn(kvm, root, gfn);
* Return the level of the lowest level SPTE added to sptes.
* That SPTE may be non-present.
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
struct tdp_iter iter;
struct kvm_mmu *mmu = vcpu->arch.mmu;
gfn_t gfn = addr >> PAGE_SHIFT;
*root_level = vcpu->arch.mmu->shadow_root_level;
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
sptes[leaf] = iter.old_spte;