1 // SPDX-License-Identifier: GPL-2.0
4 #include "mmu_internal.h"
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
19 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 /* This should not be changed for the lifetime of the VM. */
23 kvm->arch.tdp_mmu_enabled = true;
25 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
32 if (!kvm->arch.tdp_mmu_enabled)
35 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
38 * Ensure that all the outstanding RCU callbacks to free shadow pages
39 * can run before the VM is torn down.
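/* Drop a reference to @root and free it when the last reference is put. */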
44 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
46 if (kvm_mmu_put_root(kvm, root))
47 kvm_tdp_mmu_free_root(kvm, root);
50 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
51 struct kvm_mmu_page *root)
53 lockdep_assert_held_write(&kvm->mmu_lock);
55 if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
58 kvm_mmu_get_root(kvm, root);
63 static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
64 struct kvm_mmu_page *root)
66 struct kvm_mmu_page *next_root;
68 next_root = list_next_entry(root, link);
69 tdp_mmu_put_root(kvm, root);
74 * Note: this iterator gets and puts references to the roots it iterates over.
75 * This makes it safe to release the MMU lock and yield within the loop, but
76 * if exiting the loop early, the caller must drop the reference to the most
77 * recent root. (Unless keeping a live reference is desirable.)
79 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
80 for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \
81 typeof(*_root), link); \
82 tdp_mmu_next_root_valid(_kvm, _root); \
83 _root = tdp_mmu_next_root(_kvm, _root)) \
84 if (kvm_mmu_page_as_id(_root) != _as_id) { \
87 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
88 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
89 if (kvm_mmu_page_as_id(_root) != _as_id) { \
92 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
93 gfn_t start, gfn_t end, bool can_yield, bool flush);
95 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
97 gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
99 lockdep_assert_held_write(&kvm->mmu_lock);
101 WARN_ON(root->root_count);
102 WARN_ON(!root->tdp_mmu_page);
104 list_del(&root->link);
106 zap_gfn_range(kvm, root, 0, max_gfn, false, false);
108 free_page((unsigned long)root->spt);
109 kmem_cache_free(mmu_page_header_cache, root);
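/* Build the kvm_mmu_page role for a TDP MMU page table at the given level. */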
112 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
115 union kvm_mmu_page_role role;
117 role = vcpu->arch.mmu->mmu_role.base;
120 role.gpte_is_8_bytes = true;
121 role.access = ACC_ALL;
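/*
 * Allocate a new page table page and its struct kvm_mmu_page from the vCPU's
 * MMU memory caches and initialize its role for the given gfn and level.
 */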
126 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
129 struct kvm_mmu_page *sp;
131 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
132 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
133 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
135 sp->role.word = page_role_for_level(vcpu, level).word;
137 sp->tdp_mmu_page = true;
139 trace_kvm_mmu_get_page(sp, true);
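/*
 * Get the physical address of a TDP MMU root for the vCPU, reusing an
 * existing root with a matching role when possible and allocating a new one
 * otherwise.
 */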
144 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
146 union kvm_mmu_page_role role;
147 struct kvm *kvm = vcpu->kvm;
148 struct kvm_mmu_page *root;
150 lockdep_assert_held_write(&kvm->mmu_lock);
152 role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
154 /* Check for an existing root before allocating a new one. */
155 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
156 if (root->role.word == role.word) {
157 kvm_mmu_get_root(kvm, root);
162 root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
163 root->root_count = 1;
165 list_add(&root->link, &kvm->arch.tdp_mmu_roots);
168 return __pa(root->spt);
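/* Free the memory backing a TDP MMU shadow page and its struct kvm_mmu_page. */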
171 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
173 free_page((unsigned long)sp->spt);
174 kmem_cache_free(mmu_page_header_cache, sp);
178 * This is called through call_rcu in order to free TDP page table memory
179 * safely with respect to other kernel threads that may be operating on the memory.
181 * By only accessing TDP MMU page table memory in an RCU read critical
182 * section, and freeing it after a grace period, lockless access to that
183 * memory won't use it after it is freed.
185 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
187 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
193 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
194 u64 old_spte, u64 new_spte, int level,
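/*
 * Propagate accessed-bit information to the primary MM: if a leaf SPTE loses
 * its accessed state (or changes PFN), mark the old PFN as accessed.
 */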
197 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
199 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
202 if (is_accessed_spte(old_spte) &&
203 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
204 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
205 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
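/*
 * Update the dirty bitmap when a 4K SPTE becomes writable, since the guest
 * may now dirty the page without taking another fault.
 */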
208 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
209 u64 old_spte, u64 new_spte, int level)
212 struct kvm_memory_slot *slot;
214 if (level > PG_LEVEL_4K)
217 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
219 if ((!is_writable_pte(old_spte) || pfn_changed) &&
220 is_writable_pte(new_spte)) {
221 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
222 mark_page_dirty_in_slot(kvm, slot, gfn);
227 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
231 * @shared: This operation may not be running under the exclusive use of
232 * the MMU lock and the operation must synchronize with other
233 * threads that might be adding or removing pages.
234 * @account_nx: This page replaces an NX large page and should be marked for eventual reclaim.
237 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
238 bool shared, bool account_nx)
241 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
243 lockdep_assert_held_write(&kvm->mmu_lock);
245 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
247 account_huge_nx_page(kvm, sp);
250 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
254 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
257 * @sp: the page to be removed
258 * @shared: This operation may not be running under the exclusive use of
259 * the MMU lock and the operation must synchronize with other
260 * threads that might be adding or removing pages.
262 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
266 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
268 lockdep_assert_held_write(&kvm->mmu_lock);
271 if (sp->lpage_disallowed)
272 unaccount_huge_nx_page(kvm, sp);
275 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
279 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
282 * @pt: the page removed from the paging structure
283 * @shared: This operation may not be running under the exclusive use
284 * of the MMU lock and the operation must synchronize with other
285 * threads that might be modifying SPTEs.
287 * Given a page table that has been removed from the TDP paging structure,
288 * iterates through the page table to clear SPTEs and free child page tables.
290 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
291 * protection. Since this thread removed it from the paging structure,
292 * this thread will be responsible for ensuring the page is freed. Hence the
293 * early rcu_dereferences in the function.
295 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
298 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
299 int level = sp->role.level;
300 gfn_t base_gfn = sp->gfn;
306 trace_kvm_mmu_prepare_zap_page(sp);
308 tdp_mmu_unlink_page(kvm, sp, shared);
310 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
311 sptep = rcu_dereference(pt) + i;
312 gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
316 * Set the SPTE to a nonpresent value that other
317 * threads will not overwrite. If the SPTE was
318 * already marked as removed then another thread
319 * handling a page fault could overwrite it, so
320 * set the SPTE until it is set from some other
321 * value to the removed SPTE value.
324 old_child_spte = xchg(sptep, REMOVED_SPTE);
325 if (!is_removed_spte(old_child_spte))
331 * If the SPTE is not MMU-present, there is no backing
332 * page associated with the SPTE and so no side effects
333 * that need to be recorded, and exclusive ownership of
334 * mmu_lock ensures the SPTE can't be made present.
335 * Note, zapping MMIO SPTEs is also unnecessary as they
336 * are guarded by the memslots generation, not by being unreachable.
339 old_child_spte = READ_ONCE(*sptep);
340 if (!is_shadow_present_pte(old_child_spte))
344 * Marking the SPTE as a removed SPTE is not
345 * strictly necessary here as the MMU lock will
346 * stop other threads from concurrently modifying
347 * this SPTE. Using the removed SPTE value keeps
348 * the two branches consistent and simplifies the function.
351 WRITE_ONCE(*sptep, REMOVED_SPTE);
353 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
354 old_child_spte, REMOVED_SPTE, level - 1,
358 kvm_flush_remote_tlbs_with_address(kvm, gfn,
359 KVM_PAGES_PER_HPAGE(level));
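/*
 * Free the page table only after an RCU grace period so that lockless
 * walkers still holding a pointer to it do not use freed memory.
 */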
361 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
365 * handle_changed_spte - handle bookkeeping associated with an SPTE change
367 * @as_id: the address space of the paging structure the SPTE was a part of
368 * @gfn: the base GFN that was mapped by the SPTE
369 * @old_spte: The value of the SPTE before the change
370 * @new_spte: The value of the SPTE after the change
371 * @level: the level of the PT the SPTE is part of in the paging structure
372 * @shared: This operation may not be running under the exclusive use of
373 * the MMU lock and the operation must synchronize with other
374 * threads that might be modifying SPTEs.
376 * Handle bookkeeping that might result from the modification of a SPTE.
377 * This function must be called for all TDP SPTE modifications.
379 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
380 u64 old_spte, u64 new_spte, int level,
383 bool was_present = is_shadow_present_pte(old_spte);
384 bool is_present = is_shadow_present_pte(new_spte);
385 bool was_leaf = was_present && is_last_spte(old_spte, level);
386 bool is_leaf = is_present && is_last_spte(new_spte, level);
387 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
389 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
390 WARN_ON(level < PG_LEVEL_4K);
391 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
394 * If this warning were to trigger it would indicate that there was a
395 * missing MMU notifier or a race with some notifier handler.
396 * A present, leaf SPTE should never be directly replaced with another
397 * present leaf SPTE pointing to a different PFN. A notifier handler
398 * should be zapping the SPTE before the main MM's page table is
399 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
400 * thread before replacement.
402 if (was_leaf && is_leaf && pfn_changed) {
403 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
404 "SPTE with another present leaf SPTE mapping a different PFN!\n"
406 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
407 as_id, gfn, old_spte, new_spte, level);
410 * Crash the host to prevent error propagation and guest data corruption.
416 if (old_spte == new_spte)
419 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
422 * The only time an SPTE should be changed from a non-present to
423 * non-present state is when an MMIO entry is installed/modified/
424 * removed. In that case, there is nothing to do here.
426 if (!was_present && !is_present) {
428 * If this change does not involve a MMIO SPTE or removed SPTE,
429 * it is unexpected. Log the change, though it should not
430 * impact the guest since both the former and current SPTEs
433 if (WARN_ON(!is_mmio_spte(old_spte) &&
434 !is_mmio_spte(new_spte) &&
435 !is_removed_spte(new_spte)))
436 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
437 "should not be replaced with another,\n"
438 "different nonpresent SPTE, unless one or both\n"
439 "are MMIO SPTEs, or the new SPTE is\n"
440 "a temporary removed SPTE.\n"
441 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
442 as_id, gfn, old_spte, new_spte, level);
447 if (was_leaf && is_dirty_spte(old_spte) &&
448 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
449 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
452 * Recursively handle child PTs if the change removed a subtree from
453 * the paging structure.
455 if (was_present && !was_leaf && (pfn_changed || !is_present))
456 handle_removed_tdp_mmu_page(kvm,
457 spte_to_child_pt(old_spte, level), shared);
460 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
461 u64 old_spte, u64 new_spte, int level,
464 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
466 handle_changed_spte_acc_track(old_spte, new_spte, level);
467 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
472 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
473 * associated bookkeeping
476 * @iter: a tdp_iter instance currently on the SPTE that should be set
477 * @new_spte: The value the SPTE should be set to
478 * Returns: true if the SPTE was set, false if it was not. If false is returned,
479 * this function will have no side-effects.
481 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
482 struct tdp_iter *iter,
485 lockdep_assert_held_read(&kvm->mmu_lock);
488 * Do not change removed SPTEs. Only the thread that froze the SPTE may modify it.
491 if (is_removed_spte(iter->old_spte))
494 if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
495 new_spte) != iter->old_spte)
498 handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
499 new_spte, iter->level, true);
504 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
505 struct tdp_iter *iter)
508 * Freeze the SPTE by setting it to a special,
509 * non-present value. This will stop other threads from
510 * immediately installing a present entry in its place
511 * before the TLBs are flushed.
513 if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
516 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
517 KVM_PAGES_PER_HPAGE(iter->level));
520 * No other thread can overwrite the removed SPTE as they
521 * must either wait on the MMU lock or use
522 * tdp_mmu_set_spte_atomic which will not overwrite the
523 * special removed SPTE value. No bookkeeping is needed
524 * here since the SPTE is going from non-present to non-present.
527 WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
534 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
536 * @iter: a tdp_iter instance currently on the SPTE that should be set
537 * @new_spte: The value the SPTE should be set to
538 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
539 * of the page. Should be set unless handling an MMU
540 * notifier for access tracking. Leaving record_acc_track
541 * unset in that case prevents page accesses from being double counted.
543 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
544 * appropriate for the change being made. Should be set
545 * unless performing certain dirty logging operations.
546 * Leaving record_dirty_log unset in that case prevents page
547 * writes from being double counted.
549 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
550 u64 new_spte, bool record_acc_track,
551 bool record_dirty_log)
553 lockdep_assert_held_write(&kvm->mmu_lock);
556 * No thread should be using this function to set SPTEs to the
557 * temporary removed SPTE value.
558 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
559 * should be used. If operating under the MMU lock in write mode, the
560 * use of the removed SPTE should not be necessary.
562 WARN_ON(is_removed_spte(iter->old_spte));
564 WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
566 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
567 new_spte, iter->level, false);
568 if (record_acc_track)
569 handle_changed_spte_acc_track(iter->old_spte, new_spte,
571 if (record_dirty_log)
572 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
573 iter->old_spte, new_spte,
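/*
 * Wrappers around __tdp_mmu_set_spte() that differ only in whether accessed
 * state and/or dirty logging bookkeeping is recorded for the change.
 */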
577 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
580 __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
583 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
584 struct tdp_iter *iter,
587 __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
590 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
591 struct tdp_iter *iter,
594 __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
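/*
 * Iteration helpers: walk the SPTEs of a root over a GFN range, optionally
 * restricted to present leaf SPTEs, or walk the current vCPU's TDP paging
 * structure starting from its root.
 */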
597 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
598 for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
600 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
601 tdp_root_for_each_pte(_iter, _root, _start, _end) \
602 if (!is_shadow_present_pte(_iter.old_spte) || \
603 !is_last_spte(_iter.old_spte, _iter.level)) \
607 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
608 for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \
609 _mmu->shadow_root_level, _start, _end)
612 * Yield if the MMU lock is contended or this thread needs to return control to the scheduler.
615 * If this function should yield and flush is set, it will perform a remote
616 * TLB flush before yielding.
618 * If this function yields, it will also reset the tdp_iter's walk over the
619 * paging structure and the calling function should skip to the next
620 * iteration to allow the iterator to continue its traversal from the
621 * paging structure root.
623 * Return true if this function yielded and the iterator's traversal was reset.
624 * Return false if a yield was not needed.
626 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
627 struct tdp_iter *iter, bool flush)
629 /* Ensure forward progress has been made before yielding. */
630 if (iter->next_last_level_gfn == iter->yielded_gfn)
633 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
637 kvm_flush_remote_tlbs(kvm);
639 cond_resched_rwlock_write(&kvm->mmu_lock);
642 WARN_ON(iter->gfn > iter->next_last_level_gfn);
644 tdp_iter_restart(iter);
653 * Tears down the mappings for the range of gfns, [start, end), and frees the
654 * non-root pages mapping GFNs strictly within that range. Returns true if
655 * SPTEs have been cleared and a TLB flush is needed before releasing the MMU lock.
657 * If can_yield is true, will release the MMU lock and reschedule if the
658 * scheduler needs the CPU or there is contention on the MMU lock. If this
659 * function cannot yield, it will not release the MMU lock or reschedule and
660 * the caller must ensure it does not supply too large a GFN range, or the
661 * operation can cause a soft lockup. Note, in some use cases a flush may be
662 * required by prior actions. Ensure the pending flush is performed prior to yielding.
665 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
666 gfn_t start, gfn_t end, bool can_yield, bool flush)
668 struct tdp_iter iter;
672 tdp_root_for_each_pte(iter, root, start, end) {
674 tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
679 if (!is_shadow_present_pte(iter.old_spte))
683 * If this is a non-last-level SPTE that covers a larger range
684 * than should be zapped, continue, and zap the mappings at a lower level.
687 if ((iter.gfn < start ||
688 iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
689 !is_last_spte(iter.old_spte, iter.level))
692 tdp_mmu_set_spte(kvm, &iter, 0);
701 * Tears down the mappings for the range of gfns, [start, end), and frees the
702 * non-root pages mapping GFNs strictly within that range. Returns true if
703 * SPTEs have been cleared and a TLB flush is needed before releasing the MMU lock.
706 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
707 gfn_t end, bool can_yield, bool flush)
709 struct kvm_mmu_page *root;
711 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
712 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
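/* Zap all SPTEs in every address space, flushing TLBs if anything was zapped. */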
717 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
719 gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
723 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
724 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, flush);
727 kvm_flush_remote_tlbs(kvm);
731 * Installs a last-level SPTE to handle a TDP page fault.
732 * (NPT/EPT violation/misconfiguration)
734 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
736 struct tdp_iter *iter,
737 kvm_pfn_t pfn, bool prefault)
741 int make_spte_ret = 0;
743 if (unlikely(is_noslot_pfn(pfn)))
744 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
746 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
747 pfn, iter->old_spte, prefault, true,
748 map_writable, !shadow_accessed_mask,
751 if (new_spte == iter->old_spte)
752 ret = RET_PF_SPURIOUS;
753 else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
757 * If the page fault was caused by a write but the page is write
758 * protected, emulation is needed. If the emulation was skipped,
759 * the vCPU would have the same fault again.
761 if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
763 ret = RET_PF_EMULATE;
764 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
767 /* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
768 if (unlikely(is_mmio_spte(new_spte))) {
769 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
771 ret = RET_PF_EMULATE;
773 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
774 rcu_dereference(iter->sptep));
778 vcpu->stat.pf_fixed++;
784 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
785 * page tables and SPTEs to translate the faulting guest physical address.
787 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
788 int map_writable, int max_level, kvm_pfn_t pfn,
791 bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
792 bool write = error_code & PFERR_WRITE_MASK;
793 bool exec = error_code & PFERR_FETCH_MASK;
794 bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
795 struct kvm_mmu *mmu = vcpu->arch.mmu;
796 struct tdp_iter iter;
797 struct kvm_mmu_page *sp;
801 gfn_t gfn = gpa >> PAGE_SHIFT;
805 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
807 if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
810 level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
811 huge_page_disallowed, &req_level);
813 trace_kvm_mmu_spte_requested(gpa, level, pfn);
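/*
 * Walk down to the target level, replacing large SPTEs and installing
 * non-leaf SPTEs (and their page table pages) as needed along the way.
 */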
817 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
818 if (nx_huge_page_workaround_enabled)
819 disallowed_hugepage_adjust(iter.old_spte, gfn,
820 iter.level, &pfn, &level);
822 if (iter.level == level)
826 * If there is an SPTE mapping a large page at a higher level
827 * than the target, that SPTE must be cleared and replaced
828 * with a non-leaf SPTE.
830 if (is_shadow_present_pte(iter.old_spte) &&
831 is_large_pte(iter.old_spte)) {
832 if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
836 * The iter must explicitly re-read the spte here
837 * because the new value informs the !present path below.
840 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
843 if (!is_shadow_present_pte(iter.old_spte)) {
844 sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
847 new_spte = make_nonleaf_spte(child_pt,
848 !shadow_accessed_mask);
850 if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
852 tdp_mmu_link_page(vcpu->kvm, sp, true,
853 huge_page_disallowed &&
854 req_level >= iter.level);
856 trace_kvm_mmu_get_page(sp, true);
864 if (iter.level != level) {
869 ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
876 typedef int (*tdp_handler_t)(struct kvm *kvm, struct kvm_memory_slot *slot,
877 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
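/*
 * Generic MMU notifier helper: for each address space, each TDP MMU root in
 * that address space, and each memslot overlapping the HVA range, call
 * @handler on the corresponding GFN range and OR the return values together.
 */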
880 static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
884 tdp_handler_t handler)
886 struct kvm_memslots *slots;
887 struct kvm_memory_slot *memslot;
888 struct kvm_mmu_page *root;
892 for (as_id = 0; as_id < KVM_ADDRESS_SPACE_NUM; as_id++) {
893 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) {
894 slots = __kvm_memslots(kvm, as_id);
895 kvm_for_each_memslot(memslot, slots) {
896 unsigned long hva_start, hva_end;
897 gfn_t gfn_start, gfn_end;
899 hva_start = max(start, memslot->userspace_addr);
900 hva_end = min(end, memslot->userspace_addr +
901 (memslot->npages << PAGE_SHIFT));
902 if (hva_start >= hva_end)
905 * {gfn(page) | page intersects with [hva_start, hva_end)} =
906 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
908 gfn_start = hva_to_gfn_memslot(hva_start, memslot);
909 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
911 ret |= handler(kvm, memslot, root, gfn_start,
920 static __always_inline int kvm_tdp_mmu_handle_hva(struct kvm *kvm,
923 tdp_handler_t handler)
925 return kvm_tdp_mmu_handle_hva_range(kvm, addr, addr + 1, data, handler);
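/* Handler used by kvm_tdp_mmu_zap_hva_range(); zaps the GFN range without yielding. */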
928 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
929 struct kvm_memory_slot *slot,
930 struct kvm_mmu_page *root, gfn_t start,
931 gfn_t end, unsigned long unused)
933 return zap_gfn_range(kvm, root, start, end, false, false);
936 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
939 return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
940 zap_gfn_range_hva_wrapper);
944 * Mark the SPTEs mapping the range of GFNs [start, end) as unaccessed and return non-zero
945 * if any of the GFNs in the range have been accessed.
947 static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
948 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
949 unsigned long unused)
951 struct tdp_iter iter;
957 tdp_root_for_each_leaf_pte(iter, root, start, end) {
959 * If we have a non-accessed entry we don't need to change the PTE.
962 if (!is_accessed_spte(iter.old_spte))
965 new_spte = iter.old_spte;
967 if (spte_ad_enabled(new_spte)) {
968 clear_bit((ffs(shadow_accessed_mask) - 1),
969 (unsigned long *)&new_spte);
972 * Capture the dirty status of the page, so that it doesn't get
973 * lost when the SPTE is marked for access tracking.
975 if (is_writable_pte(new_spte))
976 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
978 new_spte = mark_spte_for_access_track(new_spte);
981 tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
990 int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
993 return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
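/*
 * Check whether any SPTE mapping the GFN has its accessed bit set, without
 * clearing it.
 */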
997 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
998 struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
999 unsigned long unused)
1001 struct tdp_iter iter;
1003 tdp_root_for_each_leaf_pte(iter, root, gfn, end)
1004 if (is_accessed_spte(iter.old_spte))
1010 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
1012 return kvm_tdp_mmu_handle_hva(kvm, hva, 0, test_age_gfn);
1016 * Handle the changed_pte MMU notifier for the TDP MMU.
1017 * data is a pointer to the new pte_t mapping the HVA specified by the MMU notifier.
1019 * Returns non-zero if a flush is needed before releasing the MMU lock.
1021 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
1022 struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
1025 struct tdp_iter iter;
1026 pte_t *ptep = (pte_t *)data;
1033 WARN_ON(pte_huge(*ptep) || (gfn + 1) != end);
1035 new_pfn = pte_pfn(*ptep);
1037 tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1038 if (iter.level != PG_LEVEL_4K)
1041 if (!is_shadow_present_pte(iter.old_spte))
1045 * Note, when changing a read-only SPTE, it's not strictly
1046 * necessary to zero the SPTE before setting the new PFN, but
1047 * doing so preserves the invariant that the PFN of a present
1048 * leaf SPTE can never change. See __handle_changed_spte().
1050 tdp_mmu_set_spte(kvm, &iter, 0);
1052 if (!pte_write(*ptep)) {
1053 new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1054 iter.old_spte, new_pfn);
1056 tdp_mmu_set_spte(kvm, &iter, new_spte);
1063 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1070 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
1073 return kvm_tdp_mmu_handle_hva(kvm, address, (unsigned long)host_ptep,
1078 * Remove write access from all the SPTEs mapping GFNs [start, end). Only leaf
1079 * SPTEs at or above min_level are write-protected (e.g. 4k mappings are skipped when min_level > PG_LEVEL_4K).
1080 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1082 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1083 gfn_t start, gfn_t end, int min_level)
1085 struct tdp_iter iter;
1087 bool spte_set = false;
1091 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1093 for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1094 min_level, start, end) {
1095 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1098 if (!is_shadow_present_pte(iter.old_spte) ||
1099 !is_last_spte(iter.old_spte, iter.level) ||
1100 !(iter.old_spte & PT_WRITABLE_MASK))
1103 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1105 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1114 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1115 * only affect leaf SPTEs down to min_level.
1116 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1118 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1121 struct kvm_mmu_page *root;
1122 bool spte_set = false;
1124 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1125 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1126 slot->base_gfn + slot->npages, min_level);
1132 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1133 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1134 * If AD bits are not enabled, this will require clearing the writable bit on
1135 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1138 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1139 gfn_t start, gfn_t end)
1141 struct tdp_iter iter;
1143 bool spte_set = false;
1147 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1148 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1151 if (spte_ad_need_write_protect(iter.old_spte)) {
1152 if (is_writable_pte(iter.old_spte))
1153 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1157 if (iter.old_spte & shadow_dirty_mask)
1158 new_spte = iter.old_spte & ~shadow_dirty_mask;
1163 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1172 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1173 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1174 * If AD bits are not enabled, this will require clearing the writable bit on
1175 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1178 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1180 struct kvm_mmu_page *root;
1181 bool spte_set = false;
1183 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1184 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1185 slot->base_gfn + slot->npages);
1191 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1192 * set in mask, starting at gfn. The given memslot is expected to contain all
1193 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1194 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1195 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1197 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1198 gfn_t gfn, unsigned long mask, bool wrprot)
1200 struct tdp_iter iter;
1205 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1206 gfn + BITS_PER_LONG) {
1210 if (iter.level > PG_LEVEL_4K ||
1211 !(mask & (1UL << (iter.gfn - gfn))))
1214 mask &= ~(1UL << (iter.gfn - gfn));
1216 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1217 if (is_writable_pte(iter.old_spte))
1218 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1222 if (iter.old_spte & shadow_dirty_mask)
1223 new_spte = iter.old_spte & ~shadow_dirty_mask;
1228 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1235 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1236 * set in mask, starting at gfn. The given memslot is expected to contain all
1237 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1238 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1239 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1241 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1242 struct kvm_memory_slot *slot,
1243 gfn_t gfn, unsigned long mask,
1246 struct kvm_mmu_page *root;
1248 lockdep_assert_held_write(&kvm->mmu_lock);
1249 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1250 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1254 * Clear leaf entries which could be replaced by large mappings, for
1255 * GFNs within the slot.
1257 static bool zap_collapsible_spte_range(struct kvm *kvm,
1258 struct kvm_mmu_page *root,
1259 struct kvm_memory_slot *slot,
1262 gfn_t start = slot->base_gfn;
1263 gfn_t end = start + slot->npages;
1264 struct tdp_iter iter;
1269 tdp_root_for_each_pte(iter, root, start, end) {
1270 if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
1275 if (!is_shadow_present_pte(iter.old_spte) ||
1276 !is_last_spte(iter.old_spte, iter.level))
1279 pfn = spte_to_pfn(iter.old_spte);
1280 if (kvm_is_reserved_pfn(pfn) ||
1281 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1285 tdp_mmu_set_spte(kvm, &iter, 0);
1296 * Clear leaf entries which could be replaced by large mappings, for GFNs
1297 * within the slot.
1299 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1300 struct kvm_memory_slot *slot, bool flush)
1302 struct kvm_mmu_page *root;
1304 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1305 flush = zap_collapsible_spte_range(kvm, root, slot, flush);
1311 * Removes write access on the last level SPTE mapping this GFN and unsets the
1312 * MMU-writable bit to ensure future writes continue to be intercepted.
1313 * Returns true if an SPTE was set and a TLB flush is needed.
1315 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1318 struct tdp_iter iter;
1320 bool spte_set = false;
1324 tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1325 if (!is_writable_pte(iter.old_spte))
1328 new_spte = iter.old_spte &
1329 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1331 tdp_mmu_set_spte(kvm, &iter, new_spte);
1341 * Removes write access on the last level SPTE mapping this GFN and unsets the
1342 * MMU-writable bit to ensure future writes continue to be intercepted.
1343 * Returns true if an SPTE was set and a TLB flush is needed.
1345 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1346 struct kvm_memory_slot *slot, gfn_t gfn)
1348 struct kvm_mmu_page *root;
1349 bool spte_set = false;
1351 lockdep_assert_held_write(&kvm->mmu_lock);
1352 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1353 spte_set |= write_protect_gfn(kvm, root, gfn);
1359 * Return the level of the lowest level SPTE added to sptes.
1360 * That SPTE may be non-present.
1362 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1365 struct tdp_iter iter;
1366 struct kvm_mmu *mmu = vcpu->arch.mmu;
1367 gfn_t gfn = addr >> PAGE_SHIFT;
1370 *root_level = vcpu->arch.mmu->shadow_root_level;
1374 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1376 sptes[leaf] = iter.old_spte;