1 // SPDX-License-Identifier: GPL-2.0
4 #include "mmu_internal.h"
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
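/*
 * Note: module_param_named() with mode 0644 exposes this as a writable
 * module parameter (e.g. /sys/module/kvm/parameters/tdp_mmu, assuming this
 * file is built into kvm.ko). kvm_mmu_init_tdp_mmu() below samples the
 * value once per VM, so toggling it at runtime only affects VMs created
 * afterwards.
 */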
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
19 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 /* This should not be changed for the lifetime of the VM. */
23 kvm->arch.tdp_mmu_enabled = true;
25 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
32 if (!kvm->arch.tdp_mmu_enabled)
35 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
38 * Ensure that all the outstanding RCU callbacks to free shadow pages
39 * can run before the VM is torn down.
44 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
45 gfn_t start, gfn_t end, bool can_yield, bool flush);
47 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
49 free_page((unsigned long)sp->spt);
50 kmem_cache_free(mmu_page_header_cache, sp);
53 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
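/*
 * shadow_phys_bits is the number of physical address bits KVM can map, so
 * this is one past the largest GFN a root can cover.
 */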
55 gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
57 lockdep_assert_held_write(&kvm->mmu_lock);
59 if (--root->root_count)
62 WARN_ON(!root->tdp_mmu_page);
64 list_del(&root->link);
66 zap_gfn_range(kvm, root, 0, max_gfn, false, false);
68 tdp_mmu_free_sp(root);
72 * Finds the next valid root after root (or the first valid root if root
73 * is NULL), takes a reference on it, and returns that next root. If root
74 * is not NULL, this thread should have already taken a reference on it, and
75 * that reference will be dropped. If no valid root is found, this
76 * function will return NULL.
78 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
79 struct kvm_mmu_page *prev_root)
81 struct kvm_mmu_page *next_root;
83 lockdep_assert_held_write(&kvm->mmu_lock);
86 next_root = list_next_entry(prev_root, link);
88 next_root = list_first_entry(&kvm->arch.tdp_mmu_roots,
89 typeof(*next_root), link);
91 if (list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link))
94 kvm_tdp_mmu_get_root(kvm, next_root);
97 kvm_tdp_mmu_put_root(kvm, prev_root);
103 * Note: this iterator gets and puts references to the roots it iterates over.
104 * This makes it safe to release the MMU lock and yield within the loop, but
105 * if exiting the loop early, the caller must drop the reference to the most
106 * recent root. (Unless keeping a live reference is desirable.)
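/*
 * For illustration, a caller that bails out of the walk early would look
 * roughly like this (the hypothetical want_to_stop() stands in for any
 * early-exit condition):
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) {
 *		if (want_to_stop(root)) {
 *			kvm_tdp_mmu_put_root(kvm, root);
 *			break;
 *		}
 *	}
 */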
108 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
109 for (_root = tdp_mmu_next_root(_kvm, NULL); \
111 _root = tdp_mmu_next_root(_kvm, _root)) \
112 if (kvm_mmu_page_as_id(_root) != _as_id) { \
115 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
116 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
117 if (kvm_mmu_page_as_id(_root) != _as_id) { \
120 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
123 union kvm_mmu_page_role role;
125 role = vcpu->arch.mmu->mmu_role.base;
128 role.gpte_is_8_bytes = true;
129 role.access = ACC_ALL;
134 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
137 struct kvm_mmu_page *sp;
139 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
140 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
141 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
143 sp->role.word = page_role_for_level(vcpu, level).word;
145 sp->tdp_mmu_page = true;
147 trace_kvm_mmu_get_page(sp, true);
152 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
154 union kvm_mmu_page_role role;
155 struct kvm *kvm = vcpu->kvm;
156 struct kvm_mmu_page *root;
158 lockdep_assert_held_write(&kvm->mmu_lock);
160 role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
162 /* Check for an existing root before allocating a new one. */
163 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
164 if (root->role.word == role.word) {
165 kvm_tdp_mmu_get_root(kvm, root);
170 root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
171 root->root_count = 1;
173 list_add(&root->link, &kvm->arch.tdp_mmu_roots);
176 return __pa(root->spt);
180 * This is called through call_rcu in order to free TDP page table memory
* safely with respect to other kernel threads that may be operating on
* the memory.
183 * By only accessing TDP MMU page table memory in an RCU read critical
184 * section, and freeing it after a grace period, lockless access to that
185 * memory won't use it after it is freed.
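/*
 * The reader side of that scheme is, roughly:
 *
 *	rcu_read_lock();
 *	... walk SPTEs via rcu_dereference() ...
 *	rcu_read_unlock();
 *
 * Any page table page observed inside the read-side critical section stays
 * allocated at least until the section ends, because the actual free below
 * is deferred through call_rcu().
 */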
187 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
189 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
195 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
196 u64 old_spte, u64 new_spte, int level,
199 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
201 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
204 if (is_accessed_spte(old_spte) &&
205 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
206 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
207 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
210 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
211 u64 old_spte, u64 new_spte, int level)
214 struct kvm_memory_slot *slot;
216 if (level > PG_LEVEL_4K)
219 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
221 if ((!is_writable_pte(old_spte) || pfn_changed) &&
222 is_writable_pte(new_spte)) {
223 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
224 mark_page_dirty_in_slot(kvm, slot, gfn);
229 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
233 * @shared: This operation may not be running under the exclusive use of
234 * the MMU lock and the operation must synchronize with other
235 * threads that might be adding or removing pages.
* @account_nx: This page replaces an NX large page and should be marked for
*		eventual reclaim.
239 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
240 bool shared, bool account_nx)
243 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
245 lockdep_assert_held_write(&kvm->mmu_lock);
247 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
249 account_huge_nx_page(kvm, sp);
252 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
256 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
259 * @sp: the page to be removed
260 * @shared: This operation may not be running under the exclusive use of
261 * the MMU lock and the operation must synchronize with other
262 * threads that might be adding or removing pages.
264 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
268 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
270 lockdep_assert_held_write(&kvm->mmu_lock);
273 if (sp->lpage_disallowed)
274 unaccount_huge_nx_page(kvm, sp);
277 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
281 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
284 * @pt: the page removed from the paging structure
285 * @shared: This operation may not be running under the exclusive use
286 * of the MMU lock and the operation must synchronize with other
287 * threads that might be modifying SPTEs.
289 * Given a page table that has been removed from the TDP paging structure,
290 * iterates through the page table to clear SPTEs and free child page tables.
292 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
293 * protection. Since this thread removed it from the paging structure,
294 * this thread will be responsible for ensuring the page is freed. Hence the
295 * early rcu_dereferences in the function.
297 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
300 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
301 int level = sp->role.level;
302 gfn_t base_gfn = sp->gfn;
308 trace_kvm_mmu_prepare_zap_page(sp);
310 tdp_mmu_unlink_page(kvm, sp, shared);
312 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
313 sptep = rcu_dereference(pt) + i;
314 gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
* Set the SPTE to a nonpresent value that other
* threads will not overwrite. If the SPTE was
* already marked as removed then another thread
* handling a page fault could overwrite it, so
* retry the exchange until the SPTE is observed
* changing from some other value to the removed
* SPTE value.
326 old_child_spte = xchg(sptep, REMOVED_SPTE);
327 if (!is_removed_spte(old_child_spte))
333 * If the SPTE is not MMU-present, there is no backing
334 * page associated with the SPTE and so no side effects
335 * that need to be recorded, and exclusive ownership of
336 * mmu_lock ensures the SPTE can't be made present.
337 * Note, zapping MMIO SPTEs is also unnecessary as they
* are guarded by the memslots generation, not by being
* unreachable.
341 old_child_spte = READ_ONCE(*sptep);
342 if (!is_shadow_present_pte(old_child_spte))
346 * Marking the SPTE as a removed SPTE is not
347 * strictly necessary here as the MMU lock will
348 * stop other threads from concurrently modifying
349 * this SPTE. Using the removed SPTE value keeps
350 * the two branches consistent and simplifies
353 WRITE_ONCE(*sptep, REMOVED_SPTE);
355 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
356 old_child_spte, REMOVED_SPTE, level - 1,
360 kvm_flush_remote_tlbs_with_address(kvm, gfn,
361 KVM_PAGES_PER_HPAGE(level));
363 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
367 * handle_changed_spte - handle bookkeeping associated with an SPTE change
369 * @as_id: the address space of the paging structure the SPTE was a part of
370 * @gfn: the base GFN that was mapped by the SPTE
371 * @old_spte: The value of the SPTE before the change
372 * @new_spte: The value of the SPTE after the change
373 * @level: the level of the PT the SPTE is part of in the paging structure
374 * @shared: This operation may not be running under the exclusive use of
375 * the MMU lock and the operation must synchronize with other
376 * threads that might be modifying SPTEs.
378 * Handle bookkeeping that might result from the modification of a SPTE.
379 * This function must be called for all TDP SPTE modifications.
381 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
382 u64 old_spte, u64 new_spte, int level,
385 bool was_present = is_shadow_present_pte(old_spte);
386 bool is_present = is_shadow_present_pte(new_spte);
387 bool was_leaf = was_present && is_last_spte(old_spte, level);
388 bool is_leaf = is_present && is_last_spte(new_spte, level);
389 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
391 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
392 WARN_ON(level < PG_LEVEL_4K);
393 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
396 * If this warning were to trigger it would indicate that there was a
397 * missing MMU notifier or a race with some notifier handler.
398 * A present, leaf SPTE should never be directly replaced with another
* present leaf SPTE pointing to a different PFN. A notifier handler
400 * should be zapping the SPTE before the main MM's page table is
401 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
402 * thread before replacement.
404 if (was_leaf && is_leaf && pfn_changed) {
405 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
406 "SPTE with another present leaf SPTE mapping a\n"
408 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
409 as_id, gfn, old_spte, new_spte, level);
* Crash the host to prevent error propagation and guest data
* corruption.
418 if (old_spte == new_spte)
421 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
424 * The only times a SPTE should be changed from a non-present to
425 * non-present state is when an MMIO entry is installed/modified/
426 * removed. In that case, there is nothing to do here.
428 if (!was_present && !is_present) {
430 * If this change does not involve a MMIO SPTE or removed SPTE,
431 * it is unexpected. Log the change, though it should not
432 * impact the guest since both the former and current SPTEs
435 if (WARN_ON(!is_mmio_spte(old_spte) &&
436 !is_mmio_spte(new_spte) &&
437 !is_removed_spte(new_spte)))
438 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
439 "should not be replaced with another,\n"
440 "different nonpresent SPTE, unless one or both\n"
441 "are MMIO SPTEs, or the new SPTE is\n"
442 "a temporary removed SPTE.\n"
443 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
444 as_id, gfn, old_spte, new_spte, level);
449 if (was_leaf && is_dirty_spte(old_spte) &&
450 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
451 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
454 * Recursively handle child PTs if the change removed a subtree from
455 * the paging structure.
457 if (was_present && !was_leaf && (pfn_changed || !is_present))
458 handle_removed_tdp_mmu_page(kvm,
459 spte_to_child_pt(old_spte, level), shared);
462 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
463 u64 old_spte, u64 new_spte, int level,
466 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
468 handle_changed_spte_acc_track(old_spte, new_spte, level);
469 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
474 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
475 * associated bookkeeping
478 * @iter: a tdp_iter instance currently on the SPTE that should be set
479 * @new_spte: The value the SPTE should be set to
480 * Returns: true if the SPTE was set, false if it was not. If false is returned,
481 * this function will have no side-effects.
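/*
 * A false return means another thread raced and changed the SPTE first; the
 * callers in this file simply give up and let the operation be retried,
 * e.g. (sketch of the page fault path below):
 *
 *	if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
 *		return RET_PF_RETRY;
 */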
483 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
484 struct tdp_iter *iter,
487 lockdep_assert_held_read(&kvm->mmu_lock);
490 * Do not change removed SPTEs. Only the thread that froze the SPTE
493 if (is_removed_spte(iter->old_spte))
496 if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
497 new_spte) != iter->old_spte)
500 handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
501 new_spte, iter->level, true);
506 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
507 struct tdp_iter *iter)
510 * Freeze the SPTE by setting it to a special,
511 * non-present value. This will stop other threads from
512 * immediately installing a present entry in its place
513 * before the TLBs are flushed.
515 if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
518 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
519 KVM_PAGES_PER_HPAGE(iter->level));
522 * No other thread can overwrite the removed SPTE as they
523 * must either wait on the MMU lock or use
* tdp_mmu_set_spte_atomic which will not overwrite the
* special removed SPTE value. No bookkeeping is needed
* here since the SPTE is going from non-present
* to non-present.
529 WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
536 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
538 * @iter: a tdp_iter instance currently on the SPTE that should be set
539 * @new_spte: The value the SPTE should be set to
540 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
541 * of the page. Should be set unless handling an MMU
542 * notifier for access tracking. Leaving record_acc_track
* unset in that case prevents page accesses from being
* double counted.
545 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
546 * appropriate for the change being made. Should be set
547 * unless performing certain dirty logging operations.
548 * Leaving record_dirty_log unset in that case prevents page
549 * writes from being double counted.
551 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
552 u64 new_spte, bool record_acc_track,
553 bool record_dirty_log)
555 lockdep_assert_held_write(&kvm->mmu_lock);
558 * No thread should be using this function to set SPTEs to the
559 * temporary removed SPTE value.
560 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
561 * should be used. If operating under the MMU lock in write mode, the
562 * use of the removed SPTE should not be necessary.
564 WARN_ON(is_removed_spte(iter->old_spte));
566 WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
568 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
569 new_spte, iter->level, false);
570 if (record_acc_track)
571 handle_changed_spte_acc_track(iter->old_spte, new_spte,
573 if (record_dirty_log)
574 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
575 iter->old_spte, new_spte,
579 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
582 __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
585 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
586 struct tdp_iter *iter,
589 __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
592 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
593 struct tdp_iter *iter,
596 __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
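/*
 * Iteration helpers: the tdp_root_for_each_*() macros below walk the paging
 * structure hanging off a specific root kvm_mmu_page, while
 * tdp_mmu_for_each_pte() starts from the vCPU's current root (mmu->root_hpa).
 * Each yields a tdp_iter positioned on SPTEs covering the GFN range
 * [_start, _end).
 */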
599 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
600 for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
602 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
603 tdp_root_for_each_pte(_iter, _root, _start, _end) \
604 if (!is_shadow_present_pte(_iter.old_spte) || \
605 !is_last_spte(_iter.old_spte, _iter.level)) \
609 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
610 for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \
611 _mmu->shadow_root_level, _start, _end)
* Yield if the MMU lock is contended or this thread needs to return control
* to the scheduler.
617 * If this function should yield and flush is set, it will perform a remote
618 * TLB flush before yielding.
620 * If this function yields, it will also reset the tdp_iter's walk over the
621 * paging structure and the calling function should skip to the next
622 * iteration to allow the iterator to continue its traversal from the
623 * paging structure root.
625 * Return true if this function yielded and the iterator's traversal was reset.
626 * Return false if a yield was not needed.
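/*
 * The expected calling pattern, as used by zap_gfn_range() and friends
 * below, is roughly:
 *
 *	tdp_root_for_each_pte(iter, root, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
 *			flush = false;
 *			continue;
 *		}
 *		...
 *	}
 */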
628 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
629 struct tdp_iter *iter, bool flush)
631 /* Ensure forward progress has been made before yielding. */
632 if (iter->next_last_level_gfn == iter->yielded_gfn)
635 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
639 kvm_flush_remote_tlbs(kvm);
641 cond_resched_rwlock_write(&kvm->mmu_lock);
644 WARN_ON(iter->gfn > iter->next_last_level_gfn);
646 tdp_iter_restart(iter);
655 * Tears down the mappings for the range of gfns, [start, end), and frees the
656 * non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
659 * If can_yield is true, will release the MMU lock and reschedule if the
660 * scheduler needs the CPU or there is contention on the MMU lock. If this
661 * function cannot yield, it will not release the MMU lock or reschedule and
662 * the caller must ensure it does not supply too large a GFN range, or the
663 * operation can cause a soft lockup. Note, in some use cases a flush may be
* required by prior actions. Ensure the pending flush is performed prior to
* yielding.
667 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
668 gfn_t start, gfn_t end, bool can_yield, bool flush)
670 struct tdp_iter iter;
674 tdp_root_for_each_pte(iter, root, start, end) {
676 tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
681 if (!is_shadow_present_pte(iter.old_spte))
685 * If this is a non-last-level SPTE that covers a larger range
* than should be zapped, continue, and zap the mappings at a
* lower level, if applicable.
689 if ((iter.gfn < start ||
690 iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
691 !is_last_spte(iter.old_spte, iter.level))
694 tdp_mmu_set_spte(kvm, &iter, 0);
703 * Tears down the mappings for the range of gfns, [start, end), and frees the
704 * non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
708 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
709 gfn_t end, bool can_yield, bool flush)
711 struct kvm_mmu_page *root;
713 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
714 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
719 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
721 gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
725 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
726 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, flush);
729 kvm_flush_remote_tlbs(kvm);
733 * Installs a last-level SPTE to handle a TDP page fault.
734 * (NPT/EPT violation/misconfiguration)
736 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
738 struct tdp_iter *iter,
739 kvm_pfn_t pfn, bool prefault)
743 int make_spte_ret = 0;
745 if (unlikely(is_noslot_pfn(pfn)))
746 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
748 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
749 pfn, iter->old_spte, prefault, true,
750 map_writable, !shadow_accessed_mask,
753 if (new_spte == iter->old_spte)
754 ret = RET_PF_SPURIOUS;
755 else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
759 * If the page fault was caused by a write but the page is write
760 * protected, emulation is needed. If the emulation was skipped,
761 * the vCPU would have the same fault again.
763 if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
765 ret = RET_PF_EMULATE;
766 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
769 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
770 if (unlikely(is_mmio_spte(new_spte))) {
771 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
773 ret = RET_PF_EMULATE;
775 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
776 rcu_dereference(iter->sptep));
780 vcpu->stat.pf_fixed++;
786 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
787 * page tables and SPTEs to translate the faulting guest physical address.
789 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
790 int map_writable, int max_level, kvm_pfn_t pfn,
793 bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
794 bool write = error_code & PFERR_WRITE_MASK;
795 bool exec = error_code & PFERR_FETCH_MASK;
796 bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
797 struct kvm_mmu *mmu = vcpu->arch.mmu;
798 struct tdp_iter iter;
799 struct kvm_mmu_page *sp;
803 gfn_t gfn = gpa >> PAGE_SHIFT;
807 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
809 if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
812 level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
813 huge_page_disallowed, &req_level);
815 trace_kvm_mmu_spte_requested(gpa, level, pfn);
819 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
820 if (nx_huge_page_workaround_enabled)
821 disallowed_hugepage_adjust(iter.old_spte, gfn,
822 iter.level, &pfn, &level);
824 if (iter.level == level)
828 * If there is an SPTE mapping a large page at a higher level
829 * than the target, that SPTE must be cleared and replaced
830 * with a non-leaf SPTE.
832 if (is_shadow_present_pte(iter.old_spte) &&
833 is_large_pte(iter.old_spte)) {
834 if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
838 * The iter must explicitly re-read the spte here
* because the new value informs the !present
* path below.
842 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
845 if (!is_shadow_present_pte(iter.old_spte)) {
846 sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
849 new_spte = make_nonleaf_spte(child_pt,
850 !shadow_accessed_mask);
852 if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
854 tdp_mmu_link_page(vcpu->kvm, sp, true,
855 huge_page_disallowed &&
856 req_level >= iter.level);
858 trace_kvm_mmu_get_page(sp, true);
866 if (iter.level != level) {
871 ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
878 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
881 struct kvm_mmu_page *root;
883 for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
884 flush |= zap_gfn_range(kvm, root, range->start, range->end,
885 range->may_block, flush);
890 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
891 struct kvm_gfn_range *range);
893 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
894 struct kvm_gfn_range *range,
895 tdp_handler_t handler)
897 struct kvm_mmu_page *root;
898 struct tdp_iter iter;
904 * Don't support rescheduling, none of the MMU notifiers that funnel
905 * into this helper allow blocking; it'd be dead, wasteful code.
907 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
908 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
909 ret |= handler(kvm, &iter, range);
* Mark the SPTEs in the range of GFNs [start, end) as unaccessed and return
* true if any of the GFNs in the range have been accessed.
921 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
922 struct kvm_gfn_range *range)
926 /* If we have a non-accessed entry we don't need to change the pte. */
927 if (!is_accessed_spte(iter->old_spte))
930 new_spte = iter->old_spte;
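/*
 * With A/D bits enabled, aging only needs to clear the hardware accessed
 * bit and the SPTE stays present. Without A/D bits, the SPTE is converted
 * to an access-tracking SPTE below, which is non-present to hardware.
 */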
932 if (spte_ad_enabled(new_spte)) {
933 new_spte &= ~shadow_accessed_mask;
936 * Capture the dirty status of the page, so that it doesn't get
937 * lost when the SPTE is marked for access tracking.
939 if (is_writable_pte(new_spte))
940 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
942 new_spte = mark_spte_for_access_track(new_spte);
945 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
950 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
952 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
955 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
956 struct kvm_gfn_range *range)
958 return is_accessed_spte(iter->old_spte);
961 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
963 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
966 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
967 struct kvm_gfn_range *range)
971 /* Huge pages aren't expected to be modified without first being zapped. */
972 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
974 if (iter->level != PG_LEVEL_4K ||
975 !is_shadow_present_pte(iter->old_spte))
979 * Note, when changing a read-only SPTE, it's not strictly necessary to
980 * zero the SPTE before setting the new PFN, but doing so preserves the
* invariant that the PFN of a present leaf SPTE can never change.
982 * See __handle_changed_spte().
984 tdp_mmu_set_spte(kvm, iter, 0);
986 if (!pte_write(range->pte)) {
987 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
988 pte_pfn(range->pte));
990 tdp_mmu_set_spte(kvm, iter, new_spte);
* Handle the changed_pte MMU notifier for the TDP MMU.
* range->pte holds the new PTE mapping the HVA covered by the notifier.
* Returns true if a flush is needed before releasing the MMU lock.
1002 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1004 bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1006 /* FIXME: return 'flush' instead of flushing here. */
1008 kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
* Remove write access from all the SPTEs mapping GFNs [start, end). Only
* leaf SPTEs at levels at or above min_level are write-protected, so 4k
* mappings are left alone when min_level is above PG_LEVEL_4K.
1016 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1018 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1019 gfn_t start, gfn_t end, int min_level)
1021 struct tdp_iter iter;
1023 bool spte_set = false;
1027 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1029 for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1030 min_level, start, end) {
1031 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1034 if (!is_shadow_present_pte(iter.old_spte) ||
1035 !is_last_spte(iter.old_spte, iter.level) ||
1036 !(iter.old_spte & PT_WRITABLE_MASK))
1039 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1041 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1050 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1051 * only affect leaf SPTEs down to min_level.
1052 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1054 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1057 struct kvm_mmu_page *root;
1058 bool spte_set = false;
1060 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1061 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1062 slot->base_gfn + slot->npages, min_level);
1068 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1069 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1070 * If AD bits are not enabled, this will require clearing the writable bit on
* each SPTE. Returns true if an SPTE has been changed and the TLBs need to
* be flushed.
1074 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1075 gfn_t start, gfn_t end)
1077 struct tdp_iter iter;
1079 bool spte_set = false;
1083 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1084 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1087 if (spte_ad_need_write_protect(iter.old_spte)) {
1088 if (is_writable_pte(iter.old_spte))
1089 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1093 if (iter.old_spte & shadow_dirty_mask)
1094 new_spte = iter.old_spte & ~shadow_dirty_mask;
1099 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1108 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1109 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1110 * If AD bits are not enabled, this will require clearing the writable bit on
* each SPTE. Returns true if an SPTE has been changed and the TLBs need to
* be flushed.
1114 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1116 struct kvm_mmu_page *root;
1117 bool spte_set = false;
1119 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1120 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1121 slot->base_gfn + slot->npages);
1127 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1128 * set in mask, starting at gfn. The given memslot is expected to contain all
1129 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1130 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1131 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1133 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1134 gfn_t gfn, unsigned long mask, bool wrprot)
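/*
 * Each set bit i in @mask denotes GFN @gfn + i, so the walk below only has
 * to cover [gfn + __ffs(mask), gfn + BITS_PER_LONG). For example, a mask of
 * 0x5 clears the dirty state for gfn and gfn + 2 only.
 */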
1136 struct tdp_iter iter;
1141 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1142 gfn + BITS_PER_LONG) {
1146 if (iter.level > PG_LEVEL_4K ||
1147 !(mask & (1UL << (iter.gfn - gfn))))
1150 mask &= ~(1UL << (iter.gfn - gfn));
1152 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1153 if (is_writable_pte(iter.old_spte))
1154 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1158 if (iter.old_spte & shadow_dirty_mask)
1159 new_spte = iter.old_spte & ~shadow_dirty_mask;
1164 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1171 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1172 * set in mask, starting at gfn. The given memslot is expected to contain all
1173 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1174 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1175 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1177 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1178 struct kvm_memory_slot *slot,
1179 gfn_t gfn, unsigned long mask,
1182 struct kvm_mmu_page *root;
1184 lockdep_assert_held_write(&kvm->mmu_lock);
1185 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1186 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1190 * Clear leaf entries which could be replaced by large mappings, for
1191 * GFNs within the slot.
1193 static bool zap_collapsible_spte_range(struct kvm *kvm,
1194 struct kvm_mmu_page *root,
1195 const struct kvm_memory_slot *slot,
1198 gfn_t start = slot->base_gfn;
1199 gfn_t end = start + slot->npages;
1200 struct tdp_iter iter;
1205 tdp_root_for_each_pte(iter, root, start, end) {
1206 if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
1211 if (!is_shadow_present_pte(iter.old_spte) ||
1212 !is_last_spte(iter.old_spte, iter.level))
1215 pfn = spte_to_pfn(iter.old_spte);
1216 if (kvm_is_reserved_pfn(pfn) ||
1217 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1221 tdp_mmu_set_spte(kvm, &iter, 0);
1232 * Clear non-leaf entries (and free associated page tables) which could
1233 * be replaced by large mappings, for GFNs within the slot.
1235 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1236 const struct kvm_memory_slot *slot,
1239 struct kvm_mmu_page *root;
1241 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1242 flush = zap_collapsible_spte_range(kvm, root, slot, flush);
1248 * Removes write access on the last level SPTE mapping this GFN and unsets the
1249 * MMU-writable bit to ensure future writes continue to be intercepted.
1250 * Returns true if an SPTE was set and a TLB flush is needed.
1252 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1255 struct tdp_iter iter;
1257 bool spte_set = false;
1261 tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1262 if (!is_writable_pte(iter.old_spte))
1265 new_spte = iter.old_spte &
1266 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1268 tdp_mmu_set_spte(kvm, &iter, new_spte);
1278 * Removes write access on the last level SPTE mapping this GFN and unsets the
1279 * MMU-writable bit to ensure future writes continue to be intercepted.
1280 * Returns true if an SPTE was set and a TLB flush is needed.
1282 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1283 struct kvm_memory_slot *slot, gfn_t gfn)
1285 struct kvm_mmu_page *root;
1286 bool spte_set = false;
1288 lockdep_assert_held_write(&kvm->mmu_lock);
1289 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1290 spte_set |= write_protect_gfn(kvm, root, gfn);
1296 * Return the level of the lowest level SPTE added to sptes.
1297 * That SPTE may be non-present.
1299 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1302 struct tdp_iter iter;
1303 struct kvm_mmu *mmu = vcpu->arch.mmu;
1304 gfn_t gfn = addr >> PAGE_SHIFT;
1307 *root_level = vcpu->arch.mmu->shadow_root_level;
1311 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1313 sptes[leaf] = iter.old_spte;