1 // SPDX-License-Identifier: GPL-2.0
4 #include "mmu_internal.h"
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
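/*
 * Usage sketch (assumes this file is built into the kvm module, as in the
 * upstream tree): the TDP MMU can be enabled with "kvm.tdp_mmu=Y" on the
 * kernel command line, or toggled via /sys/module/kvm/parameters/tdp_mmu
 * (mode 0644). The value is only consulted when a VM is created, so changing
 * it does not affect VMs that already exist.
 */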
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
19 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 /* This should not be changed for the lifetime of the VM. */
23 kvm->arch.tdp_mmu_enabled = true;
25 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
32 if (!kvm->arch.tdp_mmu_enabled)
35 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
38 * Ensure that all the outstanding RCU callbacks to free shadow pages
39 * can run before the VM is torn down.
44 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
46 if (kvm_mmu_put_root(kvm, root))
47 kvm_tdp_mmu_free_root(kvm, root);
50 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
51 struct kvm_mmu_page *root)
53 lockdep_assert_held_write(&kvm->mmu_lock);
55 if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
58 kvm_mmu_get_root(kvm, root);
63 static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
64 struct kvm_mmu_page *root)
66 struct kvm_mmu_page *next_root;
68 next_root = list_next_entry(root, link);
69 tdp_mmu_put_root(kvm, root);
74 * Note: this iterator gets and puts references to the roots it iterates over.
75 * This makes it safe to release the MMU lock and yield within the loop, but
76 * if exiting the loop early, the caller must drop the reference to the most
77 * recent root. (Unless keeping a live reference is desirable.)
79 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
80 for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \
81 typeof(*_root), link); \
82 tdp_mmu_next_root_valid(_kvm, _root); \
83 _root = tdp_mmu_next_root(_kvm, _root))
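/*
 * Illustrative sketch (not part of the original source): a caller that
 * breaks out of the yield-safe loop early must drop the reference the
 * iterator still holds on the current root, e.g.:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root) {
 *		if (<some caller-specific condition>) {
 *			tdp_mmu_put_root(kvm, root);
 *			break;
 *		}
 *	}
 *
 * where <some caller-specific condition> is a placeholder for whatever
 * makes the caller stop iterating.
 */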
85 #define for_each_tdp_mmu_root(_kvm, _root) \
86 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
88 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
89 gfn_t start, gfn_t end, bool can_yield);
91 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
93 gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
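/*
 * For example, with shadow_phys_bits == 52 and 4KiB pages (PAGE_SHIFT == 12),
 * max_gfn is 1ULL << 40, i.e. the first GFN beyond the addressable guest
 * physical address space.
 */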
95 lockdep_assert_held_write(&kvm->mmu_lock);
97 WARN_ON(root->root_count);
98 WARN_ON(!root->tdp_mmu_page);
100 list_del(&root->link);
102 zap_gfn_range(kvm, root, 0, max_gfn, false);
104 free_page((unsigned long)root->spt);
105 kmem_cache_free(mmu_page_header_cache, root);
108 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
111 union kvm_mmu_page_role role;
113 role = vcpu->arch.mmu->mmu_role.base;
116 role.gpte_is_8_bytes = true;
117 role.access = ACC_ALL;
122 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
125 struct kvm_mmu_page *sp;
127 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
128 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
129 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
131 sp->role.word = page_role_for_level(vcpu, level).word;
133 sp->tdp_mmu_page = true;
135 trace_kvm_mmu_get_page(sp, true);
140 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
142 union kvm_mmu_page_role role;
143 struct kvm *kvm = vcpu->kvm;
144 struct kvm_mmu_page *root;
146 lockdep_assert_held_write(&kvm->mmu_lock);
148 role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
150 /* Check for an existing root before allocating a new one. */
151 for_each_tdp_mmu_root(kvm, root) {
152 if (root->role.word == role.word) {
153 kvm_mmu_get_root(kvm, root);
158 root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
159 root->root_count = 1;
161 list_add(&root->link, &kvm->arch.tdp_mmu_roots);
164 return __pa(root->spt);
167 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
169 free_page((unsigned long)sp->spt);
170 kmem_cache_free(mmu_page_header_cache, sp);
174 * This is called through call_rcu in order to free TDP page table memory
175 * safely with respect to other kernel threads that may be operating on the memory.
177 * By only accessing TDP MMU page table memory in an RCU read critical
178 * section, and freeing it after a grace period, lockless access to that
179 * memory won't use it after it is freed.
181 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
183 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
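/*
 * Illustrative sketch (not part of the original source): the reader side of
 * the scheme described above keeps every dereference of SPTE pointers inside
 * an RCU read-side critical section, e.g.:
 *
 *	rcu_read_lock();
 *	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
 *		(examine iter.old_spte, iter.level, ...)
 *	}
 *	rcu_read_unlock();
 *
 * so a page table freed via call_rcu() above cannot disappear while a
 * lockless walker might still be looking at it.
 */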
189 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
190 u64 old_spte, u64 new_spte, int level,
193 static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
195 return sp->role.smm ? 1 : 0;
198 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
200 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
202 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
205 if (is_accessed_spte(old_spte) &&
206 (!is_accessed_spte(new_spte) || pfn_changed))
207 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
210 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
211 u64 old_spte, u64 new_spte, int level)
214 struct kvm_memory_slot *slot;
216 if (level > PG_LEVEL_4K)
219 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
221 if ((!is_writable_pte(old_spte) || pfn_changed) &&
222 is_writable_pte(new_spte)) {
223 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
224 mark_page_dirty_in_slot(kvm, slot, gfn);
229 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
233 * @shared: This operation may not be running under the exclusive use of
234 * the MMU lock and the operation must synchronize with other
235 * threads that might be adding or removing pages.
236 * @account_nx: This page replaces a NX large page and should be marked for eventual reclaim.
239 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
240 bool shared, bool account_nx)
243 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
245 lockdep_assert_held_write(&kvm->mmu_lock);
247 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
249 account_huge_nx_page(kvm, sp);
252 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
256 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
259 * @sp: the page to be removed
260 * @shared: This operation may not be running under the exclusive use of
261 * the MMU lock and the operation must synchronize with other
262 * threads that might be adding or removing pages.
264 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
268 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
270 lockdep_assert_held_write(&kvm->mmu_lock);
273 if (sp->lpage_disallowed)
274 unaccount_huge_nx_page(kvm, sp);
277 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
281 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
284 * @pt: the page removed from the paging structure
285 * @shared: This operation may not be running under the exclusive use
286 * of the MMU lock and the operation must synchronize with other
287 * threads that might be modifying SPTEs.
289 * Given a page table that has been removed from the TDP paging structure,
290 * iterates through the page table to clear SPTEs and free child page tables.
292 static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
295 struct kvm_mmu_page *sp = sptep_to_sp(pt);
296 int level = sp->role.level;
297 gfn_t base_gfn = sp->gfn;
303 trace_kvm_mmu_prepare_zap_page(sp);
305 tdp_mmu_unlink_page(kvm, sp, shared);
307 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
309 gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
313 * Set the SPTE to a nonpresent value that other
314 * threads will not overwrite. If the SPTE was
315 * already marked as removed then another thread
316 * handling a page fault could overwrite it, so
317 * keep retrying the exchange until the previous value
318 * is something other than the removed SPTE value.
321 old_child_spte = xchg(sptep, REMOVED_SPTE);
322 if (!is_removed_spte(old_child_spte))
328 * If the SPTE is not MMU-present, there is no backing
329 * page associated with the SPTE and so no side effects
330 * that need to be recorded, and exclusive ownership of
331 * mmu_lock ensures the SPTE can't be made present.
332 * Note, zapping MMIO SPTEs is also unnecessary as they
333 * are guarded by the memslots generation, not by being unreachable.
336 old_child_spte = READ_ONCE(*sptep);
337 if (!is_shadow_present_pte(old_child_spte))
341 * Marking the SPTE as a removed SPTE is not
342 * strictly necessary here as the MMU lock will
343 * stop other threads from concurrently modifying
344 * this SPTE. Using the removed SPTE value keeps
345 * the two branches consistent and simplifies the function.
348 WRITE_ONCE(*sptep, REMOVED_SPTE);
350 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
351 old_child_spte, REMOVED_SPTE, level - 1,
355 kvm_flush_remote_tlbs_with_address(kvm, gfn,
356 KVM_PAGES_PER_HPAGE(level));
358 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
362 * handle_changed_spte - handle bookkeeping associated with an SPTE change
364 * @as_id: the address space of the paging structure the SPTE was a part of
365 * @gfn: the base GFN that was mapped by the SPTE
366 * @old_spte: The value of the SPTE before the change
367 * @new_spte: The value of the SPTE after the change
368 * @level: the level of the PT the SPTE is part of in the paging structure
369 * @shared: This operation may not be running under the exclusive use of
370 * the MMU lock and the operation must synchronize with other
371 * threads that might be modifying SPTEs.
373 * Handle bookkeeping that might result from the modification of a SPTE.
374 * This function must be called for all TDP SPTE modifications.
376 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
377 u64 old_spte, u64 new_spte, int level,
380 bool was_present = is_shadow_present_pte(old_spte);
381 bool is_present = is_shadow_present_pte(new_spte);
382 bool was_leaf = was_present && is_last_spte(old_spte, level);
383 bool is_leaf = is_present && is_last_spte(new_spte, level);
384 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
386 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
387 WARN_ON(level < PG_LEVEL_4K);
388 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
391 * If this warning were to trigger it would indicate that there was a
392 * missing MMU notifier or a race with some notifier handler.
393 * A present, leaf SPTE should never be directly replaced with another
394 * present leaf SPTE pointing to a different PFN. A notifier handler
395 * should be zapping the SPTE before the main MM's page table is
396 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
397 * thread before replacement.
399 if (was_leaf && is_leaf && pfn_changed) {
400 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
401 "SPTE with another present leaf SPTE mapping a\n"
403 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
404 as_id, gfn, old_spte, new_spte, level);
407 * Crash the host to prevent error propagation and guest data corruption.
413 if (old_spte == new_spte)
416 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
419 * The only time an SPTE should be changed from one non-present state
420 * to another is when an MMIO entry is installed, modified, or
421 * removed. In that case, there is nothing to do here.
423 if (!was_present && !is_present) {
425 * If this change does not involve a MMIO SPTE or removed SPTE,
426 * it is unexpected. Log the change, though it should not
427 * impact the guest since both the former and current SPTEs are nonpresent.
430 if (WARN_ON(!is_mmio_spte(old_spte) &&
431 !is_mmio_spte(new_spte) &&
432 !is_removed_spte(new_spte)))
433 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
434 "should not be replaced with another,\n"
435 "different nonpresent SPTE, unless one or both\n"
436 "are MMIO SPTEs, or the new SPTE is\n"
437 "a temporary removed SPTE.\n"
438 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
439 as_id, gfn, old_spte, new_spte, level);
444 if (was_leaf && is_dirty_spte(old_spte) &&
445 (!is_dirty_spte(new_spte) || pfn_changed))
446 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
449 * Recursively handle child PTs if the change removed a subtree from
450 * the paging structure.
452 if (was_present && !was_leaf && (pfn_changed || !is_present))
453 handle_removed_tdp_mmu_page(kvm,
454 spte_to_child_pt(old_spte, level), shared);
457 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
458 u64 old_spte, u64 new_spte, int level,
461 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
463 handle_changed_spte_acc_track(old_spte, new_spte, level);
464 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
469 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
470 * associated bookkeeping
473 * @iter: a tdp_iter instance currently on the SPTE that should be set
474 * @new_spte: The value the SPTE should be set to
475 * Returns: true if the SPTE was set, false if it was not. If false is returned,
476 * this function will have no side-effects.
478 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
479 struct tdp_iter *iter,
482 u64 *root_pt = tdp_iter_root_pt(iter);
483 struct kvm_mmu_page *root = sptep_to_sp(root_pt);
484 int as_id = kvm_mmu_page_as_id(root);
486 lockdep_assert_held_read(&kvm->mmu_lock);
489 * Do not change removed SPTEs. Only the thread that froze the SPTE may modify it.
492 if (iter->old_spte == REMOVED_SPTE)
495 if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
496 new_spte) != iter->old_spte)
499 handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
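/*
 * Illustrative caller pattern (sketch, not from the original source): a
 * failed compare-and-exchange means another thread changed the SPTE first,
 * so callers typically bail out and let the operation be retried, e.g. the
 * page fault path does roughly:
 *
 *	if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte))
 *		return RET_PF_RETRY;
 */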
505 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
506 struct tdp_iter *iter)
509 * Freeze the SPTE by setting it to a special,
510 * non-present value. This will stop other threads from
511 * immediately installing a present entry in its place
512 * before the TLBs are flushed.
514 if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
517 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
518 KVM_PAGES_PER_HPAGE(iter->level));
521 * No other thread can overwrite the removed SPTE as they
522 * must either wait on the MMU lock or use
523 * tdp_mmu_set_spte_atomic which will not overwrite the
524 * special removed SPTE value. No bookkeeping is needed
525 * here since the SPTE is going from non-present to non-present.
528 WRITE_ONCE(*iter->sptep, 0);
535 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
537 * @iter: a tdp_iter instance currently on the SPTE that should be set
538 * @new_spte: The value the SPTE should be set to
539 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
540 * of the page. Should be set unless handling an MMU
541 * notifier for access tracking. Leaving record_acc_track
542 * unset in that case prevents page accesses from being double counted.
544 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
545 * appropriate for the change being made. Should be set
546 * unless performing certain dirty logging operations.
547 * Leaving record_dirty_log unset in that case prevents page
548 * writes from being double counted.
550 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
551 u64 new_spte, bool record_acc_track,
552 bool record_dirty_log)
554 tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
555 struct kvm_mmu_page *root = sptep_to_sp(root_pt);
556 int as_id = kvm_mmu_page_as_id(root);
558 lockdep_assert_held_write(&kvm->mmu_lock);
561 * No thread should be using this function to set SPTEs to the
562 * temporary removed SPTE value.
563 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
564 * should be used. If operating under the MMU lock in write mode, the
565 * use of the removed SPTE should not be necessary.
567 WARN_ON(iter->old_spte == REMOVED_SPTE);
569 WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
571 __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
573 if (record_acc_track)
574 handle_changed_spte_acc_track(iter->old_spte, new_spte,
576 if (record_dirty_log)
577 handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
578 iter->old_spte, new_spte,
582 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
585 __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
588 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
589 struct tdp_iter *iter,
592 __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
595 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
596 struct tdp_iter *iter,
599 __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
602 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
603 for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
605 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
606 tdp_root_for_each_pte(_iter, _root, _start, _end) \
607 if (!is_shadow_present_pte(_iter.old_spte) || \
608 !is_last_spte(_iter.old_spte, _iter.level)) \
612 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
613 for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \
614 _mmu->shadow_root_level, _start, _end)
617 * Yield if the MMU lock is contended or this thread needs to return control to the scheduler.
620 * If this function should yield and flush is set, it will perform a remote
621 * TLB flush before yielding.
623 * If this function yields, it will also reset the tdp_iter's walk over the
624 * paging structure and the calling function should skip to the next
625 * iteration to allow the iterator to continue its traversal from the
626 * paging structure root.
628 * Return true if this function yielded and the iterator's traversal was reset.
629 * Return false if a yield was not needed.
631 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
632 struct tdp_iter *iter, bool flush)
634 /* Ensure forward progress has been made before yielding. */
635 if (iter->next_last_level_gfn == iter->yielded_gfn)
638 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
642 kvm_flush_remote_tlbs(kvm);
644 cond_resched_rwlock_write(&kvm->mmu_lock);
647 WARN_ON(iter->gfn > iter->next_last_level_gfn);
649 tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
650 iter->root_level, iter->min_level,
651 iter->next_last_level_gfn);
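/*
 * Illustrative caller pattern (sketch): long-running walks check for a
 * needed reschedule on every iteration and restart the current step when
 * the iterator has been reset, e.g.:
 *
 *	tdp_root_for_each_pte(iter, root, start, end) {
 *		if (can_yield &&
 *		    tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
 *			flush_needed = false;
 *			continue;
 *		}
 *		(process the SPTE)
 *	}
 */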
660 * Tears down the mappings for the range of gfns, [start, end), and frees the
661 * non-root pages mapping GFNs strictly within that range. Returns true if
662 * SPTEs have been cleared and a TLB flush is needed before releasing the MMU lock.
664 * If can_yield is true, will release the MMU lock and reschedule if the
665 * scheduler needs the CPU or there is contention on the MMU lock. If this
666 * function cannot yield, it will not release the MMU lock or reschedule and
667 * the caller must ensure it does not supply too large a GFN range, or the
668 * operation can cause a soft lockup.
670 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
671 gfn_t start, gfn_t end, bool can_yield)
673 struct tdp_iter iter;
674 bool flush_needed = false;
678 tdp_root_for_each_pte(iter, root, start, end) {
680 tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
681 flush_needed = false;
685 if (!is_shadow_present_pte(iter.old_spte))
689 * If this is a non-last-level SPTE that covers a larger range
690 * than should be zapped, continue, and zap the mappings at a lower level.
693 if ((iter.gfn < start ||
694 iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
695 !is_last_spte(iter.old_spte, iter.level))
698 tdp_mmu_set_spte(kvm, &iter, 0);
707 * Tears down the mappings for the range of gfns, [start, end), and frees the
708 * non-root pages mapping GFNs strictly within that range. Returns true if
709 * SPTEs have been cleared and a TLB flush is needed before releasing the MMU lock.
712 bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
714 struct kvm_mmu_page *root;
717 for_each_tdp_mmu_root_yield_safe(kvm, root)
718 flush |= zap_gfn_range(kvm, root, start, end, true);
723 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
725 gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
728 flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
730 kvm_flush_remote_tlbs(kvm);
734 * Installs a last-level SPTE to handle a TDP page fault.
735 * (NPT/EPT violation/misconfiguration)
737 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
739 struct tdp_iter *iter,
740 kvm_pfn_t pfn, bool prefault)
744 int make_spte_ret = 0;
746 if (unlikely(is_noslot_pfn(pfn)))
747 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
749 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
750 pfn, iter->old_spte, prefault, true,
751 map_writable, !shadow_accessed_mask,
754 if (new_spte == iter->old_spte)
755 ret = RET_PF_SPURIOUS;
756 else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
760 * If the page fault was caused by a write but the page is write
761 * protected, emulation is needed. If the emulation was skipped,
762 * the vCPU would have the same fault again.
764 if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
766 ret = RET_PF_EMULATE;
767 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
770 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
771 if (unlikely(is_mmio_spte(new_spte))) {
772 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
774 ret = RET_PF_EMULATE;
776 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
777 rcu_dereference(iter->sptep));
779 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
780 rcu_dereference(iter->sptep));
782 vcpu->stat.pf_fixed++;
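/*
 * Summary of the return values above (sketch, based on the generic KVM
 * RET_PF_* codes): RET_PF_SPURIOUS when the SPTE already had the desired
 * value, RET_PF_RETRY when another thread raced and changed the SPTE first,
 * RET_PF_EMULATE for write-protected or MMIO accesses, and RET_PF_FIXED
 * (the default) when the mapping was installed successfully.
 */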
788 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
789 * page tables and SPTEs to translate the faulting guest physical address.
791 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
792 int map_writable, int max_level, kvm_pfn_t pfn,
795 bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
796 bool write = error_code & PFERR_WRITE_MASK;
797 bool exec = error_code & PFERR_FETCH_MASK;
798 bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
799 struct kvm_mmu *mmu = vcpu->arch.mmu;
800 struct tdp_iter iter;
801 struct kvm_mmu_page *sp;
805 gfn_t gfn = gpa >> PAGE_SHIFT;
809 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
811 if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
814 level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
815 huge_page_disallowed, &req_level);
817 trace_kvm_mmu_spte_requested(gpa, level, pfn);
821 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
822 if (nx_huge_page_workaround_enabled)
823 disallowed_hugepage_adjust(iter.old_spte, gfn,
824 iter.level, &pfn, &level);
826 if (iter.level == level)
830 * If there is an SPTE mapping a large page at a higher level
831 * than the target, that SPTE must be cleared and replaced
832 * with a non-leaf SPTE.
834 if (is_shadow_present_pte(iter.old_spte) &&
835 is_large_pte(iter.old_spte)) {
836 if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
840 * The iter must explicitly re-read the spte here
841 * because the new value informs the !present path below.
844 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
847 if (!is_shadow_present_pte(iter.old_spte)) {
848 sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
851 new_spte = make_nonleaf_spte(child_pt,
852 !shadow_accessed_mask);
854 if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
856 tdp_mmu_link_page(vcpu->kvm, sp, true,
857 huge_page_disallowed &&
858 req_level >= iter.level);
860 trace_kvm_mmu_get_page(sp, true);
868 if (iter.level != level) {
873 ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
880 static __always_inline int
881 kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
885 int (*handler)(struct kvm *kvm,
886 struct kvm_memory_slot *slot,
887 struct kvm_mmu_page *root,
892 struct kvm_memslots *slots;
893 struct kvm_memory_slot *memslot;
894 struct kvm_mmu_page *root;
898 for_each_tdp_mmu_root_yield_safe(kvm, root) {
899 as_id = kvm_mmu_page_as_id(root);
900 slots = __kvm_memslots(kvm, as_id);
901 kvm_for_each_memslot(memslot, slots) {
902 unsigned long hva_start, hva_end;
903 gfn_t gfn_start, gfn_end;
905 hva_start = max(start, memslot->userspace_addr);
906 hva_end = min(end, memslot->userspace_addr +
907 (memslot->npages << PAGE_SHIFT));
908 if (hva_start >= hva_end)
911 * {gfn(page) | page intersects with [hva_start, hva_end)} =
912 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
914 gfn_start = hva_to_gfn_memslot(hva_start, memslot);
915 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
917 ret |= handler(kvm, memslot, root, gfn_start,
925 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
926 struct kvm_memory_slot *slot,
927 struct kvm_mmu_page *root, gfn_t start,
928 gfn_t end, unsigned long unused)
930 return zap_gfn_range(kvm, root, start, end, false);
933 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
936 return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
937 zap_gfn_range_hva_wrapper);
941 * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and return
942 * non-zero if any of the GFNs in the range have been accessed.
944 static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
945 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
946 unsigned long unused)
948 struct tdp_iter iter;
954 tdp_root_for_each_leaf_pte(iter, root, start, end) {
956 * If we have a non-accessed entry we don't need to change the SPTE.
959 if (!is_accessed_spte(iter.old_spte))
962 new_spte = iter.old_spte;
964 if (spte_ad_enabled(new_spte)) {
965 clear_bit((ffs(shadow_accessed_mask) - 1),
966 (unsigned long *)&new_spte);
969 * Capture the dirty status of the page, so that it doesn't get
970 * lost when the SPTE is marked for access tracking.
972 if (is_writable_pte(new_spte))
973 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
975 new_spte = mark_spte_for_access_track(new_spte);
977 new_spte &= ~shadow_dirty_mask;
979 tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
982 trace_kvm_age_page(iter.gfn, iter.level, slot, young);
990 int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
993 return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
997 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
998 struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
999 unsigned long unused2)
1001 struct tdp_iter iter;
1003 tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
1004 if (is_accessed_spte(iter.old_spte))
1010 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
1012 return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
1017 * Handle the changed_pte MMU notifier for the TDP MMU.
1018 * data is a pointer to the new pte_t mapping the HVA specified by the MMU notifier.
1020 * Returns non-zero if a flush is needed before releasing the MMU lock.
1022 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
1023 struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1026 struct tdp_iter iter;
1027 pte_t *ptep = (pte_t *)data;
1034 WARN_ON(pte_huge(*ptep));
1036 new_pfn = pte_pfn(*ptep);
1038 tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
1039 if (iter.level != PG_LEVEL_4K)
1042 if (!is_shadow_present_pte(iter.old_spte))
1045 tdp_mmu_set_spte(kvm, &iter, 0);
1047 kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
1049 if (!pte_write(*ptep)) {
1050 new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1051 iter.old_spte, new_pfn);
1053 tdp_mmu_set_spte(kvm, &iter, new_spte);
1060 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1067 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
1070 return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
1071 (unsigned long)host_ptep,
1076 * Remove write access from all the SPTEs mapping GFNs [start, end). Only
1077 * leaf SPTEs down to min_level will be write-protected.
1078 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1080 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1081 gfn_t start, gfn_t end, int min_level)
1083 struct tdp_iter iter;
1085 bool spte_set = false;
1089 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1091 for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1092 min_level, start, end) {
1093 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1096 if (!is_shadow_present_pte(iter.old_spte) ||
1097 !is_last_spte(iter.old_spte, iter.level) ||
1098 !(iter.old_spte & PT_WRITABLE_MASK))
1101 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1103 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1112 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1113 * only affect leaf SPTEs down to min_level.
1114 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1116 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1119 struct kvm_mmu_page *root;
1121 bool spte_set = false;
1123 for_each_tdp_mmu_root_yield_safe(kvm, root) {
1124 root_as_id = kvm_mmu_page_as_id(root);
1125 if (root_as_id != slot->as_id)
1128 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1129 slot->base_gfn + slot->npages, min_level);
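/*
 * Illustrative usage (sketch, not from the original source): per the comment
 * above, a caller is expected to flush TLBs when this returns true, e.g.:
 *
 *	if (kvm_tdp_mmu_wrprot_slot(kvm, slot, PG_LEVEL_4K))
 *		kvm_flush_remote_tlbs(kvm);
 */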
1136 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1137 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1138 * If AD bits are not enabled, this will require clearing the writable bit on
1139 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1142 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1143 gfn_t start, gfn_t end)
1145 struct tdp_iter iter;
1147 bool spte_set = false;
1151 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1152 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1155 if (spte_ad_need_write_protect(iter.old_spte)) {
1156 if (is_writable_pte(iter.old_spte))
1157 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1161 if (iter.old_spte & shadow_dirty_mask)
1162 new_spte = iter.old_spte & ~shadow_dirty_mask;
1167 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1176 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1177 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1178 * If AD bits are not enabled, this will require clearing the writable bit on
1179 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1182 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1184 struct kvm_mmu_page *root;
1186 bool spte_set = false;
1188 for_each_tdp_mmu_root_yield_safe(kvm, root) {
1189 root_as_id = kvm_mmu_page_as_id(root);
1190 if (root_as_id != slot->as_id)
1193 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1194 slot->base_gfn + slot->npages);
1201 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1202 * set in mask, starting at gfn. The given memslot is expected to contain all
1203 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1204 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1205 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1207 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1208 gfn_t gfn, unsigned long mask, bool wrprot)
1210 struct tdp_iter iter;
1215 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1216 gfn + BITS_PER_LONG) {
1220 if (iter.level > PG_LEVEL_4K ||
1221 !(mask & (1UL << (iter.gfn - gfn))))
1224 mask &= ~(1UL << (iter.gfn - gfn));
1226 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1227 if (is_writable_pte(iter.old_spte))
1228 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1232 if (iter.old_spte & shadow_dirty_mask)
1233 new_spte = iter.old_spte & ~shadow_dirty_mask;
1238 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1245 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1246 * set in mask, starting at gfn. The given memslot is expected to contain all
1247 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1248 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1249 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1251 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1252 struct kvm_memory_slot *slot,
1253 gfn_t gfn, unsigned long mask,
1256 struct kvm_mmu_page *root;
1259 lockdep_assert_held_write(&kvm->mmu_lock);
1260 for_each_tdp_mmu_root(kvm, root) {
1261 root_as_id = kvm_mmu_page_as_id(root);
1262 if (root_as_id != slot->as_id)
1265 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
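/*
 * Worked example (sketch): with gfn == G and mask == 0x5, only the 4K SPTEs
 * mapping G and G + 2 have their dirty (or writable) bit cleared; each
 * visited bit is dropped from the local copy of the mask.
 */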
1270 * Clear leaf entries which could be replaced by large mappings, for
1271 * GFNs within the slot.
1273 static void zap_collapsible_spte_range(struct kvm *kvm,
1274 struct kvm_mmu_page *root,
1275 struct kvm_memory_slot *slot)
1277 gfn_t start = slot->base_gfn;
1278 gfn_t end = start + slot->npages;
1279 struct tdp_iter iter;
1281 bool spte_set = false;
1285 tdp_root_for_each_pte(iter, root, start, end) {
1286 if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
1291 if (!is_shadow_present_pte(iter.old_spte) ||
1292 !is_last_spte(iter.old_spte, iter.level))
1295 pfn = spte_to_pfn(iter.old_spte);
1296 if (kvm_is_reserved_pfn(pfn) ||
1297 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1301 tdp_mmu_set_spte(kvm, &iter, 0);
1308 kvm_flush_remote_tlbs(kvm);
1312 * Clear non-leaf entries (and free associated page tables) which could
1313 * be replaced by large mappings, for GFNs within the slot.
1315 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1316 struct kvm_memory_slot *slot)
1318 struct kvm_mmu_page *root;
1321 for_each_tdp_mmu_root_yield_safe(kvm, root) {
1322 root_as_id = kvm_mmu_page_as_id(root);
1323 if (root_as_id != slot->as_id)
1326 zap_collapsible_spte_range(kvm, root, slot);
1331 * Removes write access on the last level SPTE mapping this GFN and unsets the
1332 * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1333 * Returns true if an SPTE was set and a TLB flush is needed.
1335 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1338 struct tdp_iter iter;
1340 bool spte_set = false;
1344 tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1345 if (!is_writable_pte(iter.old_spte))
1348 new_spte = iter.old_spte &
1349 ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
1351 tdp_mmu_set_spte(kvm, &iter, new_spte);
1361 * Removes write access on the last level SPTE mapping this GFN and unsets the
1362 * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1363 * Returns true if an SPTE was set and a TLB flush is needed.
1365 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1366 struct kvm_memory_slot *slot, gfn_t gfn)
1368 struct kvm_mmu_page *root;
1370 bool spte_set = false;
1372 lockdep_assert_held_write(&kvm->mmu_lock);
1373 for_each_tdp_mmu_root(kvm, root) {
1374 root_as_id = kvm_mmu_page_as_id(root);
1375 if (root_as_id != slot->as_id)
1378 spte_set |= write_protect_gfn(kvm, root, gfn);
1384 * Return the level of the lowest level SPTE added to sptes.
1385 * That SPTE may be non-present.
1387 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1390 struct tdp_iter iter;
1391 struct kvm_mmu *mmu = vcpu->arch.mmu;
1392 gfn_t gfn = addr >> PAGE_SHIFT;
1395 *root_level = vcpu->arch.mmu->shadow_root_level;
1399 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1401 sptes[leaf] = iter.old_spte;