1 // SPDX-License-Identifier: GPL-2.0
4 #include "mmu_internal.h"
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
13 static bool __read_mostly tdp_mmu_enabled = true;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
19 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 /* This should not be changed for the lifetime of the VM. */
23 kvm->arch.tdp_mmu_enabled = true;
25 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
32 /* Arbitrarily returns true so that this may be used in if statements. */
33 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
37 lockdep_assert_held_read(&kvm->mmu_lock);
39 lockdep_assert_held_write(&kvm->mmu_lock);
44 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
46 if (!kvm->arch.tdp_mmu_enabled)
49 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
50 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
53 * Ensure that all the outstanding RCU callbacks to free shadow pages
54 * can run before the VM is torn down.
59 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
60 gfn_t start, gfn_t end, bool can_yield, bool flush,
63 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
65 free_page((unsigned long)sp->spt);
66 kmem_cache_free(mmu_page_header_cache, sp);
70 * This is called through call_rcu in order to free TDP page table memory
71  * safely with respect to other kernel threads that may be operating on the memory.
73  * By only accessing TDP MMU page table memory in an RCU read critical
74  * section, and freeing it only after an RCU grace period has elapsed,
75  * lockless walkers will never use the memory after it has been freed.
77 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
79 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
85 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
88 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
90 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
93 WARN_ON(!root->tdp_mmu_page);
95 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
96 list_del_rcu(&root->link);
97 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
100 * A TLB flush is not necessary as KVM performs a local TLB flush when
101  * allocating a new root (see kvm_mmu_load()), and when migrating a vCPU
102 * to a different pCPU. Note, the local TLB flush on reuse also
103 * invalidates any paging-structure-cache entries, i.e. TLB entries for
104 * intermediate paging structures, that may be zapped, as such entries
105 * are associated with the ASID on both VMX and SVM.
107 (void)zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
109 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
113 * Returns the next root after @prev_root (or the first root if @prev_root is
114 * NULL). A reference to the returned root is acquired, and the reference to
115 * @prev_root is released (the caller obviously must hold a reference to
116 * @prev_root if it's non-NULL).
118 * If @only_valid is true, invalid roots are skipped.
120 * Returns NULL if the end of tdp_mmu_roots was reached.
122 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
123 struct kvm_mmu_page *prev_root,
124 bool shared, bool only_valid)
126 struct kvm_mmu_page *next_root;
131 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
133 typeof(*prev_root), link);
135 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
136 typeof(*next_root), link);
139 if ((!only_valid || !next_root->role.invalid) &&
140 kvm_tdp_mmu_get_root(next_root))
143 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
144 &next_root->link, typeof(*next_root), link);
150 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
156 * Note: this iterator gets and puts references to the roots it iterates over.
157 * This makes it safe to release the MMU lock and yield within the loop, but
158 * if exiting the loop early, the caller must drop the reference to the most
159 * recent root. (Unless keeping a live reference is desirable.)
161 * If shared is set, this function is operating under the MMU lock in read
162 * mode. In the unlikely event that this thread must free a root, the lock
163 * will be temporarily dropped and reacquired in write mode.
165 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
166 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
168 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
169 if (kvm_mmu_page_as_id(_root) != _as_id) { \
172 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
173 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
175 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
176 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false)
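/*
 * Illustrative usage sketch (not part of the original source): a caller
 * typically walks every root for an address space and lets the iterator
 * manage the root reference counts, e.g.
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
 *		flush = zap_gfn_range(kvm, root, start, end, true, flush, false);
 *
 * Breaking out of the loop early leaves the caller holding a reference to
 * the current root, which must then be dropped with kvm_tdp_mmu_put_root().
 */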
179 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
180 * the implication being that any flow that holds mmu_lock for read is
181 * inherently yield-friendly and should use the yield-safe variant above.
182 * Holding mmu_lock for write obviates the need for RCU protection as the list
183 * is guaranteed to be stable.
185 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
186 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
187 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
188 kvm_mmu_page_as_id(_root) != _as_id) { \
191 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
193 struct kvm_mmu_page *sp;
195 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
196 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
201 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, gfn_t gfn,
202 union kvm_mmu_page_role role)
204 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
208 sp->tdp_mmu_page = true;
210 trace_kvm_mmu_get_page(sp, true);
213 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
214 struct tdp_iter *iter)
216 struct kvm_mmu_page *parent_sp;
217 union kvm_mmu_page_role role;
219 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
221 role = parent_sp->role;
224 tdp_mmu_init_sp(child_sp, iter->gfn, role);
227 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
229 union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
230 struct kvm *kvm = vcpu->kvm;
231 struct kvm_mmu_page *root;
233 lockdep_assert_held_write(&kvm->mmu_lock);
236 * Check for an existing root before allocating a new one. Note, the
237 * role check prevents consuming an invalid root.
239 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
240 if (root->role.word == role.word &&
241 kvm_tdp_mmu_get_root(root))
245 root = tdp_mmu_alloc_sp(vcpu);
246 tdp_mmu_init_sp(root, 0, role);
248 refcount_set(&root->tdp_mmu_root_count, 1);
250 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
251 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
252 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
255 return __pa(root->spt);
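/*
 * Sketch of the expected caller pattern (assumed, not taken from the
 * original source): the root-allocation path holds mmu_lock for write and
 * stashes the returned HPA in the vCPU's MMU context, e.g.
 *
 *	write_lock(&vcpu->kvm->mmu_lock);
 *	vcpu->arch.mmu->root.hpa = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
 *	write_unlock(&vcpu->kvm->mmu_lock);
 */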
258 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
259 u64 old_spte, u64 new_spte, int level,
262 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
264 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
267 if (is_accessed_spte(old_spte) &&
268 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
269 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
270 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
273 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
274 u64 old_spte, u64 new_spte, int level)
277 struct kvm_memory_slot *slot;
279 if (level > PG_LEVEL_4K)
282 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
284 if ((!is_writable_pte(old_spte) || pfn_changed) &&
285 is_writable_pte(new_spte)) {
286 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
287 mark_page_dirty_in_slot(kvm, slot, gfn);
292 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
295 * @sp: the page to be removed
296 * @shared: This operation may not be running under the exclusive use of
297 * the MMU lock and the operation must synchronize with other
298 * threads that might be adding or removing pages.
300 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
304 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
306 lockdep_assert_held_write(&kvm->mmu_lock);
309 if (sp->lpage_disallowed)
310 unaccount_huge_nx_page(kvm, sp);
313 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
317 * handle_removed_pt() - handle a page table removed from the TDP structure
320 * @pt: the page removed from the paging structure
321 * @shared: This operation may not be running under the exclusive use
322 * of the MMU lock and the operation must synchronize with other
323 * threads that might be modifying SPTEs.
325 * Given a page table that has been removed from the TDP paging structure,
326 * iterates through the page table to clear SPTEs and free child page tables.
328 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
329 * protection. Since this thread removed it from the paging structure,
330 * this thread will be responsible for ensuring the page is freed. Hence the
331 * early rcu_dereferences in the function.
333 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
335 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
336 int level = sp->role.level;
337 gfn_t base_gfn = sp->gfn;
340 trace_kvm_mmu_prepare_zap_page(sp);
342 tdp_mmu_unlink_sp(kvm, sp, shared);
344 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
345 u64 *sptep = rcu_dereference(pt) + i;
346 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
351 * Set the SPTE to a nonpresent value that other
352 * threads will not overwrite. If the SPTE was
353 * already marked as removed then another thread
354 * handling a page fault could overwrite it, so
355  * keep setting the SPTE until this thread replaces some other
356  * value with the removed SPTE value.
359 old_child_spte = xchg(sptep, REMOVED_SPTE);
360 if (!is_removed_spte(old_child_spte))
366 * If the SPTE is not MMU-present, there is no backing
367 * page associated with the SPTE and so no side effects
368 * that need to be recorded, and exclusive ownership of
369 * mmu_lock ensures the SPTE can't be made present.
370 * Note, zapping MMIO SPTEs is also unnecessary as they
371 * are guarded by the memslots generation, not by being
374 old_child_spte = READ_ONCE(*sptep);
375 if (!is_shadow_present_pte(old_child_spte))
379 * Marking the SPTE as a removed SPTE is not
380 * strictly necessary here as the MMU lock will
381 * stop other threads from concurrently modifying
382 * this SPTE. Using the removed SPTE value keeps
383 * the two branches consistent and simplifies
386 WRITE_ONCE(*sptep, REMOVED_SPTE);
388 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
389 old_child_spte, REMOVED_SPTE, level,
393 kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
394 KVM_PAGES_PER_HPAGE(level + 1));
396 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
400 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
402 * @as_id: the address space of the paging structure the SPTE was a part of
403 * @gfn: the base GFN that was mapped by the SPTE
404 * @old_spte: The value of the SPTE before the change
405 * @new_spte: The value of the SPTE after the change
406 * @level: the level of the PT the SPTE is part of in the paging structure
407 * @shared: This operation may not be running under the exclusive use of
408 * the MMU lock and the operation must synchronize with other
409 * threads that might be modifying SPTEs.
411 * Handle bookkeeping that might result from the modification of a SPTE.
412 * This function must be called for all TDP SPTE modifications.
414 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
415 u64 old_spte, u64 new_spte, int level,
418 bool was_present = is_shadow_present_pte(old_spte);
419 bool is_present = is_shadow_present_pte(new_spte);
420 bool was_leaf = was_present && is_last_spte(old_spte, level);
421 bool is_leaf = is_present && is_last_spte(new_spte, level);
422 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
424 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
425 WARN_ON(level < PG_LEVEL_4K);
426 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
429 * If this warning were to trigger it would indicate that there was a
430 * missing MMU notifier or a race with some notifier handler.
431 * A present, leaf SPTE should never be directly replaced with another
432 * present leaf SPTE pointing to a different PFN. A notifier handler
433 * should be zapping the SPTE before the main MM's page table is
434 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
435 * thread before replacement.
437 if (was_leaf && is_leaf && pfn_changed) {
438 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
439 "SPTE with another present leaf SPTE mapping a\n"
441 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
442 as_id, gfn, old_spte, new_spte, level);
445  * Crash the host to prevent error propagation and guest data corruption.
451 if (old_spte == new_spte)
454 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
457 check_spte_writable_invariants(new_spte);
460 * The only times a SPTE should be changed from a non-present to
461 * non-present state is when an MMIO entry is installed/modified/
462 * removed. In that case, there is nothing to do here.
464 if (!was_present && !is_present) {
466  * If this change does not involve an MMIO SPTE or removed SPTE,
467  * it is unexpected. Log the change, though it should not
468  * impact the guest since both the former and current SPTEs are nonpresent.
471 if (WARN_ON(!is_mmio_spte(old_spte) &&
472 !is_mmio_spte(new_spte) &&
473 !is_removed_spte(new_spte)))
474 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
475 "should not be replaced with another,\n"
476 "different nonpresent SPTE, unless one or both\n"
477 "are MMIO SPTEs, or the new SPTE is\n"
478 "a temporary removed SPTE.\n"
479 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
480 as_id, gfn, old_spte, new_spte, level);
484 if (is_leaf != was_leaf)
485 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
487 if (was_leaf && is_dirty_spte(old_spte) &&
488 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
489 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
492 * Recursively handle child PTs if the change removed a subtree from
493 * the paging structure.
495 if (was_present && !was_leaf && (pfn_changed || !is_present))
496 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
499 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
500 u64 old_spte, u64 new_spte, int level,
503 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
505 handle_changed_spte_acc_track(old_spte, new_spte, level);
506 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
511 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
512 * and handle the associated bookkeeping. Do not mark the page dirty
513 * in KVM's dirty bitmaps.
515 * If setting the SPTE fails because it has changed, iter->old_spte will be
516 * refreshed to the current value of the spte.
519 * @iter: a tdp_iter instance currently on the SPTE that should be set
520 * @new_spte: The value the SPTE should be set to
522 * * 0 - If the SPTE was set.
523 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
524 * no side-effects other than setting iter->old_spte to the last
525 * known value of the spte.
527 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
528 struct tdp_iter *iter,
531 u64 *sptep = rcu_dereference(iter->sptep);
534 WARN_ON_ONCE(iter->yielded);
536 lockdep_assert_held_read(&kvm->mmu_lock);
539  * Do not change removed SPTEs. Only the thread that froze the SPTE may modify it.
542 if (is_removed_spte(iter->old_spte))
546 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
547 * does not hold the mmu_lock.
549 old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
550 if (old_spte != iter->old_spte) {
552 * The page table entry was modified by a different logical
553 * CPU. Refresh iter->old_spte with the current value so the
554 * caller operates on fresh data, e.g. if it retries
555 * tdp_mmu_set_spte_atomic().
557 iter->old_spte = old_spte;
561 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
562 new_spte, iter->level, true);
563 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
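/*
 * Illustrative sketch (not from the original source) of how callers handle
 * the -EBUSY contract above: under mmu_lock held for read, a fault handler
 * typically bails out and retries the fault rather than spinning, e.g.
 *
 *	if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte))
 *		return RET_PF_RETRY;
 *
 * iter.old_spte has already been refreshed, so a caller that does loop can
 * recompute new_spte from the current value.
 */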
568 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
569 struct tdp_iter *iter)
574 * Freeze the SPTE by setting it to a special,
575 * non-present value. This will stop other threads from
576 * immediately installing a present entry in its place
577 * before the TLBs are flushed.
579 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
583 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
584 KVM_PAGES_PER_HPAGE(iter->level));
587 * No other thread can overwrite the removed SPTE as they
588 * must either wait on the MMU lock or use
589 * tdp_mmu_set_spte_atomic which will not overwrite the
590 * special removed SPTE value. No bookkeeping is needed
591  * here since the SPTE is going from non-present to non-present.
594 WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
601 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
603 * @iter: a tdp_iter instance currently on the SPTE that should be set
604 * @new_spte: The value the SPTE should be set to
605 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
606 * of the page. Should be set unless handling an MMU
607 * notifier for access tracking. Leaving record_acc_track
608 * unset in that case prevents page accesses from being
610 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
611 * appropriate for the change being made. Should be set
612 * unless performing certain dirty logging operations.
613 * Leaving record_dirty_log unset in that case prevents page
614 * writes from being double counted.
616 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
617 u64 new_spte, bool record_acc_track,
618 bool record_dirty_log)
620 WARN_ON_ONCE(iter->yielded);
622 lockdep_assert_held_write(&kvm->mmu_lock);
625 * No thread should be using this function to set SPTEs to the
626 * temporary removed SPTE value.
627 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
628 * should be used. If operating under the MMU lock in write mode, the
629 * use of the removed SPTE should not be necessary.
631 WARN_ON(is_removed_spte(iter->old_spte));
633 WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
635 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
636 new_spte, iter->level, false);
637 if (record_acc_track)
638 handle_changed_spte_acc_track(iter->old_spte, new_spte,
640 if (record_dirty_log)
641 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
642 iter->old_spte, new_spte,
646 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
649 __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
652 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
653 struct tdp_iter *iter,
656 __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
659 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
660 struct tdp_iter *iter,
663 __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
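/*
 * Summary sketch (added for clarity, not in the original source) of which
 * write-mode wrapper a caller is expected to use:
 *
 *	tdp_mmu_set_spte(kvm, &iter, new_spte);              - common case
 *	tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); - aging / access-tracking notifiers
 *	tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); - dirty-logging paths
 *
 * All three funnel into __tdp_mmu_set_spte() and require mmu_lock to be
 * held for write.
 */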
666 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
667 for_each_tdp_pte(_iter, _root, _start, _end)
669 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
670 tdp_root_for_each_pte(_iter, _root, _start, _end) \
671 if (!is_shadow_present_pte(_iter.old_spte) || \
672 !is_last_spte(_iter.old_spte, _iter.level)) \
676 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
677 for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
680 * Yield if the MMU lock is contended or this thread needs to return control
683 * If this function should yield and flush is set, it will perform a remote
684 * TLB flush before yielding.
686 * If this function yields, iter->yielded is set and the caller must skip to
687 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
688 * over the paging structures to allow the iterator to continue its traversal
689 * from the paging structure root.
691 * Returns true if this function yielded.
693 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
694 struct tdp_iter *iter,
695 bool flush, bool shared)
697 WARN_ON(iter->yielded);
699 /* Ensure forward progress has been made before yielding. */
700 if (iter->next_last_level_gfn == iter->yielded_gfn)
703 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
707 kvm_flush_remote_tlbs(kvm);
710 cond_resched_rwlock_read(&kvm->mmu_lock);
712 cond_resched_rwlock_write(&kvm->mmu_lock);
716 WARN_ON(iter->gfn > iter->next_last_level_gfn);
718 iter->yielded = true;
721 return iter->yielded;
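/*
 * Illustrative sketch (not part of the original source) of the walk pattern
 * that combines the iterator macros with tdp_mmu_iter_cond_resched():
 *
 *	rcu_read_lock();
 *	tdp_root_for_each_leaf_pte(iter, root, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
 *			continue;
 *		... inspect iter.old_spte, update it via tdp_mmu_set_spte_atomic() ...
 *	}
 *	rcu_read_unlock();
 *
 * When the helper yields, iter.yielded is set and the next tdp_iter_next()
 * restarts the walk from the paging-structure root.
 */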
725 * Tears down the mappings for the range of gfns, [start, end), and frees the
726 * non-root pages mapping GFNs strictly within that range. Returns true if
727 * SPTEs have been cleared and a TLB flush is needed before releasing the
730 * If can_yield is true, will release the MMU lock and reschedule if the
731 * scheduler needs the CPU or there is contention on the MMU lock. If this
732 * function cannot yield, it will not release the MMU lock or reschedule and
733 * the caller must ensure it does not supply too large a GFN range, or the
734 * operation can cause a soft lockup.
736 * If shared is true, this thread holds the MMU lock in read mode and must
737 * account for the possibility that other threads are modifying the paging
738 * structures concurrently. If shared is false, this thread should hold the
739 * MMU lock in write mode.
741 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
742 gfn_t start, gfn_t end, bool can_yield, bool flush,
745 gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
746 bool zap_all = (start == 0 && end >= max_gfn_host);
747 struct tdp_iter iter;
750 * No need to try to step down in the iterator when zapping all SPTEs,
751 * zapping the top-level non-leaf SPTEs will recurse on their children.
753 int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
756 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
757 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
758 * and so KVM will never install a SPTE for such addresses.
760 end = min(end, max_gfn_host);
762 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
766 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
769 tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
774 if (!is_shadow_present_pte(iter.old_spte))
778 * If this is a non-last-level SPTE that covers a larger range
779 * than should be zapped, continue, and zap the mappings at a
780 * lower level, except when zapping all SPTEs.
784 iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
785 !is_last_spte(iter.old_spte, iter.level))
789 tdp_mmu_set_spte(kvm, &iter, 0);
791 } else if (tdp_mmu_zap_spte_atomic(kvm, &iter)) {
801 * Tears down the mappings for the range of gfns, [start, end), and frees the
802 * non-root pages mapping GFNs strictly within that range. Returns true if
803 * SPTEs have been cleared and a TLB flush is needed before releasing the
806 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
807 gfn_t end, bool can_yield, bool flush)
809 struct kvm_mmu_page *root;
811 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
812 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
818 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
823 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
824 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
827 kvm_flush_remote_tlbs(kvm);
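/*
 * Illustrative sketch (not from the original source): a memslot-scoped zap
 * follows the same pattern as the loop above, but bounds the range to the
 * slot instead of zapping everything, e.g.
 *
 *	flush = kvm_tdp_mmu_zap_gfn_range(kvm, slot->as_id, slot->base_gfn,
 *					  slot->base_gfn + slot->npages, flush);
 *	if (flush)
 *		kvm_flush_remote_tlbs(kvm);
 *
 * kvm_tdp_mmu_zap_gfn_range() is assumed here to be the can_yield == true
 * wrapper around __kvm_tdp_mmu_zap_gfn_range(); the memslot-based caller is
 * hypothetical.
 */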
830 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
831 struct kvm_mmu_page *prev_root)
833 struct kvm_mmu_page *next_root;
836 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
838 typeof(*prev_root), link);
840 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
841 typeof(*next_root), link);
843 while (next_root && !(next_root->role.invalid &&
844 refcount_read(&next_root->tdp_mmu_root_count)))
845 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
847 typeof(*next_root), link);
853 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
854 * zap" completes. Since kvm_tdp_mmu_invalidate_all_roots() has acquired a
855 * reference to each invalidated root, roots will not be freed until after this
856 * function drops the gifted reference, e.g. so that vCPUs don't get stuck with
857 * tearing down paging structures.
859 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
861 struct kvm_mmu_page *next_root;
862 struct kvm_mmu_page *root;
864 lockdep_assert_held_read(&kvm->mmu_lock);
868 root = next_invalidated_root(kvm, NULL);
871 next_root = next_invalidated_root(kvm, root);
876 * A TLB flush is unnecessary, invalidated roots are guaranteed
877 * to be unreachable by the guest (see kvm_tdp_mmu_put_root()
878 * for more details), and unlike the legacy MMU, no vCPU kick
879 * is needed to play nice with lockless shadow walks as the TDP
880 * MMU protects its paging structures via RCU. Note, zapping
881 * will still flush on yield, but that's a minor performance
882 * blip and not a functional issue.
884 (void)zap_gfn_range(kvm, root, 0, -1ull, true, false, true);
887 * Put the reference acquired in
888  * kvm_tdp_mmu_invalidate_all_roots().
890 kvm_tdp_mmu_put_root(kvm, root, true);
901 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
902 * is about to be zapped, e.g. in response to a memslots update. The caller is
903 * responsible for invoking kvm_tdp_mmu_zap_invalidated_roots() to do the actual
906 * Take a reference on all roots to prevent the root from being freed before it
907 * is zapped by this thread. Freeing a root is not a correctness issue, but if
908 * a vCPU drops the last reference to a root prior to the root being zapped, it
909 * will get stuck with tearing down the entire paging structure.
911 * Get a reference even if the root is already invalid,
912 * kvm_tdp_mmu_zap_invalidated_roots() assumes it was gifted a reference to all
913 * invalid roots, e.g. there's no epoch to identify roots that were invalidated
914 * by a previous call. Roots stay on the list until the last reference is
915 * dropped, so even though all invalid roots are zapped, a root may not go away
916 * for quite some time, e.g. if a vCPU blocks across multiple memslot updates.
918 * Because mmu_lock is held for write, it should be impossible to observe a
919 * root with zero refcount, i.e. the list of roots cannot be stale.
921 * This has essentially the same effect for the TDP MMU
922 * as updating mmu_valid_gen does for the shadow MMU.
924 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
926 struct kvm_mmu_page *root;
928 lockdep_assert_held_write(&kvm->mmu_lock);
929 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
930 if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
931 root->role.invalid = true;
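/*
 * Sketch of the intended "fast zap" sequence (inferred from the comments
 * above; the actual caller lives outside this file):
 *
 *	write_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_invalidate_all_roots(kvm);
 *	write_unlock(&kvm->mmu_lock);
 *
 *	read_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_zap_invalidated_roots(kvm);
 *	read_unlock(&kvm->mmu_lock);
 */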
936 * Installs a last-level SPTE to handle a TDP page fault.
937 * (NPT/EPT violation/misconfiguration)
939 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
940 struct kvm_page_fault *fault,
941 struct tdp_iter *iter)
943 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
945 int ret = RET_PF_FIXED;
948 WARN_ON(sp->role.level != fault->goal_level);
949 if (unlikely(!fault->slot))
950 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
952 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
953 fault->pfn, iter->old_spte, fault->prefetch, true,
954 fault->map_writable, &new_spte);
956 if (new_spte == iter->old_spte)
957 ret = RET_PF_SPURIOUS;
958 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
962 * If the page fault was caused by a write but the page is write
963 * protected, emulation is needed. If the emulation was skipped,
964 * the vCPU would have the same fault again.
968 ret = RET_PF_EMULATE;
971	/* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
972 if (unlikely(is_mmio_spte(new_spte))) {
973 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
975 ret = RET_PF_EMULATE;
977 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
978 rcu_dereference(iter->sptep));
982 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
983 * consistent with legacy MMU behavior.
985 if (ret != RET_PF_SPURIOUS)
986 vcpu->stat.pf_fixed++;
992 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
993 * provided page table.
996 * @iter: a tdp_iter instance currently on the SPTE that should be set
997 * @sp: The new TDP page table to install.
998 * @account_nx: True if this page table is being installed to split a
999 * non-executable huge page.
1000 * @shared: This operation is running under the MMU lock in read mode.
1002 * Returns: 0 if the new page table was installed. Non-0 if the page table
1003 * could not be installed (e.g. the atomic compare-exchange failed).
1005 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1006 struct kvm_mmu_page *sp, bool account_nx,
1009 u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
1013 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1017 tdp_mmu_set_spte(kvm, iter, spte);
1020 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1021 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1023 account_huge_nx_page(kvm, sp);
1024 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1030 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1031 * page tables and SPTEs to translate the faulting guest physical address.
1033 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1035 struct kvm_mmu *mmu = vcpu->arch.mmu;
1036 struct tdp_iter iter;
1037 struct kvm_mmu_page *sp;
1040 kvm_mmu_hugepage_adjust(vcpu, fault);
1042 trace_kvm_mmu_spte_requested(fault);
1046 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1047 if (fault->nx_huge_page_workaround_enabled)
1048 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1050 if (iter.level == fault->goal_level)
1054 * If there is an SPTE mapping a large page at a higher level
1055 * than the target, that SPTE must be cleared and replaced
1056 * with a non-leaf SPTE.
1058 if (is_shadow_present_pte(iter.old_spte) &&
1059 is_large_pte(iter.old_spte)) {
1060 if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1064 * The iter must explicitly re-read the spte here
1065  * because the new value informs the !present path below.
1068 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1071 if (!is_shadow_present_pte(iter.old_spte)) {
1072 bool account_nx = fault->huge_page_disallowed &&
1073 fault->req_level >= iter.level;
1076 * If SPTE has been frozen by another thread, just
1077 * give up and retry, avoiding unnecessary page table
1078 * allocation and free.
1080 if (is_removed_spte(iter.old_spte))
1083 sp = tdp_mmu_alloc_sp(vcpu);
1084 tdp_mmu_init_child_sp(sp, &iter);
1086 if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
1087 tdp_mmu_free_sp(sp);
1093 if (iter.level != fault->goal_level) {
1095 return RET_PF_RETRY;
1098 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1104 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1107 return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
1108 range->end, range->may_block, flush);
1111 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1112 struct kvm_gfn_range *range);
1114 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1115 struct kvm_gfn_range *range,
1116 tdp_handler_t handler)
1118 struct kvm_mmu_page *root;
1119 struct tdp_iter iter;
1125 * Don't support rescheduling, none of the MMU notifiers that funnel
1126 * into this helper allow blocking; it'd be dead, wasteful code.
1128 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1129 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1130 ret |= handler(kvm, &iter, range);
1139  * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return non-zero
1140 * if any of the GFNs in the range have been accessed.
1142 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1143 struct kvm_gfn_range *range)
1147 /* If we have a non-accessed entry we don't need to change the pte. */
1148 if (!is_accessed_spte(iter->old_spte))
1151 new_spte = iter->old_spte;
1153 if (spte_ad_enabled(new_spte)) {
1154 new_spte &= ~shadow_accessed_mask;
1157 * Capture the dirty status of the page, so that it doesn't get
1158 * lost when the SPTE is marked for access tracking.
1160 if (is_writable_pte(new_spte))
1161 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1163 new_spte = mark_spte_for_access_track(new_spte);
1166 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1171 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1173 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1176 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1177 struct kvm_gfn_range *range)
1179 return is_accessed_spte(iter->old_spte);
1182 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1184 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1187 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1188 struct kvm_gfn_range *range)
1192 /* Huge pages aren't expected to be modified without first being zapped. */
1193 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1195 if (iter->level != PG_LEVEL_4K ||
1196 !is_shadow_present_pte(iter->old_spte))
1200 * Note, when changing a read-only SPTE, it's not strictly necessary to
1201 * zero the SPTE before setting the new PFN, but doing so preserves the
1202  * invariant that the PFN of a present leaf SPTE can never change.
1203 * See __handle_changed_spte().
1205 tdp_mmu_set_spte(kvm, iter, 0);
1207 if (!pte_write(range->pte)) {
1208 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1209 pte_pfn(range->pte));
1211 tdp_mmu_set_spte(kvm, iter, new_spte);
1218 * Handle the changed_pte MMU notifier for the TDP MMU.
1219  * data is a pointer to the new pte_t mapping the HVA specified by the MMU notifier.
1221 * Returns non-zero if a flush is needed before releasing the MMU lock.
1223 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1225 bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1227 /* FIXME: return 'flush' instead of flushing here. */
1229 kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1235 * Remove write access from all SPTEs at or above min_level that map GFNs
1236 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1239 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1240 gfn_t start, gfn_t end, int min_level)
1242 struct tdp_iter iter;
1244 bool spte_set = false;
1248 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1250 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1252 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1255 if (!is_shadow_present_pte(iter.old_spte) ||
1256 !is_last_spte(iter.old_spte, iter.level) ||
1257 !(iter.old_spte & PT_WRITABLE_MASK))
1260 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1262 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1273 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1274 * only affect leaf SPTEs down to min_level.
1275 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1277 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1278 const struct kvm_memory_slot *slot, int min_level)
1280 struct kvm_mmu_page *root;
1281 bool spte_set = false;
1283 lockdep_assert_held_read(&kvm->mmu_lock);
1285 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1286 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1287 slot->base_gfn + slot->npages, min_level);
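/*
 * Illustrative caller sketch (not part of the original source): when dirty
 * logging is enabled on a memslot, the arch code write-protects the slot
 * under mmu_lock held for read and only flushes if something changed, e.g.
 *
 *	read_lock(&kvm->mmu_lock);
 *	flush = kvm_tdp_mmu_wrprot_slot(kvm, slot, PG_LEVEL_4K);
 *	read_unlock(&kvm->mmu_lock);
 *	if (flush)
 *		kvm_flush_remote_tlbs_with_address(kvm, slot->base_gfn, slot->npages);
 *
 * The flush helper used by the real caller may differ.
 */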
1292 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1294 struct kvm_mmu_page *sp;
1298 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1302 sp->spt = (void *)__get_free_page(gfp);
1304 kmem_cache_free(mmu_page_header_cache, sp);
1311 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1312 struct tdp_iter *iter,
1315 struct kvm_mmu_page *sp;
1318 * Since we are allocating while under the MMU lock we have to be
1319 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1320 * reclaim and to avoid making any filesystem callbacks (which can end
1321 * up invoking KVM MMU notifiers, resulting in a deadlock).
1323  * If this allocation fails we drop the lock and retry with reclaim allowed.
1326 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1333 read_unlock(&kvm->mmu_lock);
1335 write_unlock(&kvm->mmu_lock);
1337 iter->yielded = true;
1338 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1341 read_lock(&kvm->mmu_lock);
1343 write_lock(&kvm->mmu_lock);
1350 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1351 struct kvm_mmu_page *sp, bool shared)
1353 const u64 huge_spte = iter->old_spte;
1354 const int level = iter->level;
1357 tdp_mmu_init_child_sp(sp, iter);
1360 * No need for atomics when writing to sp->spt since the page table has
1361 * not been linked in yet and thus is not reachable from any other CPU.
1363 for (i = 0; i < PT64_ENT_PER_PAGE; i++)
1364 sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
1367 * Replace the huge spte with a pointer to the populated lower level
1368 * page table. Since we are making this change without a TLB flush vCPUs
1369 * will see a mix of the split mappings and the original huge mapping,
1370 * depending on what's currently in their TLB. This is fine from a
1371  * correctness standpoint since the translation will be the same either way.
1374 ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
1379  * tdp_mmu_link_sp() will handle subtracting the huge page we
1380 * are overwriting from the page stats. But we have to manually update
1381 * the page stats with the new present child pages.
1383 kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
1386 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1390 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1391 struct kvm_mmu_page *root,
1392 gfn_t start, gfn_t end,
1393 int target_level, bool shared)
1395 struct kvm_mmu_page *sp = NULL;
1396 struct tdp_iter iter;
1402 * Traverse the page table splitting all huge pages above the target
1403 * level into one lower level. For example, if we encounter a 1GB page
1404 * we split it into 512 2MB pages.
1406 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1407 * to visit an SPTE before ever visiting its children, which means we
1408 * will correctly recursively split huge pages that are more than one
1409 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1410 * and then splitting each of those to 512 4KB pages).
1412 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1414 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1417 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1421 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1424 trace_kvm_mmu_split_huge_page(iter.gfn,
1434 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1443 * It's possible to exit the loop having never used the last sp if, for
1444 * example, a vCPU doing HugePage NX splitting wins the race and
1445 * installs its own sp in place of the last sp we tried to split.
1448 tdp_mmu_free_sp(sp);
1455 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1457 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1458 const struct kvm_memory_slot *slot,
1459 gfn_t start, gfn_t end,
1460 int target_level, bool shared)
1462 struct kvm_mmu_page *root;
1465 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1467 for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1468 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1470 kvm_tdp_mmu_put_root(kvm, root, shared);
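/*
 * Illustrative sketch (not from the original source): eager page splitting
 * for dirty logging would invoke this with a 4K target level, e.g.
 *
 *	read_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_try_split_huge_pages(kvm, slot, slot->base_gfn,
 *					 slot->base_gfn + slot->npages,
 *					 PG_LEVEL_4K, true);
 *	read_unlock(&kvm->mmu_lock);
 *
 * shared == true tells the helper that mmu_lock is held for read, so it may
 * drop and re-take the lock in read mode if it must allocate with reclaim.
 */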
1477 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1478 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1479 * If AD bits are not enabled, this will require clearing the writable bit on
1480 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1483 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1484 gfn_t start, gfn_t end)
1486 struct tdp_iter iter;
1488 bool spte_set = false;
1492 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1494 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1497 if (!is_shadow_present_pte(iter.old_spte))
1500 if (spte_ad_need_write_protect(iter.old_spte)) {
1501 if (is_writable_pte(iter.old_spte))
1502 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1506 if (iter.old_spte & shadow_dirty_mask)
1507 new_spte = iter.old_spte & ~shadow_dirty_mask;
1512 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1523 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1524 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1525 * If AD bits are not enabled, this will require clearing the writable bit on
1526 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1529 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1530 const struct kvm_memory_slot *slot)
1532 struct kvm_mmu_page *root;
1533 bool spte_set = false;
1535 lockdep_assert_held_read(&kvm->mmu_lock);
1537 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1538 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1539 slot->base_gfn + slot->npages);
1545 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1546 * set in mask, starting at gfn. The given memslot is expected to contain all
1547 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1548 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1549 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1551 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1552 gfn_t gfn, unsigned long mask, bool wrprot)
1554 struct tdp_iter iter;
1559 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1560 gfn + BITS_PER_LONG) {
1564 if (iter.level > PG_LEVEL_4K ||
1565 !(mask & (1UL << (iter.gfn - gfn))))
1568 mask &= ~(1UL << (iter.gfn - gfn));
1570 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1571 if (is_writable_pte(iter.old_spte))
1572 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1576 if (iter.old_spte & shadow_dirty_mask)
1577 new_spte = iter.old_spte & ~shadow_dirty_mask;
1582 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1589 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1590 * set in mask, starting at gfn. The given memslot is expected to contain all
1591 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1592 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1593 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1595 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1596 struct kvm_memory_slot *slot,
1597 gfn_t gfn, unsigned long mask,
1600 struct kvm_mmu_page *root;
1602 lockdep_assert_held_write(&kvm->mmu_lock);
1603 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1604 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
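/*
 * Illustrative sketch (not part of the original source): the dirty-log
 * clearing path calls this once per 64-GFN mask word while holding mmu_lock
 * for write, roughly:
 *
 *	write_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, gfn, mask, wrprot);
 *	write_unlock(&kvm->mmu_lock);
 *
 * wrprot == true forces write protection of the SPTEs instead of clearing
 * their dirty bits.
 */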
1608 * Clear leaf entries which could be replaced by large mappings, for
1609 * GFNs within the slot.
1611 static void zap_collapsible_spte_range(struct kvm *kvm,
1612 struct kvm_mmu_page *root,
1613 const struct kvm_memory_slot *slot)
1615 gfn_t start = slot->base_gfn;
1616 gfn_t end = start + slot->npages;
1617 struct tdp_iter iter;
1622 tdp_root_for_each_pte(iter, root, start, end) {
1624 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1627 if (!is_shadow_present_pte(iter.old_spte) ||
1628 !is_last_spte(iter.old_spte, iter.level))
1631 pfn = spte_to_pfn(iter.old_spte);
1632 if (kvm_is_reserved_pfn(pfn) ||
1633 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1637 /* Note, a successful atomic zap also does a remote TLB flush. */
1638 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1646 * Clear non-leaf entries (and free associated page tables) which could
1647 * be replaced by large mappings, for GFNs within the slot.
1649 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1650 const struct kvm_memory_slot *slot)
1652 struct kvm_mmu_page *root;
1654 lockdep_assert_held_read(&kvm->mmu_lock);
1656 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1657 zap_collapsible_spte_range(kvm, root, slot);
1661 * Removes write access on the last level SPTE mapping this GFN and unsets the
1662 * MMU-writable bit to ensure future writes continue to be intercepted.
1663 * Returns true if an SPTE was set and a TLB flush is needed.
1665 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1666 gfn_t gfn, int min_level)
1668 struct tdp_iter iter;
1670 bool spte_set = false;
1672 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1676 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1677 if (!is_shadow_present_pte(iter.old_spte) ||
1678 !is_last_spte(iter.old_spte, iter.level))
1681 new_spte = iter.old_spte &
1682 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1684 if (new_spte == iter.old_spte)
1687 tdp_mmu_set_spte(kvm, &iter, new_spte);
1697 * Removes write access on the last level SPTE mapping this GFN and unsets the
1698 * MMU-writable bit to ensure future writes continue to be intercepted.
1699 * Returns true if an SPTE was set and a TLB flush is needed.
1701 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1702 struct kvm_memory_slot *slot, gfn_t gfn,
1705 struct kvm_mmu_page *root;
1706 bool spte_set = false;
1708 lockdep_assert_held_write(&kvm->mmu_lock);
1709 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1710 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1716 * Return the level of the lowest level SPTE added to sptes.
1717 * That SPTE may be non-present.
1719 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1721 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1724 struct tdp_iter iter;
1725 struct kvm_mmu *mmu = vcpu->arch.mmu;
1726 gfn_t gfn = addr >> PAGE_SHIFT;
1729 *root_level = vcpu->arch.mmu->shadow_root_level;
1731 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1733 sptes[leaf] = iter.old_spte;
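/*
 * Illustrative sketch (not from the original source) of the lockless walk
 * contract described above:
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
 *	int root_level, leaf;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 *
 * The begin/end helpers are assumed to bracket an RCU read-side critical
 * section, per the comment above.
 */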
1740 * Returns the last level spte pointer of the shadow page walk for the given
1741  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1742 * walk could be performed, returns NULL and *spte does not contain valid data.
1745 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1746 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1748 * WARNING: This function is only intended to be called during fast_page_fault.
1750 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1753 struct tdp_iter iter;
1754 struct kvm_mmu *mmu = vcpu->arch.mmu;
1755 gfn_t gfn = addr >> PAGE_SHIFT;
1756 tdp_ptep_t sptep = NULL;
1758 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1759 *spte = iter.old_spte;
1764 * Perform the rcu_dereference to get the raw spte pointer value since
1765 * we are passing it up to fast_page_fault, which is shared with the
1766  * legacy MMU and thus does not retain the TDP MMU-specific __rcu annotation.
1769 * This is safe since fast_page_fault obeys the contracts of this
1770 * function as well as all TDP MMU contracts around modifying SPTEs
1771 * outside of mmu_lock.
1773 return rcu_dereference(sptep);