1 // SPDX-License-Identifier: GPL-2.0
4 #include "mmu_internal.h"
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
13 static bool __read_mostly tdp_mmu_enabled = true;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
19 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 /* This should not be changed for the lifetime of the VM. */
23 kvm->arch.tdp_mmu_enabled = true;
25 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 kvm->arch.tdp_mmu_zap_wq =
29 alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
34 /* Arbitrarily returns true so that this may be used in if statements. */
35 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
39 lockdep_assert_held_read(&kvm->mmu_lock);
41 lockdep_assert_held_write(&kvm->mmu_lock);
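/*
 * Tear down the VM's TDP MMU state: flush and destroy the zap workqueue and
 * wait for any outstanding work and RCU callbacks before the VM is freed.
 */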
46 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
48 if (!kvm->arch.tdp_mmu_enabled)
51 flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
52 destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
54 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
55 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
58 * Ensure that all the outstanding RCU callbacks to free shadow pages
59 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
60 * can call kvm_tdp_mmu_put_root and create new callbacks.
65 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
67 free_page((unsigned long)sp->spt);
68 kmem_cache_free(mmu_page_header_cache, sp);
72 * This is called through call_rcu in order to free TDP page table memory
* safely with respect to other kernel threads that may be operating on
* the same page tables.
75 * By only accessing TDP MMU page table memory in an RCU read critical
76 * section, and freeing it after a grace period, lockless access to that
77 * memory won't use it after it is freed.
79 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
81 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
87 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
90 static void tdp_mmu_zap_root_work(struct work_struct *work)
92 struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
94 struct kvm *kvm = root->tdp_mmu_async_data;
96 read_lock(&kvm->mmu_lock);
99 * A TLB flush is not necessary as KVM performs a local TLB flush when
100 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
101 * to a different pCPU. Note, the local TLB flush on reuse also
102 * invalidates any paging-structure-cache entries, i.e. TLB entries for
103 * intermediate paging structures, that may be zapped, as such entries
104 * are associated with the ASID on both VMX and SVM.
106 tdp_mmu_zap_root(kvm, root, true);
109 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
110 * avoiding an infinite loop. By design, the root is reachable while
111 * it's being asynchronously zapped, thus a different task can put its
112 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
113 * asynchronously zapped root is unavoidable.
115 kvm_tdp_mmu_put_root(kvm, root, true);
117 read_unlock(&kvm->mmu_lock);
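/*
 * Schedule asynchronous zapping of the root on the VM's zap workqueue. The
 * worker, tdp_mmu_zap_root_work(), drops the reference that was gifted to it
 * when the root was invalidated.
 */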
120 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
122 root->tdp_mmu_async_data = kvm;
123 INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
124 queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
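/*
 * Drop a reference to the root. When the last reference is put, the root is
 * unlinked from tdp_mmu_roots, zapped, and freed after an RCU grace period.
 */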
127 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
130 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
132 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
135 WARN_ON(!root->tdp_mmu_page);
137 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
138 list_del_rcu(&root->link);
139 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
142 * A TLB flush is not necessary as KVM performs a local TLB flush when
143 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
144 * to a different pCPU. Note, the local TLB flush on reuse also
145 * invalidates any paging-structure-cache entries, i.e. TLB entries for
146 * intermediate paging structures, that may be zapped, as such entries
147 * are associated with the ASID on both VMX and SVM.
149 tdp_mmu_zap_root(kvm, root, shared);
151 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
155 * Returns the next root after @prev_root (or the first root if @prev_root is
156 * NULL). A reference to the returned root is acquired, and the reference to
157 * @prev_root is released (the caller obviously must hold a reference to
158 * @prev_root if it's non-NULL).
160 * If @only_valid is true, invalid roots are skipped.
162 * Returns NULL if the end of tdp_mmu_roots was reached.
164 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
165 struct kvm_mmu_page *prev_root,
166 bool shared, bool only_valid)
168 struct kvm_mmu_page *next_root;
173 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
175 typeof(*prev_root), link);
177 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
178 typeof(*next_root), link);
181 if ((!only_valid || !next_root->role.invalid) &&
182 kvm_tdp_mmu_get_root(next_root))
185 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
186 &next_root->link, typeof(*next_root), link);
192 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
198 * Note: this iterator gets and puts references to the roots it iterates over.
199 * This makes it safe to release the MMU lock and yield within the loop, but
200 * if exiting the loop early, the caller must drop the reference to the most
201 * recent root. (Unless keeping a live reference is desirable.)
203 * If shared is set, this function is operating under the MMU lock in read
204 * mode. In the unlikely event that this thread must free a root, the lock
205 * will be temporarily dropped and reacquired in write mode.
207 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
208 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
210 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
211 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
212 kvm_mmu_page_as_id(_root) != _as_id) { \
215 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
216 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
218 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
219 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
222 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
223 * the implication being that any flow that holds mmu_lock for read is
224 * inherently yield-friendly and should use the yield-safe variant above.
225 * Holding mmu_lock for write obviates the need for RCU protection as the list
226 * is guaranteed to be stable.
228 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
229 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
230 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
231 kvm_mmu_page_as_id(_root) != _as_id) { \
234 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
236 struct kvm_mmu_page *sp;
238 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
239 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
244 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
245 gfn_t gfn, union kvm_mmu_page_role role)
247 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
252 sp->tdp_mmu_page = true;
254 trace_kvm_mmu_get_page(sp, true);
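/*
 * Initialize a child shadow page, deriving its role from the parent shadow
 * page that owns iter->sptep, one level lower in the paging structure.
 */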
257 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
258 struct tdp_iter *iter)
260 struct kvm_mmu_page *parent_sp;
261 union kvm_mmu_page_role role;
263 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
265 role = parent_sp->role;
268 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
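/*
 * Return the physical address of a TDP MMU root matching the vCPU's current
 * MMU role, reusing an existing valid root if one exists and allocating and
 * linking a new one otherwise.
 */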
271 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
273 union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
274 struct kvm *kvm = vcpu->kvm;
275 struct kvm_mmu_page *root;
277 lockdep_assert_held_write(&kvm->mmu_lock);
280 * Check for an existing root before allocating a new one. Note, the
281 * role check prevents consuming an invalid root.
283 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
284 if (root->role.word == role.word &&
285 kvm_tdp_mmu_get_root(root))
289 root = tdp_mmu_alloc_sp(vcpu);
290 tdp_mmu_init_sp(root, NULL, 0, role);
292 refcount_set(&root->tdp_mmu_root_count, 1);
294 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
295 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
296 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
299 return __pa(root->spt);
302 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
303 u64 old_spte, u64 new_spte, int level,
306 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
308 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
311 if (is_accessed_spte(old_spte) &&
312 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
313 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
314 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
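/*
 * Propagate dirty-logging state: if a 4K SPTE becomes writable (or becomes
 * writable with a new PFN), mark the corresponding GFN dirty in its memslot.
 */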
317 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
318 u64 old_spte, u64 new_spte, int level)
321 struct kvm_memory_slot *slot;
323 if (level > PG_LEVEL_4K)
326 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
328 if ((!is_writable_pte(old_spte) || pfn_changed) &&
329 is_writable_pte(new_spte)) {
330 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
331 mark_page_dirty_in_slot(kvm, slot, gfn);
336 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
339 * @sp: the page to be removed
340 * @shared: This operation may not be running under the exclusive use of
341 * the MMU lock and the operation must synchronize with other
342 * threads that might be adding or removing pages.
344 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
348 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
350 lockdep_assert_held_write(&kvm->mmu_lock);
353 if (sp->lpage_disallowed)
354 unaccount_huge_nx_page(kvm, sp);
357 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
361 * handle_removed_pt() - handle a page table removed from the TDP structure
364 * @pt: the page removed from the paging structure
365 * @shared: This operation may not be running under the exclusive use
366 * of the MMU lock and the operation must synchronize with other
367 * threads that might be modifying SPTEs.
369 * Given a page table that has been removed from the TDP paging structure,
370 * iterates through the page table to clear SPTEs and free child page tables.
372 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
373 * protection. Since this thread removed it from the paging structure,
374 * this thread will be responsible for ensuring the page is freed. Hence the
375 * early rcu_dereferences in the function.
377 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
379 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
380 int level = sp->role.level;
381 gfn_t base_gfn = sp->gfn;
384 trace_kvm_mmu_prepare_zap_page(sp);
386 tdp_mmu_unlink_sp(kvm, sp, shared);
388 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
389 u64 *sptep = rcu_dereference(pt) + i;
390 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
395 * Set the SPTE to a nonpresent value that other
396 * threads will not overwrite. If the SPTE was
397 * already marked as removed then another thread
* handling a page fault could overwrite it, so
* retry the exchange until the value read back
* is something other than the removed SPTE value.
403 old_child_spte = xchg(sptep, REMOVED_SPTE);
404 if (!is_removed_spte(old_child_spte))
410 * If the SPTE is not MMU-present, there is no backing
411 * page associated with the SPTE and so no side effects
412 * that need to be recorded, and exclusive ownership of
413 * mmu_lock ensures the SPTE can't be made present.
414 * Note, zapping MMIO SPTEs is also unnecessary as they
415 * are guarded by the memslots generation, not by being
418 old_child_spte = READ_ONCE(*sptep);
419 if (!is_shadow_present_pte(old_child_spte))
423 * Marking the SPTE as a removed SPTE is not
424 * strictly necessary here as the MMU lock will
425 * stop other threads from concurrently modifying
426 * this SPTE. Using the removed SPTE value keeps
* the two branches consistent and simplifies the function.
430 WRITE_ONCE(*sptep, REMOVED_SPTE);
432 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
433 old_child_spte, REMOVED_SPTE, level,
437 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
441 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
443 * @as_id: the address space of the paging structure the SPTE was a part of
444 * @gfn: the base GFN that was mapped by the SPTE
445 * @old_spte: The value of the SPTE before the change
446 * @new_spte: The value of the SPTE after the change
447 * @level: the level of the PT the SPTE is part of in the paging structure
448 * @shared: This operation may not be running under the exclusive use of
449 * the MMU lock and the operation must synchronize with other
450 * threads that might be modifying SPTEs.
452 * Handle bookkeeping that might result from the modification of a SPTE.
453 * This function must be called for all TDP SPTE modifications.
455 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
456 u64 old_spte, u64 new_spte, int level,
459 bool was_present = is_shadow_present_pte(old_spte);
460 bool is_present = is_shadow_present_pte(new_spte);
461 bool was_leaf = was_present && is_last_spte(old_spte, level);
462 bool is_leaf = is_present && is_last_spte(new_spte, level);
463 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
465 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
466 WARN_ON(level < PG_LEVEL_4K);
467 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
470 * If this warning were to trigger it would indicate that there was a
471 * missing MMU notifier or a race with some notifier handler.
472 * A present, leaf SPTE should never be directly replaced with another
473 * present leaf SPTE pointing to a different PFN. A notifier handler
474 * should be zapping the SPTE before the main MM's page table is
475 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
476 * thread before replacement.
478 if (was_leaf && is_leaf && pfn_changed) {
479 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
480 "SPTE with another present leaf SPTE mapping a\n"
482 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
483 as_id, gfn, old_spte, new_spte, level);
* Crash the host to prevent error propagation and guest data corruption.
492 if (old_spte == new_spte)
495 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
498 check_spte_writable_invariants(new_spte);
501 * The only times a SPTE should be changed from a non-present to
502 * non-present state is when an MMIO entry is installed/modified/
503 * removed. In that case, there is nothing to do here.
505 if (!was_present && !is_present) {
507 * If this change does not involve a MMIO SPTE or removed SPTE,
508 * it is unexpected. Log the change, though it should not
* impact the guest since both the former and current SPTEs are nonpresent.
512 if (WARN_ON(!is_mmio_spte(old_spte) &&
513 !is_mmio_spte(new_spte) &&
514 !is_removed_spte(new_spte)))
515 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
516 "should not be replaced with another,\n"
517 "different nonpresent SPTE, unless one or both\n"
518 "are MMIO SPTEs, or the new SPTE is\n"
519 "a temporary removed SPTE.\n"
520 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
521 as_id, gfn, old_spte, new_spte, level);
525 if (is_leaf != was_leaf)
526 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
528 if (was_leaf && is_dirty_spte(old_spte) &&
529 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
530 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
533 * Recursively handle child PTs if the change removed a subtree from
534 * the paging structure. Note the WARN on the PFN changing without the
535 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
536 * pages are kernel allocations and should never be migrated.
538 if (was_present && !was_leaf &&
539 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
540 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
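/*
 * Wrapper around __handle_changed_spte() that also performs the accessed and
 * dirty bookkeeping for the old and new SPTE values.
 */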
543 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
544 u64 old_spte, u64 new_spte, int level,
547 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
549 handle_changed_spte_acc_track(old_spte, new_spte, level);
550 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
555 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
556 * and handle the associated bookkeeping. Do not mark the page dirty
557 * in KVM's dirty bitmaps.
559 * If setting the SPTE fails because it has changed, iter->old_spte will be
560 * refreshed to the current value of the spte.
563 * @iter: a tdp_iter instance currently on the SPTE that should be set
564 * @new_spte: The value the SPTE should be set to
566 * * 0 - If the SPTE was set.
567 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
568 * no side-effects other than setting iter->old_spte to the last
569 * known value of the spte.
571 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
572 struct tdp_iter *iter,
575 u64 *sptep = rcu_dereference(iter->sptep);
578 WARN_ON_ONCE(iter->yielded);
580 lockdep_assert_held_read(&kvm->mmu_lock);
* Do not change removed SPTEs. Only the thread that froze the SPTE can modify it.
586 if (is_removed_spte(iter->old_spte))
590 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
591 * does not hold the mmu_lock.
593 old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
594 if (old_spte != iter->old_spte) {
596 * The page table entry was modified by a different logical
597 * CPU. Refresh iter->old_spte with the current value so the
598 * caller operates on fresh data, e.g. if it retries
599 * tdp_mmu_set_spte_atomic().
601 iter->old_spte = old_spte;
605 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
606 new_spte, iter->level, true);
607 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
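/*
 * Zap a SPTE while holding mmu_lock for read: freeze it with REMOVED_SPTE,
 * flush remote TLBs, then clear it. Returns 0 on success or -EBUSY if the
 * SPTE changed underneath the caller.
 */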
612 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
613 struct tdp_iter *iter)
618 * Freeze the SPTE by setting it to a special,
619 * non-present value. This will stop other threads from
620 * immediately installing a present entry in its place
621 * before the TLBs are flushed.
623 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
627 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
628 KVM_PAGES_PER_HPAGE(iter->level));
631 * No other thread can overwrite the removed SPTE as they
632 * must either wait on the MMU lock or use
633 * tdp_mmu_set_spte_atomic which will not overwrite the
634 * special removed SPTE value. No bookkeeping is needed
* here since the SPTE is going from non-present to non-present.
638 kvm_tdp_mmu_write_spte(iter->sptep, 0);
645 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
647 * @as_id: Address space ID, i.e. regular vs. SMM
648 * @sptep: Pointer to the SPTE
649 * @old_spte: The current value of the SPTE
650 * @new_spte: The new value that will be set for the SPTE
651 * @gfn: The base GFN that was (or will be) mapped by the SPTE
652 * @level: The level _containing_ the SPTE (its parent PT's level)
653 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
654 * of the page. Should be set unless handling an MMU
655 * notifier for access tracking. Leaving record_acc_track
* unset in that case prevents page accesses from being double counted.
658 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
659 * appropriate for the change being made. Should be set
660 * unless performing certain dirty logging operations.
661 * Leaving record_dirty_log unset in that case prevents page
662 * writes from being double counted.
664 static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
665 u64 old_spte, u64 new_spte, gfn_t gfn, int level,
666 bool record_acc_track, bool record_dirty_log)
668 lockdep_assert_held_write(&kvm->mmu_lock);
671 * No thread should be using this function to set SPTEs to or from the
672 * temporary removed SPTE value.
673 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
674 * should be used. If operating under the MMU lock in write mode, the
675 * use of the removed SPTE should not be necessary.
677 WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
679 kvm_tdp_mmu_write_spte(sptep, new_spte);
681 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
683 if (record_acc_track)
684 handle_changed_spte_acc_track(old_spte, new_spte, level);
685 if (record_dirty_log)
686 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
690 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
691 u64 new_spte, bool record_acc_track,
692 bool record_dirty_log)
694 WARN_ON_ONCE(iter->yielded);
696 __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
697 new_spte, iter->gfn, iter->level,
698 record_acc_track, record_dirty_log);
701 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
704 _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
707 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
708 struct tdp_iter *iter,
711 _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
714 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
715 struct tdp_iter *iter,
718 _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
721 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
722 for_each_tdp_pte(_iter, _root, _start, _end)
724 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
725 tdp_root_for_each_pte(_iter, _root, _start, _end) \
726 if (!is_shadow_present_pte(_iter.old_spte) || \
727 !is_last_spte(_iter.old_spte, _iter.level)) \
731 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
732 for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
* Yield if the MMU lock is contended or this thread needs to return control
* to the scheduler.
738 * If this function should yield and flush is set, it will perform a remote
739 * TLB flush before yielding.
741 * If this function yields, iter->yielded is set and the caller must skip to
742 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
743 * over the paging structures to allow the iterator to continue its traversal
744 * from the paging structure root.
746 * Returns true if this function yielded.
748 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
749 struct tdp_iter *iter,
750 bool flush, bool shared)
752 WARN_ON(iter->yielded);
754 /* Ensure forward progress has been made before yielding. */
755 if (iter->next_last_level_gfn == iter->yielded_gfn)
758 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
760 kvm_flush_remote_tlbs(kvm);
765 cond_resched_rwlock_read(&kvm->mmu_lock);
767 cond_resched_rwlock_write(&kvm->mmu_lock);
771 WARN_ON(iter->gfn > iter->next_last_level_gfn);
773 iter->yielded = true;
776 return iter->yielded;
779 static inline gfn_t tdp_mmu_max_gfn_host(void)
782 * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
783 * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
784 * and so KVM will never install a SPTE for such addresses.
786 return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
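/*
 * Zap every SPTE in the root, walking down from the root level. Yielding is
 * allowed only while the root is still reachable (non-zero refcount); see the
 * comment below.
 */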
789 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
792 bool root_is_unreachable = !refcount_read(&root->tdp_mmu_root_count);
793 struct tdp_iter iter;
795 gfn_t end = tdp_mmu_max_gfn_host();
798 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
803 * No need to try to step down in the iterator when zapping an entire
804 * root, zapping an upper-level SPTE will recurse on its children.
806 for_each_tdp_pte_min_level(iter, root, root->role.level, start, end) {
809 * Yielding isn't allowed when zapping an unreachable root as
810 * the root won't be processed by mmu_notifier callbacks. When
811 * handling an unmap/release mmu_notifier command, KVM must
812 * drop all references to relevant pages prior to completing
813 * the callback. Dropping mmu_lock can result in zapping SPTEs
814 * for an unreachable root after a relevant callback completes,
815 * which leads to use-after-free as zapping a SPTE triggers
816 * "writeback" of dirty/accessed bits to the SPTE's associated
819 if (!root_is_unreachable &&
820 tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
823 if (!is_shadow_present_pte(iter.old_spte))
827 tdp_mmu_set_spte(kvm, &iter, 0);
828 } else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) {
830 * cmpxchg() shouldn't fail if the root is unreachable.
831 * Retry so as not to leak the page and its children.
833 WARN_ONCE(root_is_unreachable,
834 "Contended TDP MMU SPTE in unreachable root.");
839 * WARN if the root is invalid and is unreachable, all SPTEs
840 * should've been zapped by kvm_tdp_mmu_zap_invalidated_roots(),
841 * and inserting new SPTEs under an invalid root is a KVM bug.
843 WARN_ON_ONCE(root_is_unreachable && root->role.invalid);
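/*
 * Zap a single shadow page by clearing its parent SPTE. Returns true if the
 * shadow page was present and zapped, false otherwise.
 */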
849 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
854 * This helper intentionally doesn't allow zapping a root shadow page,
855 * which doesn't have a parent page table and thus no associated entry.
857 if (WARN_ON_ONCE(!sp->ptep))
860 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
861 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
864 __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
865 sp->gfn, sp->role.level + 1, true, true);
* Zap leaf SPTEs for the range of gfns, [start, end). Returns true if SPTEs
872 * have been cleared and a TLB flush is needed before releasing the MMU lock.
874 * If can_yield is true, will release the MMU lock and reschedule if the
875 * scheduler needs the CPU or there is contention on the MMU lock. If this
876 * function cannot yield, it will not release the MMU lock or reschedule and
877 * the caller must ensure it does not supply too large a GFN range, or the
878 * operation can cause a soft lockup.
880 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
881 gfn_t start, gfn_t end, bool can_yield, bool flush)
883 struct tdp_iter iter;
885 end = min(end, tdp_mmu_max_gfn_host());
887 lockdep_assert_held_write(&kvm->mmu_lock);
891 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
893 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
898 if (!is_shadow_present_pte(iter.old_spte) ||
899 !is_last_spte(iter.old_spte, iter.level))
902 tdp_mmu_set_spte(kvm, &iter, 0);
909 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
910 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
916 * Tears down the mappings for the range of gfns, [start, end), and frees the
917 * non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
921 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
922 bool can_yield, bool flush)
924 struct kvm_mmu_page *root;
926 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
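/*
 * Zap all SPTEs in all roots, including invalid roots, e.g. when the VM is
 * being destroyed or the userspace VMM has exited.
 */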
932 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
934 struct kvm_mmu_page *root;
938 * Zap all roots, including invalid roots, as all SPTEs must be dropped
939 * before returning to the caller. Zap directly even if the root is
940 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
941 * all that expensive and mmu_lock is already held, which means the
942 * worker has yielded, i.e. flushing the work instead of zapping here
943 * isn't guaranteed to be any faster.
* A TLB flush is unnecessary, KVM zaps everything if and only if the VM
946 * is being destroyed or the userspace VMM has exited. In both cases,
947 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
949 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
950 for_each_tdp_mmu_root_yield_safe(kvm, root, i)
951 tdp_mmu_zap_root(kvm, root, false);
* Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
* zap" completes.
959 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
961 flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
965 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
966 * is about to be zapped, e.g. in response to a memslots update. The actual
967 * zapping is performed asynchronously, so a reference is taken on all roots.
968 * Using a separate workqueue makes it easy to ensure that the destruction is
969 * performed before the "fast zap" completes, without keeping a separate list
970 * of invalidated roots; the list is effectively the list of work items in
973 * Get a reference even if the root is already invalid, the asynchronous worker
974 * assumes it was gifted a reference to the root it processes. Because mmu_lock
975 * is held for write, it should be impossible to observe a root with zero refcount,
976 * i.e. the list of roots cannot be stale.
978 * This has essentially the same effect for the TDP MMU
979 * as updating mmu_valid_gen does for the shadow MMU.
981 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
983 struct kvm_mmu_page *root;
985 lockdep_assert_held_write(&kvm->mmu_lock);
986 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
987 if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
988 root->role.invalid = true;
989 tdp_mmu_schedule_zap_root(kvm, root);
995 * Installs a last-level SPTE to handle a TDP page fault.
996 * (NPT/EPT violation/misconfiguration)
998 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
999 struct kvm_page_fault *fault,
1000 struct tdp_iter *iter)
1002 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1004 int ret = RET_PF_FIXED;
1005 bool wrprot = false;
1007 WARN_ON(sp->role.level != fault->goal_level);
1008 if (unlikely(!fault->slot))
1009 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1011 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1012 fault->pfn, iter->old_spte, fault->prefetch, true,
1013 fault->map_writable, &new_spte);
1015 if (new_spte == iter->old_spte)
1016 ret = RET_PF_SPURIOUS;
1017 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1018 return RET_PF_RETRY;
1019 else if (is_shadow_present_pte(iter->old_spte) &&
1020 !is_last_spte(iter->old_spte, iter->level))
1021 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1022 KVM_PAGES_PER_HPAGE(iter->level + 1));
1025 * If the page fault was caused by a write but the page is write
1026 * protected, emulation is needed. If the emulation was skipped,
1027 * the vCPU would have the same fault again.
1031 ret = RET_PF_EMULATE;
/* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
1035 if (unlikely(is_mmio_spte(new_spte))) {
1036 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1038 ret = RET_PF_EMULATE;
1040 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1041 rcu_dereference(iter->sptep));
1045 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
1046 * consistent with legacy MMU behavior.
1048 if (ret != RET_PF_SPURIOUS)
1049 vcpu->stat.pf_fixed++;
1055 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1056 * provided page table.
1058 * @kvm: kvm instance
1059 * @iter: a tdp_iter instance currently on the SPTE that should be set
1060 * @sp: The new TDP page table to install.
1061 * @account_nx: True if this page table is being installed to split a
1062 * non-executable huge page.
1063 * @shared: This operation is running under the MMU lock in read mode.
1065 * Returns: 0 if the new page table was installed. Non-0 if the page table
1066 * could not be installed (e.g. the atomic compare-exchange failed).
1068 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1069 struct kvm_mmu_page *sp, bool account_nx,
1072 u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
1076 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1080 tdp_mmu_set_spte(kvm, iter, spte);
1083 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1084 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1086 account_huge_nx_page(kvm, sp);
1087 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1093 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1094 * page tables and SPTEs to translate the faulting guest physical address.
1096 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1098 struct kvm_mmu *mmu = vcpu->arch.mmu;
1099 struct tdp_iter iter;
1100 struct kvm_mmu_page *sp;
1103 kvm_mmu_hugepage_adjust(vcpu, fault);
1105 trace_kvm_mmu_spte_requested(fault);
1109 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1110 if (fault->nx_huge_page_workaround_enabled)
1111 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1113 if (iter.level == fault->goal_level)
1117 * If there is an SPTE mapping a large page at a higher level
1118 * than the target, that SPTE must be cleared and replaced
1119 * with a non-leaf SPTE.
1121 if (is_shadow_present_pte(iter.old_spte) &&
1122 is_large_pte(iter.old_spte)) {
1123 if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1127 * The iter must explicitly re-read the spte here
* because the new value informs the !present path below.
1131 iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
1134 if (!is_shadow_present_pte(iter.old_spte)) {
1135 bool account_nx = fault->huge_page_disallowed &&
1136 fault->req_level >= iter.level;
1139 * If SPTE has been frozen by another thread, just
1140 * give up and retry, avoiding unnecessary page table
1141 * allocation and free.
1143 if (is_removed_spte(iter.old_spte))
1146 sp = tdp_mmu_alloc_sp(vcpu);
1147 tdp_mmu_init_child_sp(sp, &iter);
1149 if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
1150 tdp_mmu_free_sp(sp);
1156 if (iter.level != fault->goal_level) {
1158 return RET_PF_RETRY;
1161 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
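/*
 * MMU notifier unmap handler: zap the leaf SPTEs covering the notifier range
 * and return whether a TLB flush is needed before releasing the MMU lock.
 */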
1167 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1170 return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1171 range->end, range->may_block, flush);
1174 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1175 struct kvm_gfn_range *range);
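/*
 * Run the handler on every leaf SPTE in the notifier range, for each root in
 * the range's address space, and return true if any handler returned true.
 */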
1177 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1178 struct kvm_gfn_range *range,
1179 tdp_handler_t handler)
1181 struct kvm_mmu_page *root;
1182 struct tdp_iter iter;
1186 * Don't support rescheduling, none of the MMU notifiers that funnel
1187 * into this helper allow blocking; it'd be dead, wasteful code.
1189 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1192 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1193 ret |= handler(kvm, &iter, range);
* Mark the SPTEs in the range of GFNs [start, end) unaccessed and return non-zero
1203 * if any of the GFNs in the range have been accessed.
1205 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1206 struct kvm_gfn_range *range)
1210 /* If we have a non-accessed entry we don't need to change the pte. */
1211 if (!is_accessed_spte(iter->old_spte))
1214 new_spte = iter->old_spte;
1216 if (spte_ad_enabled(new_spte)) {
1217 new_spte &= ~shadow_accessed_mask;
1220 * Capture the dirty status of the page, so that it doesn't get
1221 * lost when the SPTE is marked for access tracking.
1223 if (is_writable_pte(new_spte))
1224 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1226 new_spte = mark_spte_for_access_track(new_spte);
1229 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1234 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1236 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
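/* Return true if the SPTE mapping this GFN has its accessed bit set. */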
1239 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1240 struct kvm_gfn_range *range)
1242 return is_accessed_spte(iter->old_spte);
1245 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1247 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
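/*
 * change_pte() handler for a single 4K SPTE: zap the existing SPTE and, if
 * the new host PTE is read-only, install a new SPTE pointing at the new PFN.
 */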
1250 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1251 struct kvm_gfn_range *range)
1255 /* Huge pages aren't expected to be modified without first being zapped. */
1256 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1258 if (iter->level != PG_LEVEL_4K ||
1259 !is_shadow_present_pte(iter->old_spte))
1263 * Note, when changing a read-only SPTE, it's not strictly necessary to
1264 * zero the SPTE before setting the new PFN, but doing so preserves the
* invariant that the PFN of a present leaf SPTE can never change.
1266 * See __handle_changed_spte().
1268 tdp_mmu_set_spte(kvm, iter, 0);
1270 if (!pte_write(range->pte)) {
1271 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1272 pte_pfn(range->pte));
1274 tdp_mmu_set_spte(kvm, iter, new_spte);
1281 * Handle the changed_pte MMU notifier for the TDP MMU.
* data is a pointer to the new pte_t mapping the HVA specified by the MMU
* notifier.
1284 * Returns non-zero if a flush is needed before releasing the MMU lock.
1286 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1289 * No need to handle the remote TLB flush under RCU protection, the
1290 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1291 * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
1293 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1297 * Remove write access from all SPTEs at or above min_level that map GFNs
* [start, end). Returns true if an SPTE has been changed and the TLBs need to
* be flushed.
1301 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1302 gfn_t start, gfn_t end, int min_level)
1304 struct tdp_iter iter;
1306 bool spte_set = false;
1310 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1312 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1314 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1317 if (!is_shadow_present_pte(iter.old_spte) ||
1318 !is_last_spte(iter.old_spte, iter.level) ||
1319 !(iter.old_spte & PT_WRITABLE_MASK))
1322 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1324 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1335 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1336 * only affect leaf SPTEs down to min_level.
1337 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1339 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1340 const struct kvm_memory_slot *slot, int min_level)
1342 struct kvm_mmu_page *root;
1343 bool spte_set = false;
1345 lockdep_assert_held_read(&kvm->mmu_lock);
1347 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1348 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1349 slot->base_gfn + slot->npages, min_level);
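/*
 * Allocate a shadow page for eager huge page splitting directly from the
 * allocator (rather than the vCPU caches), with the caller-supplied GFP flags.
 */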
1354 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1356 struct kvm_mmu_page *sp;
1360 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1364 sp->spt = (void *)__get_free_page(gfp);
1366 kmem_cache_free(mmu_page_header_cache, sp);
1373 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1374 struct tdp_iter *iter,
1377 struct kvm_mmu_page *sp;
1380 * Since we are allocating while under the MMU lock we have to be
1381 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1382 * reclaim and to avoid making any filesystem callbacks (which can end
1383 * up invoking KVM MMU notifiers, resulting in a deadlock).
* If this allocation fails we drop the lock and retry with reclaim allowed.
1388 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1395 read_unlock(&kvm->mmu_lock);
1397 write_unlock(&kvm->mmu_lock);
1399 iter->yielded = true;
1400 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1403 read_lock(&kvm->mmu_lock);
1405 write_lock(&kvm->mmu_lock);
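/*
 * Split the huge page mapped by iter->old_spte: populate sp with smaller
 * SPTEs covering the same range, then link sp in place of the huge SPTE.
 */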
1412 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1413 struct kvm_mmu_page *sp, bool shared)
1415 const u64 huge_spte = iter->old_spte;
1416 const int level = iter->level;
1419 tdp_mmu_init_child_sp(sp, iter);
1422 * No need for atomics when writing to sp->spt since the page table has
1423 * not been linked in yet and thus is not reachable from any other CPU.
1425 for (i = 0; i < PT64_ENT_PER_PAGE; i++)
1426 sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
1429 * Replace the huge spte with a pointer to the populated lower level
1430 * page table. Since we are making this change without a TLB flush vCPUs
1431 * will see a mix of the split mappings and the original huge mapping,
1432 * depending on what's currently in their TLB. This is fine from a
* correctness standpoint since the translation will be the same either way.
1436 ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
* tdp_mmu_link_sp() will handle subtracting the huge page we
1442 * are overwriting from the page stats. But we have to manually update
1443 * the page stats with the new present child pages.
1445 kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
1448 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1452 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1453 struct kvm_mmu_page *root,
1454 gfn_t start, gfn_t end,
1455 int target_level, bool shared)
1457 struct kvm_mmu_page *sp = NULL;
1458 struct tdp_iter iter;
1464 * Traverse the page table splitting all huge pages above the target
1465 * level into one lower level. For example, if we encounter a 1GB page
1466 * we split it into 512 2MB pages.
1468 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1469 * to visit an SPTE before ever visiting its children, which means we
1470 * will correctly recursively split huge pages that are more than one
1471 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1472 * and then splitting each of those to 512 4KB pages).
1474 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1476 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1479 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1483 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1486 trace_kvm_mmu_split_huge_page(iter.gfn,
1496 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1505 * It's possible to exit the loop having never used the last sp if, for
1506 * example, a vCPU doing HugePage NX splitting wins the race and
1507 * installs its own sp in place of the last sp we tried to split.
1510 tdp_mmu_free_sp(sp);
1517 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1519 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1520 const struct kvm_memory_slot *slot,
1521 gfn_t start, gfn_t end,
1522 int target_level, bool shared)
1524 struct kvm_mmu_page *root;
1527 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1529 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1530 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1532 kvm_tdp_mmu_put_root(kvm, root, shared);
1539 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1540 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1541 * If AD bits are not enabled, this will require clearing the writable bit on
* each SPTE. Returns true if an SPTE has been changed and the TLBs need to
* be flushed.
1545 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1546 gfn_t start, gfn_t end)
1548 struct tdp_iter iter;
1550 bool spte_set = false;
1554 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1556 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1559 if (!is_shadow_present_pte(iter.old_spte))
1562 if (spte_ad_need_write_protect(iter.old_spte)) {
1563 if (is_writable_pte(iter.old_spte))
1564 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1568 if (iter.old_spte & shadow_dirty_mask)
1569 new_spte = iter.old_spte & ~shadow_dirty_mask;
1574 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1585 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1586 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1587 * If AD bits are not enabled, this will require clearing the writable bit on
* each SPTE. Returns true if an SPTE has been changed and the TLBs need to
* be flushed.
1591 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1592 const struct kvm_memory_slot *slot)
1594 struct kvm_mmu_page *root;
1595 bool spte_set = false;
1597 lockdep_assert_held_read(&kvm->mmu_lock);
1599 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1600 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1601 slot->base_gfn + slot->npages);
1607 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1608 * set in mask, starting at gfn. The given memslot is expected to contain all
1609 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1610 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1611 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1613 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1614 gfn_t gfn, unsigned long mask, bool wrprot)
1616 struct tdp_iter iter;
1621 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1622 gfn + BITS_PER_LONG) {
1626 if (iter.level > PG_LEVEL_4K ||
1627 !(mask & (1UL << (iter.gfn - gfn))))
1630 mask &= ~(1UL << (iter.gfn - gfn));
1632 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1633 if (is_writable_pte(iter.old_spte))
1634 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1638 if (iter.old_spte & shadow_dirty_mask)
1639 new_spte = iter.old_spte & ~shadow_dirty_mask;
1644 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1651 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1652 * set in mask, starting at gfn. The given memslot is expected to contain all
1653 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1654 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1655 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1657 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1658 struct kvm_memory_slot *slot,
1659 gfn_t gfn, unsigned long mask,
1662 struct kvm_mmu_page *root;
1664 lockdep_assert_held_write(&kvm->mmu_lock);
1665 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1666 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1670 * Clear leaf entries which could be replaced by large mappings, for
1671 * GFNs within the slot.
1673 static void zap_collapsible_spte_range(struct kvm *kvm,
1674 struct kvm_mmu_page *root,
1675 const struct kvm_memory_slot *slot)
1677 gfn_t start = slot->base_gfn;
1678 gfn_t end = start + slot->npages;
1679 struct tdp_iter iter;
1684 tdp_root_for_each_pte(iter, root, start, end) {
1686 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1689 if (!is_shadow_present_pte(iter.old_spte) ||
1690 !is_last_spte(iter.old_spte, iter.level))
1693 pfn = spte_to_pfn(iter.old_spte);
1694 if (kvm_is_reserved_pfn(pfn) ||
1695 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1699 /* Note, a successful atomic zap also does a remote TLB flush. */
1700 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1708 * Clear non-leaf entries (and free associated page tables) which could
1709 * be replaced by large mappings, for GFNs within the slot.
1711 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1712 const struct kvm_memory_slot *slot)
1714 struct kvm_mmu_page *root;
1716 lockdep_assert_held_read(&kvm->mmu_lock);
1718 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1719 zap_collapsible_spte_range(kvm, root, slot);
1723 * Removes write access on the last level SPTE mapping this GFN and unsets the
1724 * MMU-writable bit to ensure future writes continue to be intercepted.
1725 * Returns true if an SPTE was set and a TLB flush is needed.
1727 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1728 gfn_t gfn, int min_level)
1730 struct tdp_iter iter;
1732 bool spte_set = false;
1734 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1738 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1739 if (!is_shadow_present_pte(iter.old_spte) ||
1740 !is_last_spte(iter.old_spte, iter.level))
1743 new_spte = iter.old_spte &
1744 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1746 if (new_spte == iter.old_spte)
1749 tdp_mmu_set_spte(kvm, &iter, new_spte);
1759 * Removes write access on the last level SPTE mapping this GFN and unsets the
1760 * MMU-writable bit to ensure future writes continue to be intercepted.
1761 * Returns true if an SPTE was set and a TLB flush is needed.
1763 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1764 struct kvm_memory_slot *slot, gfn_t gfn,
1767 struct kvm_mmu_page *root;
1768 bool spte_set = false;
1770 lockdep_assert_held_write(&kvm->mmu_lock);
1771 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1772 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1778 * Return the level of the lowest level SPTE added to sptes.
1779 * That SPTE may be non-present.
1781 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1783 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1786 struct tdp_iter iter;
1787 struct kvm_mmu *mmu = vcpu->arch.mmu;
1788 gfn_t gfn = addr >> PAGE_SHIFT;
1791 *root_level = vcpu->arch.mmu->shadow_root_level;
1793 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1795 sptes[leaf] = iter.old_spte;
1802 * Returns the last level spte pointer of the shadow page walk for the given
* gpa, and sets *spte to the spte value. This spte may be non-present. If no
1804 * walk could be performed, returns NULL and *spte does not contain valid data.
1807 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1808 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1810 * WARNING: This function is only intended to be called during fast_page_fault.
1812 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1815 struct tdp_iter iter;
1816 struct kvm_mmu *mmu = vcpu->arch.mmu;
1817 gfn_t gfn = addr >> PAGE_SHIFT;
1818 tdp_ptep_t sptep = NULL;
1820 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1821 *spte = iter.old_spte;
1826 * Perform the rcu_dereference to get the raw spte pointer value since
1827 * we are passing it up to fast_page_fault, which is shared with the
* legacy MMU and thus does not retain the TDP MMU-specific __rcu annotation.
1831 * This is safe since fast_page_fault obeys the contracts of this
1832 * function as well as all TDP MMU contracts around modifying SPTEs
1833 * outside of mmu_lock.
1835 return rcu_dereference(sptep);