1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5 #include "mmu_internal.h"
11 #include <asm/cmpxchg.h>
12 #include <trace/events/kvm.h>
14 /* Initializes the TDP MMU for the VM, if enabled. */
15 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
17 struct workqueue_struct *wq;
19 wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
23 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
24 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
25 kvm->arch.tdp_mmu_zap_wq = wq;
29 /* Arbitrarily returns true so that this may be used in if statements. */
30 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
34 lockdep_assert_held_read(&kvm->mmu_lock);
36 lockdep_assert_held_write(&kvm->mmu_lock);
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
43 /* Also waits for any queued work items. */
44 destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
46 WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
47 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
50 * Ensure that all the outstanding RCU callbacks to free shadow pages
51 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
52 * can call kvm_tdp_mmu_put_root and create new callbacks.
57 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
59 free_page((unsigned long)sp->spt);
60 kmem_cache_free(mmu_page_header_cache, sp);
64 * This is called through call_rcu in order to free TDP page table memory
65 * safely with respect to other kernel threads that may be operating on the memory.
67 * By only accessing TDP MMU page table memory in an RCU read critical
68 * section, and freeing it after a grace period, lockless access to that
69 * memory won't use it after it is freed.
71 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
73 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
79 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
82 static void tdp_mmu_zap_root_work(struct work_struct *work)
84 struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
86 struct kvm *kvm = root->tdp_mmu_async_data;
88 read_lock(&kvm->mmu_lock);
91 * A TLB flush is not necessary as KVM performs a local TLB flush when
92 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
93 * to a different pCPU. Note, the local TLB flush on reuse also
94 * invalidates any paging-structure-cache entries, i.e. TLB entries for
95 * intermediate paging structures, that may be zapped, as such entries
96 * are associated with the ASID on both VMX and SVM.
98 tdp_mmu_zap_root(kvm, root, true);
101 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
102 * avoiding an infinite loop. By design, the root is reachable while
103 * it's being asynchronously zapped, thus a different task can put its
104 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
105 * asynchronously zapped root is unavoidable.
107 kvm_tdp_mmu_put_root(kvm, root, true);
109 read_unlock(&kvm->mmu_lock);
112 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
114 root->tdp_mmu_async_data = kvm;
115 INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
116 queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
119 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
121 union kvm_mmu_page_role role = page->role;
124 /* No need to use cmpxchg, only the invalid bit can change. */
125 role.word = xchg(&page->role.word, role.word);
129 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
132 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
134 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
137 WARN_ON(!is_tdp_mmu_page(root));
140 * The root now has refcount=0. It is valid, but readers already
141 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
142 * rejects it. This remains true for the rest of the execution
143 * of this function, because readers visit valid roots only
144 * (except for tdp_mmu_zap_root_work(), which however
145 * does not acquire any reference itself).
147 * Even though there are flows that need to visit all roots for
148 * correctness, they all take mmu_lock for write, so they cannot yet
149 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
150 * since the root still has refcount=0.
152 * However, tdp_mmu_zap_root can yield, and writers do not expect to
153 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
154 * So the root temporarily gets an extra reference, going to refcount=1
155 * while staying invalid. Readers still cannot acquire any reference;
156 * but writers are now allowed to run if tdp_mmu_zap_root yields and
157 * they might take an extra reference if they themselves yield.
158 * Therefore, when the reference is given back by the worker,
159 * there is no guarantee that the refcount is still 1. If not, whoever
160 * puts the last reference will free the page, but they will not have to
161 * zap the root because a root cannot go from invalid to valid.
163 if (!kvm_tdp_root_mark_invalid(root)) {
164 refcount_set(&root->tdp_mmu_root_count, 1);
167 * Zapping the root in a worker is not just "nice to have";
168 * it is required because kvm_tdp_mmu_invalidate_all_roots()
169 * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
170 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
171 * might return with some roots not zapped yet.
173 tdp_mmu_schedule_zap_root(kvm, root);
177 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
178 list_del_rcu(&root->link);
179 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
180 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
184 * Returns the next root after @prev_root (or the first root if @prev_root is
185 * NULL). A reference to the returned root is acquired, and the reference to
186 * @prev_root is released (the caller obviously must hold a reference to
187 * @prev_root if it's non-NULL).
189 * If @only_valid is true, invalid roots are skipped.
191 * Returns NULL if the end of tdp_mmu_roots was reached.
193 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
194 struct kvm_mmu_page *prev_root,
195 bool shared, bool only_valid)
197 struct kvm_mmu_page *next_root;
202 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
204 typeof(*prev_root), link);
206 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
207 typeof(*next_root), link);
210 if ((!only_valid || !next_root->role.invalid) &&
211 kvm_tdp_mmu_get_root(next_root))
214 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
215 &next_root->link, typeof(*next_root), link);
221 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
227 * Note: this iterator gets and puts references to the roots it iterates over.
228 * This makes it safe to release the MMU lock and yield within the loop, but
229 * if exiting the loop early, the caller must drop the reference to the most
230 * recent root. (Unless keeping a live reference is desirable.)
232 * If shared is set, this function is operating under the MMU lock in read
233 * mode. In the unlikely event that this thread must free a root, the lock
234 * will be temporarily dropped and reacquired in write mode.
236 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
237 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
239 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
240 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
241 kvm_mmu_page_as_id(_root) != _as_id) { \
244 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
245 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
247 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
248 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
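/*
 * Illustrative sketch, not part of KVM: as noted above, a walker that exits
 * the yield-safe loop early must drop the reference it still holds on the
 * current root.  some_condition() is a hypothetical predicate.
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) {
 *		if (some_condition(root)) {
 *			kvm_tdp_mmu_put_root(kvm, root, false);
 *			break;
 *		}
 *	}
 */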
251 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
252 * the implication being that any flow that holds mmu_lock for read is
253 * inherently yield-friendly and should use the yield-safe variant above.
254 * Holding mmu_lock for write obviates the need for RCU protection as the list
255 * is guaranteed to be stable.
257 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
258 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
259 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
260 kvm_mmu_page_as_id(_root) != _as_id) { \
263 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
265 struct kvm_mmu_page *sp;
267 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
268 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
273 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
274 gfn_t gfn, union kvm_mmu_page_role role)
276 INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
278 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
283 sp->tdp_mmu_page = true;
285 trace_kvm_mmu_get_page(sp, true);
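/*
 * Initialize a child shadow page whose role is derived from the parent of
 * @iter->sptep (i.e. one level below the parent), linked at @iter's gfn.
 */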
288 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
289 struct tdp_iter *iter)
291 struct kvm_mmu_page *parent_sp;
292 union kvm_mmu_page_role role;
294 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
296 role = parent_sp->role;
299 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
302 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
304 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
305 struct kvm *kvm = vcpu->kvm;
306 struct kvm_mmu_page *root;
308 lockdep_assert_held_write(&kvm->mmu_lock);
311 * Check for an existing root before allocating a new one. Note, the
312 * role check prevents consuming an invalid root.
314 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
315 if (root->role.word == role.word &&
316 kvm_tdp_mmu_get_root(root))
320 root = tdp_mmu_alloc_sp(vcpu);
321 tdp_mmu_init_sp(root, NULL, 0, role);
323 refcount_set(&root->tdp_mmu_root_count, 1);
325 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
326 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
327 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
330 return __pa(root->spt);
333 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
334 u64 old_spte, u64 new_spte, int level,
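/*
 * Propagate Accessed-bit information to the primary MM: if an accessed leaf
 * SPTE is zapped, loses its accessed state, or is changed to map a different
 * pfn, report the access via kvm_set_pfn_accessed().
 */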
337 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
339 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
342 if (is_accessed_spte(old_spte) &&
343 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
344 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
345 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
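/*
 * Propagate Dirty information to dirty logging: when a 4K SPTE is made
 * writable (including when its pfn changes), mark the gfn dirty in its
 * memslot's dirty bitmap.
 */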
348 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
349 u64 old_spte, u64 new_spte, int level)
352 struct kvm_memory_slot *slot;
354 if (level > PG_LEVEL_4K)
357 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
359 if ((!is_writable_pte(old_spte) || pfn_changed) &&
360 is_writable_pte(new_spte)) {
361 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
362 mark_page_dirty_in_slot(kvm, slot, gfn);
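/*
 * Account a newly linked TDP MMU page table page in the kernel's pagetable
 * stats and in KVM's tdp_mmu_pages counter.
 */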
366 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
368 kvm_account_pgtable_pages((void *)sp->spt, +1);
369 atomic64_inc(&kvm->arch.tdp_mmu_pages);
372 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
374 kvm_account_pgtable_pages((void *)sp->spt, -1);
375 atomic64_dec(&kvm->arch.tdp_mmu_pages);
379 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
382 * @sp: the page to be removed
383 * @shared: This operation may not be running under the exclusive use of
384 * the MMU lock and the operation must synchronize with other
385 * threads that might be adding or removing pages.
387 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
390 tdp_unaccount_mmu_page(kvm, sp);
392 if (!sp->nx_huge_page_disallowed)
396 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
398 lockdep_assert_held_write(&kvm->mmu_lock);
400 sp->nx_huge_page_disallowed = false;
401 untrack_possible_nx_huge_page(kvm, sp);
404 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
408 * handle_removed_pt() - handle a page table removed from the TDP structure
411 * @pt: the page removed from the paging structure
412 * @shared: This operation may not be running under the exclusive use
413 * of the MMU lock and the operation must synchronize with other
414 * threads that might be modifying SPTEs.
416 * Given a page table that has been removed from the TDP paging structure,
417 * iterates through the page table to clear SPTEs and free child page tables.
419 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
420 * protection. Since this thread removed it from the paging structure,
421 * this thread will be responsible for ensuring the page is freed. Hence the
422 * early rcu_dereferences in the function.
424 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
426 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
427 int level = sp->role.level;
428 gfn_t base_gfn = sp->gfn;
431 trace_kvm_mmu_prepare_zap_page(sp);
433 tdp_mmu_unlink_sp(kvm, sp, shared);
435 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
436 tdp_ptep_t sptep = pt + i;
437 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
442 * Set the SPTE to a nonpresent value that other
443 * threads will not overwrite. If the SPTE was
444 * already marked as removed then another thread
445 * handling a page fault could overwrite it, so
446 * set the SPTE until it is set from some other
447 * value to the removed SPTE value.
450 old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
451 if (!is_removed_spte(old_spte))
457 * If the SPTE is not MMU-present, there is no backing
458 * page associated with the SPTE and so no side effects
459 * that need to be recorded, and exclusive ownership of
460 * mmu_lock ensures the SPTE can't be made present.
461 * Note, zapping MMIO SPTEs is also unnecessary as they
462 * are guarded by the memslots generation, not by being unreachable.
465 old_spte = kvm_tdp_mmu_read_spte(sptep);
466 if (!is_shadow_present_pte(old_spte))
470 * Use the common helper instead of a raw WRITE_ONCE as
471 * the SPTE needs to be updated atomically if it can be
472 * modified by a different vCPU outside of mmu_lock.
473 * Even though the parent SPTE is !PRESENT, the TLB
474 * hasn't yet been flushed, and both Intel and AMD
475 * document that A/D assists can use upper-level PxE
476 * entries that are cached in the TLB, i.e. the CPU can
477 * still access the page and mark it dirty.
479 * No retry is needed in the atomic update path as the
480 * sole concern is dropping a Dirty bit, i.e. no other
481 * task can zap/remove the SPTE as mmu_lock is held for
482 * write. Marking the SPTE as a removed SPTE is not
483 * strictly necessary for the same reason, but using
484 * the removed SPTE value keeps the shared/exclusive
485 * paths consistent and allows the handle_changed_spte()
486 * call below to hardcode the new value to REMOVED_SPTE.
488 * Note, even though dropping a Dirty bit is the only
489 * scenario where a non-atomic update could result in a
490 * functional bug, simply checking the Dirty bit isn't
491 * sufficient as a fast page fault could read the upper
492 * level SPTE before it is zapped, and then make this
493 * target SPTE writable, resume the guest, and set the
494 * Dirty bit between reading the SPTE above and writing it here.
497 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
498 REMOVED_SPTE, level);
500 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
501 old_spte, REMOVED_SPTE, level, shared);
504 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
508 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
510 * @as_id: the address space of the paging structure the SPTE was a part of
511 * @gfn: the base GFN that was mapped by the SPTE
512 * @old_spte: The value of the SPTE before the change
513 * @new_spte: The value of the SPTE after the change
514 * @level: the level of the PT the SPTE is part of in the paging structure
515 * @shared: This operation may not be running under the exclusive use of
516 * the MMU lock and the operation must synchronize with other
517 * threads that might be modifying SPTEs.
519 * Handle bookkeeping that might result from the modification of a SPTE.
520 * This function must be called for all TDP SPTE modifications.
522 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
523 u64 old_spte, u64 new_spte, int level,
526 bool was_present = is_shadow_present_pte(old_spte);
527 bool is_present = is_shadow_present_pte(new_spte);
528 bool was_leaf = was_present && is_last_spte(old_spte, level);
529 bool is_leaf = is_present && is_last_spte(new_spte, level);
530 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
532 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
533 WARN_ON(level < PG_LEVEL_4K);
534 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
537 * If this warning were to trigger it would indicate that there was a
538 * missing MMU notifier or a race with some notifier handler.
539 * A present, leaf SPTE should never be directly replaced with another
540 * present leaf SPTE pointing to a different PFN. A notifier handler
541 * should be zapping the SPTE before the main MM's page table is
542 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
543 * thread before replacement.
545 if (was_leaf && is_leaf && pfn_changed) {
546 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
547 "SPTE with another present leaf SPTE mapping a\n"
549 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
550 as_id, gfn, old_spte, new_spte, level);
553 * Crash the host to prevent error propagation and guest data corruption.
559 if (old_spte == new_spte)
562 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
565 check_spte_writable_invariants(new_spte);
568 * The only time a SPTE should be changed from a non-present to a
569 * non-present state is when an MMIO entry is installed/modified/
570 * removed. In that case, there is nothing to do here.
572 if (!was_present && !is_present) {
574 * If this change does not involve a MMIO SPTE or removed SPTE,
575 * it is unexpected. Log the change, though it should not
576 * impact the guest since both the former and current SPTEs are nonpresent.
579 if (WARN_ON(!is_mmio_spte(old_spte) &&
580 !is_mmio_spte(new_spte) &&
581 !is_removed_spte(new_spte)))
582 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
583 "should not be replaced with another,\n"
584 "different nonpresent SPTE, unless one or both\n"
585 "are MMIO SPTEs, or the new SPTE is\n"
586 "a temporary removed SPTE.\n"
587 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
588 as_id, gfn, old_spte, new_spte, level);
592 if (is_leaf != was_leaf)
593 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
595 if (was_leaf && is_dirty_spte(old_spte) &&
596 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
597 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
600 * Recursively handle child PTs if the change removed a subtree from
601 * the paging structure. Note the WARN on the PFN changing without the
602 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
603 * pages are kernel allocations and should never be migrated.
605 if (was_present && !was_leaf &&
606 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
607 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
610 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
611 u64 old_spte, u64 new_spte, int level,
614 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
616 handle_changed_spte_acc_track(old_spte, new_spte, level);
617 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
622 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
623 * and handle the associated bookkeeping. Do not mark the page dirty
624 * in KVM's dirty bitmaps.
626 * If setting the SPTE fails because it has changed, iter->old_spte will be
627 * refreshed to the current value of the spte.
630 * @iter: a tdp_iter instance currently on the SPTE that should be set
631 * @new_spte: The value the SPTE should be set to
633 * * 0 - If the SPTE was set.
634 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
635 * no side-effects other than setting iter->old_spte to the last
636 * known value of the spte.
638 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
639 struct tdp_iter *iter,
642 u64 *sptep = rcu_dereference(iter->sptep);
645 * The caller is responsible for ensuring the old SPTE is not a REMOVED
646 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
647 * and pre-checking before inserting a new SPTE is advantageous as it
648 * avoids unnecessary work.
650 WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
652 lockdep_assert_held_read(&kvm->mmu_lock);
655 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
656 * does not hold the mmu_lock.
658 if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
661 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
662 new_spte, iter->level, true);
663 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
668 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
669 struct tdp_iter *iter)
674 * Freeze the SPTE by setting it to a special,
675 * non-present value. This will stop other threads from
676 * immediately installing a present entry in its place
677 * before the TLBs are flushed.
679 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
683 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
684 KVM_PAGES_PER_HPAGE(iter->level));
687 * No other thread can overwrite the removed SPTE as they must either
688 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
689 * overwrite the special removed SPTE value. No bookkeeping is needed
690 * here since the SPTE is going from non-present to non-present. Use
691 * the raw write helper to avoid an unnecessary check on volatile bits.
693 __kvm_tdp_mmu_write_spte(iter->sptep, 0);
700 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
702 * @as_id: Address space ID, i.e. regular vs. SMM
703 * @sptep: Pointer to the SPTE
704 * @old_spte: The current value of the SPTE
705 * @new_spte: The new value that will be set for the SPTE
706 * @gfn: The base GFN that was (or will be) mapped by the SPTE
707 * @level: The level _containing_ the SPTE (its parent PT's level)
708 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
709 * of the page. Should be set unless handling an MMU
710 * notifier for access tracking. Leaving record_acc_track
711 * unset in that case prevents page accesses from being double counted.
713 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
714 * appropriate for the change being made. Should be set
715 * unless performing certain dirty logging operations.
716 * Leaving record_dirty_log unset in that case prevents page
717 * writes from being double counted.
719 * Returns the old SPTE value, which _may_ be different than @old_spte if the
720 * SPTE had volatile bits.
722 static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
723 u64 old_spte, u64 new_spte, gfn_t gfn, int level,
724 bool record_acc_track, bool record_dirty_log)
726 lockdep_assert_held_write(&kvm->mmu_lock);
729 * No thread should be using this function to set SPTEs to or from the
730 * temporary removed SPTE value.
731 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
732 * should be used. If operating under the MMU lock in write mode, the
733 * use of the removed SPTE should not be necessary.
735 WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
737 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
739 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
741 if (record_acc_track)
742 handle_changed_spte_acc_track(old_spte, new_spte, level);
743 if (record_dirty_log)
744 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
749 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
750 u64 new_spte, bool record_acc_track,
751 bool record_dirty_log)
753 WARN_ON_ONCE(iter->yielded);
755 iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
756 iter->old_spte, new_spte,
757 iter->gfn, iter->level,
758 record_acc_track, record_dirty_log);
761 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
764 _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
767 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
768 struct tdp_iter *iter,
771 _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
774 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
775 struct tdp_iter *iter,
778 _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
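/*
 * Iterate over every SPTE, or only the leaf SPTEs, in the paging structure
 * rooted at @_root for the GFN range [_start, _end).
 */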
781 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
782 for_each_tdp_pte(_iter, _root, _start, _end)
784 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
785 tdp_root_for_each_pte(_iter, _root, _start, _end) \
786 if (!is_shadow_present_pte(_iter.old_spte) || \
787 !is_last_spte(_iter.old_spte, _iter.level)) \
791 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
792 for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
795 * Yield if the MMU lock is contended or this thread needs to return control to the scheduler.
798 * If this function should yield and flush is set, it will perform a remote
799 * TLB flush before yielding.
801 * If this function yields, iter->yielded is set and the caller must skip to
802 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
803 * over the paging structures to allow the iterator to continue its traversal
804 * from the paging structure root.
806 * Returns true if this function yielded.
808 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
809 struct tdp_iter *iter,
810 bool flush, bool shared)
812 WARN_ON(iter->yielded);
814 /* Ensure forward progress has been made before yielding. */
815 if (iter->next_last_level_gfn == iter->yielded_gfn)
818 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
820 kvm_flush_remote_tlbs(kvm);
825 cond_resched_rwlock_read(&kvm->mmu_lock);
827 cond_resched_rwlock_write(&kvm->mmu_lock);
831 WARN_ON(iter->gfn > iter->next_last_level_gfn);
833 iter->yielded = true;
836 return iter->yielded;
839 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
842 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
843 * a gpa range that would exceed the max gfn, and KVM does not create
844 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
845 * the slow emulation path every time.
847 return kvm_mmu_max_gfn() + 1;
850 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
851 bool shared, int zap_level)
853 struct tdp_iter iter;
855 gfn_t end = tdp_mmu_max_gfn_exclusive();
858 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
860 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
863 if (!is_shadow_present_pte(iter.old_spte))
866 if (iter.level > zap_level)
870 tdp_mmu_set_spte(kvm, &iter, 0);
871 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
876 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
881 * The root must have an elevated refcount so that it's reachable via
882 * mmu_notifier callbacks, which allows this path to yield and drop
883 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
884 * must drop all references to relevant pages prior to completing the
885 * callback. Dropping mmu_lock with an unreachable root would result
886 * in zapping SPTEs after a relevant mmu_notifier callback completes
887 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
888 * dirty accessed bits to the SPTE's associated struct page.
890 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
892 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
897 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
898 * split the zap into two passes. On the first pass, zap at the 1gb
899 * level, and then zap top-level SPs on the second pass. "1gb" is not
900 * arbitrary, as KVM must be able to zap a 1gb shadow page without
901 * inducing a stall to allow in-place replacement with a 1gb hugepage.
903 * Because zapping a SP recurses on its children, stepping down to
904 * PG_LEVEL_4K in the iterator itself is unnecessary.
906 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
907 __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
912 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
917 * This helper intentionally doesn't allow zapping a root shadow page,
918 * which doesn't have a parent page table and thus no associated entry.
920 if (WARN_ON_ONCE(!sp->ptep))
923 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
924 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
927 __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
928 sp->gfn, sp->role.level + 1, true, true);
934 * If can_yield is true, will release the MMU lock and reschedule if the
935 * scheduler needs the CPU or there is contention on the MMU lock. If this
936 * function cannot yield, it will not release the MMU lock or reschedule and
937 * the caller must ensure it does not supply too large a GFN range, or the
938 * operation can cause a soft lockup.
940 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
941 gfn_t start, gfn_t end, bool can_yield, bool flush)
943 struct tdp_iter iter;
945 end = min(end, tdp_mmu_max_gfn_exclusive());
947 lockdep_assert_held_write(&kvm->mmu_lock);
951 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
953 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
958 if (!is_shadow_present_pte(iter.old_spte) ||
959 !is_last_spte(iter.old_spte, iter.level))
962 tdp_mmu_set_spte(kvm, &iter, 0);
969 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
970 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
976 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
977 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
978 * more SPTEs were zapped since the MMU lock was last acquired.
980 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
981 bool can_yield, bool flush)
983 struct kvm_mmu_page *root;
985 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
986 flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
991 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
993 struct kvm_mmu_page *root;
997 * Zap all roots, including invalid roots, as all SPTEs must be dropped
998 * before returning to the caller. Zap directly even if the root is
999 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
1000 * all that expensive and mmu_lock is already held, which means the
1001 * worker has yielded, i.e. flushing the work instead of zapping here
1002 * isn't guaranteed to be any faster.
1004 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
1005 * is being destroyed or the userspace VMM has exited. In both cases,
1006 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1008 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1009 for_each_tdp_mmu_root_yield_safe(kvm, root, i)
1010 tdp_mmu_zap_root(kvm, root, false);
1015 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast zap" completes.
1018 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1020 flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1024 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1025 * is about to be zapped, e.g. in response to a memslots update. The actual
1026 * zapping is performed asynchronously, so a reference is taken on all roots.
1027 * Using a separate workqueue makes it easy to ensure that the destruction is
1028 * performed before the "fast zap" completes, without keeping a separate list
1029 * of invalidated roots; the list is effectively the list of work items in the workqueue.
1032 * Get a reference even if the root is already invalid; the asynchronous worker
1033 * assumes it was gifted a reference to the root it processes. Because mmu_lock
1034 * is held for write, it should be impossible to observe a root with zero refcount,
1035 * i.e. the list of roots cannot be stale.
1037 * This has essentially the same effect for the TDP MMU
1038 * as updating mmu_valid_gen does for the shadow MMU.
1040 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1042 struct kvm_mmu_page *root;
1044 lockdep_assert_held_write(&kvm->mmu_lock);
1045 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1046 if (!root->role.invalid &&
1047 !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1048 root->role.invalid = true;
1049 tdp_mmu_schedule_zap_root(kvm, root);
1055 * Installs a last-level SPTE to handle a TDP page fault.
1056 * (NPT/EPT violation/misconfiguration)
1058 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1059 struct kvm_page_fault *fault,
1060 struct tdp_iter *iter)
1062 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1064 int ret = RET_PF_FIXED;
1065 bool wrprot = false;
1067 if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
1068 return RET_PF_RETRY;
1070 if (unlikely(!fault->slot))
1071 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1073 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1074 fault->pfn, iter->old_spte, fault->prefetch, true,
1075 fault->map_writable, &new_spte);
1077 if (new_spte == iter->old_spte)
1078 ret = RET_PF_SPURIOUS;
1079 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1080 return RET_PF_RETRY;
1081 else if (is_shadow_present_pte(iter->old_spte) &&
1082 !is_last_spte(iter->old_spte, iter->level))
1083 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1084 KVM_PAGES_PER_HPAGE(iter->level + 1));
1087 * If the page fault was caused by a write but the page is write
1088 * protected, emulation is needed. If the emulation was skipped,
1089 * the vCPU would have the same fault again.
1093 ret = RET_PF_EMULATE;
1096 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1097 if (unlikely(is_mmio_spte(new_spte))) {
1098 vcpu->stat.pf_mmio_spte_created++;
1099 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1101 ret = RET_PF_EMULATE;
1103 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1104 rcu_dereference(iter->sptep));
1111 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1112 * provided page table.
1114 * @kvm: kvm instance
1115 * @iter: a tdp_iter instance currently on the SPTE that should be set
1116 * @sp: The new TDP page table to install.
1117 * @shared: This operation is running under the MMU lock in read mode.
1119 * Returns: 0 if the new page table was installed. Non-0 if the page table
1120 * could not be installed (e.g. the atomic compare-exchange failed).
1122 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1123 struct kvm_mmu_page *sp, bool shared)
1125 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1129 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1133 tdp_mmu_set_spte(kvm, iter, spte);
1136 tdp_account_mmu_page(kvm, sp);
1141 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1142 struct kvm_mmu_page *sp, bool shared);
1145 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1146 * page tables and SPTEs to translate the faulting guest physical address.
1148 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1150 struct kvm_mmu *mmu = vcpu->arch.mmu;
1151 struct kvm *kvm = vcpu->kvm;
1152 struct tdp_iter iter;
1153 struct kvm_mmu_page *sp;
1154 int ret = RET_PF_RETRY;
1156 kvm_mmu_hugepage_adjust(vcpu, fault);
1158 trace_kvm_mmu_spte_requested(fault);
1162 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1165 if (fault->nx_huge_page_workaround_enabled)
1166 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1169 * If SPTE has been frozen by another thread, just give up and
1170 * retry, avoiding unnecessary page table allocation and free.
1172 if (is_removed_spte(iter.old_spte))
1175 if (iter.level == fault->goal_level)
1176 goto map_target_level;
1178 /* Step down into the lower level page table if it exists. */
1179 if (is_shadow_present_pte(iter.old_spte) &&
1180 !is_large_pte(iter.old_spte))
1184 * The SPTE is either non-present or points to a huge page that
1185 * needs to be split.
1187 sp = tdp_mmu_alloc_sp(vcpu);
1188 tdp_mmu_init_child_sp(sp, &iter);
1190 sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1192 if (is_shadow_present_pte(iter.old_spte))
1193 r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1195 r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1198 * Force the guest to retry if installing an upper level SPTE
1199 * failed, e.g. because a different task modified the SPTE.
1202 tdp_mmu_free_sp(sp);
1206 if (fault->huge_page_disallowed &&
1207 fault->req_level >= iter.level) {
1208 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1209 if (sp->nx_huge_page_disallowed)
1210 track_possible_nx_huge_page(kvm, sp);
1211 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1216 * The walk aborted before reaching the target level, e.g. because the
1217 * iterator detected an upper level SPTE was frozen during traversal.
1219 WARN_ON_ONCE(iter.level == fault->goal_level);
1223 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1230 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1233 return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1234 range->end, range->may_block, flush);
1237 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1238 struct kvm_gfn_range *range);
1240 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1241 struct kvm_gfn_range *range,
1242 tdp_handler_t handler)
1244 struct kvm_mmu_page *root;
1245 struct tdp_iter iter;
1249 * Don't support rescheduling, none of the MMU notifiers that funnel
1250 * into this helper allow blocking; it'd be dead, wasteful code.
1252 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1255 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1256 ret |= handler(kvm, &iter, range);
1265 * Mark the SPTEs mapping the range of GFNs [start, end) as unaccessed and return non-zero
1266 * if any of the GFNs in the range have been accessed.
1268 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1269 struct kvm_gfn_range *range)
1273 /* If we have a non-accessed entry we don't need to change the pte. */
1274 if (!is_accessed_spte(iter->old_spte))
1277 new_spte = iter->old_spte;
1279 if (spte_ad_enabled(new_spte)) {
1280 new_spte &= ~shadow_accessed_mask;
1283 * Capture the dirty status of the page, so that it doesn't get
1284 * lost when the SPTE is marked for access tracking.
1286 if (is_writable_pte(new_spte))
1287 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1289 new_spte = mark_spte_for_access_track(new_spte);
1292 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1297 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1299 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
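/* Report whether the GFN was accessed, without modifying the SPTE. */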
1302 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1303 struct kvm_gfn_range *range)
1305 return is_accessed_spte(iter->old_spte);
1308 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1310 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
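/*
 * Handler for the changed_pte notifier on a single 4K GFN: zap the existing
 * SPTE and, if the new host PTE is read-only, install a read-only SPTE
 * mapping the new pfn.
 */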
1313 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1314 struct kvm_gfn_range *range)
1318 /* Huge pages aren't expected to be modified without first being zapped. */
1319 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1321 if (iter->level != PG_LEVEL_4K ||
1322 !is_shadow_present_pte(iter->old_spte))
1326 * Note, when changing a read-only SPTE, it's not strictly necessary to
1327 * zero the SPTE before setting the new PFN, but doing so preserves the
1328 * invariant that the PFN of a present leaf SPTE can never change.
1329 * See __handle_changed_spte().
1331 tdp_mmu_set_spte(kvm, iter, 0);
1333 if (!pte_write(range->pte)) {
1334 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1335 pte_pfn(range->pte));
1337 tdp_mmu_set_spte(kvm, iter, new_spte);
1344 * Handle the changed_pte MMU notifier for the TDP MMU.
1345 * data is a pointer to the new pte_t mapping the HVA specified by the MMU notifier.
1347 * Returns non-zero if a flush is needed before releasing the MMU lock.
1349 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1352 * No need to handle the remote TLB flush under RCU protection, the
1353 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1354 * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
1356 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1360 * Remove write access from all SPTEs at or above min_level that map GFNs
1361 * [start, end). Returns true if an SPTE has been changed and the TLBs need to be flushed.
1364 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1365 gfn_t start, gfn_t end, int min_level)
1367 struct tdp_iter iter;
1369 bool spte_set = false;
1373 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1375 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1377 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1380 if (!is_shadow_present_pte(iter.old_spte) ||
1381 !is_last_spte(iter.old_spte, iter.level) ||
1382 !(iter.old_spte & PT_WRITABLE_MASK))
1385 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1387 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1398 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1399 * only affect leaf SPTEs down to min_level.
1400 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1402 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1403 const struct kvm_memory_slot *slot, int min_level)
1405 struct kvm_mmu_page *root;
1406 bool spte_set = false;
1408 lockdep_assert_held_read(&kvm->mmu_lock);
1410 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1411 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1412 slot->base_gfn + slot->npages, min_level);
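/*
 * Allocate a shadow page (header plus page table page) for eager huge page
 * splitting, outside of the vCPU's memory caches.
 */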
1417 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1419 struct kvm_mmu_page *sp;
1423 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1427 sp->spt = (void *)__get_free_page(gfp);
1429 kmem_cache_free(mmu_page_header_cache, sp);
1436 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1437 struct tdp_iter *iter,
1440 struct kvm_mmu_page *sp;
1443 * Since we are allocating while under the MMU lock we have to be
1444 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1445 * reclaim and to avoid making any filesystem callbacks (which can end
1446 * up invoking KVM MMU notifiers, resulting in a deadlock).
1448 * If this allocation fails we drop the lock and retry with reclaim allowed.
1451 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1458 read_unlock(&kvm->mmu_lock);
1460 write_unlock(&kvm->mmu_lock);
1462 iter->yielded = true;
1463 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1466 read_lock(&kvm->mmu_lock);
1468 write_lock(&kvm->mmu_lock);
1475 /* Note, the caller is responsible for initializing @sp. */
1476 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1477 struct kvm_mmu_page *sp, bool shared)
1479 const u64 huge_spte = iter->old_spte;
1480 const int level = iter->level;
1484 * No need for atomics when writing to sp->spt since the page table has
1485 * not been linked in yet and thus is not reachable from any other CPU.
1487 for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1488 sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1491 * Replace the huge spte with a pointer to the populated lower level
1492 * page table. Since we are making this change without a TLB flush vCPUs
1493 * will see a mix of the split mappings and the original huge mapping,
1494 * depending on what's currently in their TLB. This is fine from a
1495 * correctness standpoint since the translation will be the same either
1498 ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1503 * tdp_mmu_link_sp() will handle subtracting the huge page we
1504 * are overwriting from the page stats. But we have to manually update
1505 * the page stats with the new present child pages.
1507 kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1510 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1514 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1515 struct kvm_mmu_page *root,
1516 gfn_t start, gfn_t end,
1517 int target_level, bool shared)
1519 struct kvm_mmu_page *sp = NULL;
1520 struct tdp_iter iter;
1526 * Traverse the page table splitting all huge pages above the target
1527 * level into one lower level. For example, if we encounter a 1GB page
1528 * we split it into 512 2MB pages.
1530 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1531 * to visit an SPTE before ever visiting its children, which means we
1532 * will correctly recursively split huge pages that are more than one
1533 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1534 * and then splitting each of those to 512 4KB pages).
1536 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1538 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1541 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1545 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1548 trace_kvm_mmu_split_huge_page(iter.gfn,
1558 tdp_mmu_init_child_sp(sp, &iter);
1560 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1569 * It's possible to exit the loop having never used the last sp if, for
1570 * example, a vCPU doing HugePage NX splitting wins the race and
1571 * installs its own sp in place of the last sp we tried to split.
1574 tdp_mmu_free_sp(sp);
1581 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1583 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1584 const struct kvm_memory_slot *slot,
1585 gfn_t start, gfn_t end,
1586 int target_level, bool shared)
1588 struct kvm_mmu_page *root;
1591 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1593 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1594 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1596 kvm_tdp_mmu_put_root(kvm, root, shared);
1603 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1604 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1605 * If AD bits are not enabled, this will require clearing the writable bit on
1606 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1609 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1610 gfn_t start, gfn_t end)
1612 struct tdp_iter iter;
1614 bool spte_set = false;
1618 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1620 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1623 if (!is_shadow_present_pte(iter.old_spte))
1626 if (spte_ad_need_write_protect(iter.old_spte)) {
1627 if (is_writable_pte(iter.old_spte))
1628 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1632 if (iter.old_spte & shadow_dirty_mask)
1633 new_spte = iter.old_spte & ~shadow_dirty_mask;
1638 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1649 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1650 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1651 * If AD bits are not enabled, this will require clearing the writable bit on
1652 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1655 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1656 const struct kvm_memory_slot *slot)
1658 struct kvm_mmu_page *root;
1659 bool spte_set = false;
1661 lockdep_assert_held_read(&kvm->mmu_lock);
1663 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1664 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1665 slot->base_gfn + slot->npages);
1671 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1672 * set in mask, starting at gfn. The given memslot is expected to contain all
1673 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1674 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1675 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
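/*
 * Example (illustrative): with @gfn = G and @mask = 0b101, only the 4K SPTEs
 * mapping G and G + 2 have their dirty (or writable) state cleared.
 */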
1677 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1678 gfn_t gfn, unsigned long mask, bool wrprot)
1680 struct tdp_iter iter;
1685 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1686 gfn + BITS_PER_LONG) {
1690 if (iter.level > PG_LEVEL_4K ||
1691 !(mask & (1UL << (iter.gfn - gfn))))
1694 mask &= ~(1UL << (iter.gfn - gfn));
1696 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1697 if (is_writable_pte(iter.old_spte))
1698 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1702 if (iter.old_spte & shadow_dirty_mask)
1703 new_spte = iter.old_spte & ~shadow_dirty_mask;
1708 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1715 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1716 * set in mask, starting at gfn. The given memslot is expected to contain all
1717 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1718 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1719 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1721 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1722 struct kvm_memory_slot *slot,
1723 gfn_t gfn, unsigned long mask,
1726 struct kvm_mmu_page *root;
1728 lockdep_assert_held_write(&kvm->mmu_lock);
1729 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1730 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1733 static void zap_collapsible_spte_range(struct kvm *kvm,
1734 struct kvm_mmu_page *root,
1735 const struct kvm_memory_slot *slot)
1737 gfn_t start = slot->base_gfn;
1738 gfn_t end = start + slot->npages;
1739 struct tdp_iter iter;
1740 int max_mapping_level;
1744 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
1746 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1749 if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1750 !is_shadow_present_pte(iter.old_spte))
1754 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
1755 * a large page size, then its parent would have been zapped
1756 * instead of stepping down.
1758 if (is_last_spte(iter.old_spte, iter.level))
1762 * If iter.gfn resides outside of the slot, i.e. the page for
1763 * the current level overlaps but is not contained by the slot,
1764 * then the SPTE can't be made huge. More importantly, trying
1765 * to query that info from slot->arch.lpage_info will cause an
1766 * out-of-bounds access.
1768 if (iter.gfn < start || iter.gfn >= end)
1771 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1772 iter.gfn, PG_LEVEL_NUM);
1773 if (max_mapping_level < iter.level)
1776 /* Note, a successful atomic zap also does a remote TLB flush. */
1777 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1785 * Zap non-leaf SPTEs (and free their associated page tables) which could
1786 * be replaced by huge pages, for GFNs within the slot.
1788 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1789 const struct kvm_memory_slot *slot)
1791 struct kvm_mmu_page *root;
1793 lockdep_assert_held_read(&kvm->mmu_lock);
1795 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1796 zap_collapsible_spte_range(kvm, root, slot);
1800 * Removes write access on the last level SPTE mapping this GFN and unsets the
1801 * MMU-writable bit to ensure future writes continue to be intercepted.
1802 * Returns true if an SPTE was set and a TLB flush is needed.
1804 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1805 gfn_t gfn, int min_level)
1807 struct tdp_iter iter;
1809 bool spte_set = false;
1811 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1815 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1816 if (!is_shadow_present_pte(iter.old_spte) ||
1817 !is_last_spte(iter.old_spte, iter.level))
1820 new_spte = iter.old_spte &
1821 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1823 if (new_spte == iter.old_spte)
1826 tdp_mmu_set_spte(kvm, &iter, new_spte);
1836 * Removes write access on the last level SPTE mapping this GFN and unsets the
1837 * MMU-writable bit to ensure future writes continue to be intercepted.
1838 * Returns true if an SPTE was set and a TLB flush is needed.
1840 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1841 struct kvm_memory_slot *slot, gfn_t gfn,
1844 struct kvm_mmu_page *root;
1845 bool spte_set = false;
1847 lockdep_assert_held_write(&kvm->mmu_lock);
1848 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1849 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1855 * Return the level of the lowest level SPTE added to sptes.
1856 * That SPTE may be non-present.
1858 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1860 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1863 struct tdp_iter iter;
1864 struct kvm_mmu *mmu = vcpu->arch.mmu;
1865 gfn_t gfn = addr >> PAGE_SHIFT;
1868 *root_level = vcpu->arch.mmu->root_role.level;
1870 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1872 sptes[leaf] = iter.old_spte;
1879 * Returns the last level spte pointer of the shadow page walk for the given
1880 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1881 * walk could be performed, returns NULL and *spte does not contain valid data.
1884 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1885 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1887 * WARNING: This function is only intended to be called during fast_page_fault.
1889 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1892 struct tdp_iter iter;
1893 struct kvm_mmu *mmu = vcpu->arch.mmu;
1894 gfn_t gfn = addr >> PAGE_SHIFT;
1895 tdp_ptep_t sptep = NULL;
1897 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1898 *spte = iter.old_spte;
1903 * Perform the rcu_dereference to get the raw spte pointer value since
1904 * we are passing it up to fast_page_fault, which is shared with the
1905 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1908 * This is safe since fast_page_fault obeys the contracts of this
1909 * function as well as all TDP MMU contracts around modifying SPTEs
1910 * outside of mmu_lock.
1912 return rcu_dereference(sptep);
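/*
 * Illustrative sketch, not part of KVM: callers bracket the lockless walk
 * helpers above with kvm_tdp_mmu_walk_lockless_{begin,end}(), e.g.
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 */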