1 // SPDX-License-Identifier: GPL-2.0
4 #include "mmu_internal.h"
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
13 static bool __read_mostly tdp_mmu_enabled = true;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
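/*
 * Usage sketch (assumes only standard module_param semantics, not anything
 * specific to this file): the knob is exposed as kvm.tdp_mmu on the kernel
 * command line and, because the mode is 0644, as a writable
 * /sys/module/kvm/parameters/tdp_mmu file. It is sampled with READ_ONCE()
 * in kvm_mmu_init_tdp_mmu(), so flipping it at runtime only affects VMs
 * created afterwards, e.g.:
 *
 *	echo N > /sys/module/kvm/parameters/tdp_mmu
 *
 * after which subsequently created VMs fall back to the legacy MMU.
 */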
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
19 struct workqueue_struct *wq;
21 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 return 0;
24 wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
25 if (!wq)
26 return -ENOMEM;
28 /* This should not be changed for the lifetime of the VM. */
29 kvm->arch.tdp_mmu_enabled = true;
30 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
31 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
32 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
33 kvm->arch.tdp_mmu_zap_wq = wq;
34 return 1;
37 /* Arbitrarily returns true so that this may be used in if statements. */
38 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
39 bool shared)
41 if (shared)
42 lockdep_assert_held_read(&kvm->mmu_lock);
43 else
44 lockdep_assert_held_write(&kvm->mmu_lock);
46 return true;
49 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
51 if (!kvm->arch.tdp_mmu_enabled)
54 flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
55 destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
57 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
58 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
61 * Ensure that all the outstanding RCU callbacks to free shadow pages
62 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
63 * can call kvm_tdp_mmu_put_root and create new callbacks.
65 rcu_barrier();
68 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
70 free_page((unsigned long)sp->spt);
71 kmem_cache_free(mmu_page_header_cache, sp);
75 * This is called through call_rcu in order to free TDP page table memory
76 * safely with respect to other kernel threads that may be operating on
77 * the memory.
78 * By only accessing TDP MMU page table memory in an RCU read critical
79 * section, and freeing it after a grace period, lockless access to that
80 * memory won't use it after it is freed.
82 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
84 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
85 rcu_head);
87 tdp_mmu_free_sp(sp);
90 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
93 static void tdp_mmu_zap_root_work(struct work_struct *work)
95 struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
96 tdp_mmu_async_work);
97 struct kvm *kvm = root->tdp_mmu_async_data;
99 read_lock(&kvm->mmu_lock);
102 * A TLB flush is not necessary as KVM performs a local TLB flush when
103 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
104 * to a different pCPU. Note, the local TLB flush on reuse also
105 * invalidates any paging-structure-cache entries, i.e. TLB entries for
106 * intermediate paging structures, that may be zapped, as such entries
107 * are associated with the ASID on both VMX and SVM.
109 tdp_mmu_zap_root(kvm, root, true);
112 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
113 * avoiding an infinite loop. By design, the root is reachable while
114 * it's being asynchronously zapped, thus a different task can put its
115 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
116 * asynchronously zapped root is unavoidable.
118 kvm_tdp_mmu_put_root(kvm, root, true);
120 read_unlock(&kvm->mmu_lock);
123 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
125 root->tdp_mmu_async_data = kvm;
126 INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
127 queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
130 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
132 union kvm_mmu_page_role role = page->role;
134 role.invalid = true;
135 /* No need to use cmpxchg, only the invalid bit can change. */
136 role.word = xchg(&page->role.word, role.word);
137 return role.invalid;
140 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
143 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
145 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
148 WARN_ON(!root->tdp_mmu_page);
151 * The root now has refcount=0. It is valid, but readers already
152 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
153 * rejects it. This remains true for the rest of the execution
154 * of this function, because readers visit valid roots only
155 * (except for tdp_mmu_zap_root_work(), which however
156 * does not acquire any reference itself).
158 * Even though there are flows that need to visit all roots for
159 * correctness, they all take mmu_lock for write, so they cannot yet
160 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
161 * since the root still has refcount=0.
163 * However, tdp_mmu_zap_root can yield, and writers do not expect to
164 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
165 * So the root temporarily gets an extra reference, going to refcount=1
166 * while staying invalid. Readers still cannot acquire any reference;
167 * but writers are now allowed to run if tdp_mmu_zap_root yields and
168 * they might take an extra reference if they themselves yield.
169 * Therefore, when the reference is given back by the worker,
170 * there is no guarantee that the refcount is still 1. If not, whoever
171 * puts the last reference will free the page, but they will not have to
172 * zap the root because a root cannot go from invalid to valid.
174 if (!kvm_tdp_root_mark_invalid(root)) {
175 refcount_set(&root->tdp_mmu_root_count, 1);
178 * Zapping the root in a worker is not just "nice to have";
179 * it is required because kvm_tdp_mmu_invalidate_all_roots()
180 * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
181 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
182 * might return with some roots not zapped yet.
184 tdp_mmu_schedule_zap_root(kvm, root);
188 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
189 list_del_rcu(&root->link);
190 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
191 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
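/*
 * Rough summary of the refcount/invalid states walked through in the comment
 * above (added as a reading aid, phrased informally):
 *
 *   valid,   refcount > 0   - normal operation, readers may take references
 *   invalid, refcount >= 1  - zap scheduled or running on tdp_mmu_zap_wq,
 *                             which owns the temporary extra reference
 *   invalid, refcount == 0  - unlinked from tdp_mmu_roots and freed via
 *                             call_rcu() after a grace period
 */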
195 * Returns the next root after @prev_root (or the first root if @prev_root is
196 * NULL). A reference to the returned root is acquired, and the reference to
197 * @prev_root is released (the caller obviously must hold a reference to
198 * @prev_root if it's non-NULL).
200 * If @only_valid is true, invalid roots are skipped.
202 * Returns NULL if the end of tdp_mmu_roots was reached.
204 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
205 struct kvm_mmu_page *prev_root,
206 bool shared, bool only_valid)
208 struct kvm_mmu_page *next_root;
212 if (prev_root)
213 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
214 &prev_root->link,
215 typeof(*prev_root), link);
216 else
217 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
218 typeof(*next_root), link);
220 while (next_root) {
221 if ((!only_valid || !next_root->role.invalid) &&
222 kvm_tdp_mmu_get_root(next_root))
223 break;
225 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
226 &next_root->link, typeof(*next_root), link);
232 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
238 * Note: this iterator gets and puts references to the roots it iterates over.
239 * This makes it safe to release the MMU lock and yield within the loop, but
240 * if exiting the loop early, the caller must drop the reference to the most
241 * recent root. (Unless keeping a live reference is desirable.)
243 * If shared is set, this function is operating under the MMU lock in read
244 * mode. In the unlikely event that this thread must free a root, the lock
245 * will be temporarily dropped and reacquired in write mode.
247 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
248 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
249 _root; \
250 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
251 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
252 kvm_mmu_page_as_id(_root) != _as_id) { \
253 } else
255 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
256 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
258 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
259 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
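/*
 * Typical usage of the yield-safe iterators (illustrative sketch only,
 * mirroring kvm_tdp_mmu_zap_leafs() later in this file):
 *
 *	struct kvm_mmu_page *root;
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
 *		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 *
 * The iterator takes and drops root references itself, so breaking out of
 * the loop early requires an explicit kvm_tdp_mmu_put_root() on the current
 * root, per the note above.
 */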
262 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
263 * the implication being that any flow that holds mmu_lock for read is
264 * inherently yield-friendly and should use the yield-safe variant above.
265 * Holding mmu_lock for write obviates the need for RCU protection as the list
266 * is guaranteed to be stable.
268 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
269 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
270 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
271 kvm_mmu_page_as_id(_root) != _as_id) { \
272 } else
274 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
276 struct kvm_mmu_page *sp;
278 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
279 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
284 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
285 gfn_t gfn, union kvm_mmu_page_role role)
287 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
289 sp->role = role;
290 sp->gfn = gfn;
291 sp->ptep = sptep;
292 sp->tdp_mmu_page = true;
294 trace_kvm_mmu_get_page(sp, true);
297 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
298 struct tdp_iter *iter)
300 struct kvm_mmu_page *parent_sp;
301 union kvm_mmu_page_role role;
303 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
305 role = parent_sp->role;
306 role.level--;
308 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
311 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
313 union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
314 struct kvm *kvm = vcpu->kvm;
315 struct kvm_mmu_page *root;
317 lockdep_assert_held_write(&kvm->mmu_lock);
320 * Check for an existing root before allocating a new one. Note, the
321 * role check prevents consuming an invalid root.
323 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
324 if (root->role.word == role.word &&
325 kvm_tdp_mmu_get_root(root))
326 goto out;
329 root = tdp_mmu_alloc_sp(vcpu);
330 tdp_mmu_init_sp(root, NULL, 0, role);
332 refcount_set(&root->tdp_mmu_root_count, 1);
334 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
335 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
336 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
338 out:
339 return __pa(root->spt);
342 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
343 u64 old_spte, u64 new_spte, int level,
346 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
348 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
351 if (is_accessed_spte(old_spte) &&
352 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
353 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
354 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
357 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
358 u64 old_spte, u64 new_spte, int level)
360 bool pfn_changed;
361 struct kvm_memory_slot *slot;
363 if (level > PG_LEVEL_4K)
366 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
368 if ((!is_writable_pte(old_spte) || pfn_changed) &&
369 is_writable_pte(new_spte)) {
370 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
371 mark_page_dirty_in_slot(kvm, slot, gfn);
376 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
379 * @sp: the page to be removed
380 * @shared: This operation may not be running under the exclusive use of
381 * the MMU lock and the operation must synchronize with other
382 * threads that might be adding or removing pages.
384 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
385 bool shared)
387 if (shared)
388 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
389 else
390 lockdep_assert_held_write(&kvm->mmu_lock);
392 list_del(&sp->link);
393 if (sp->lpage_disallowed)
394 unaccount_huge_nx_page(kvm, sp);
396 if (shared)
397 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
401 * handle_removed_pt() - handle a page table removed from the TDP structure
404 * @pt: the page removed from the paging structure
405 * @shared: This operation may not be running under the exclusive use
406 * of the MMU lock and the operation must synchronize with other
407 * threads that might be modifying SPTEs.
409 * Given a page table that has been removed from the TDP paging structure,
410 * iterates through the page table to clear SPTEs and free child page tables.
412 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
413 * protection. Since this thread removed it from the paging structure,
414 * this thread will be responsible for ensuring the page is freed. Hence the
415 * early rcu_dereferences in the function.
417 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
419 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
420 int level = sp->role.level;
421 gfn_t base_gfn = sp->gfn;
424 trace_kvm_mmu_prepare_zap_page(sp);
426 tdp_mmu_unlink_sp(kvm, sp, shared);
428 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
429 u64 *sptep = rcu_dereference(pt) + i;
430 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
435 * Set the SPTE to a nonpresent value that other
436 * threads will not overwrite. If the SPTE was
437 * already marked as removed then another thread
438 * handling a page fault could overwrite it, so
439 * keep retrying until the SPTE is successfully changed from
440 * some other value to the removed SPTE value.
443 old_child_spte = xchg(sptep, REMOVED_SPTE);
444 if (!is_removed_spte(old_child_spte))
450 * If the SPTE is not MMU-present, there is no backing
451 * page associated with the SPTE and so no side effects
452 * that need to be recorded, and exclusive ownership of
453 * mmu_lock ensures the SPTE can't be made present.
454 * Note, zapping MMIO SPTEs is also unnecessary as they
455 * are guarded by the memslots generation, not by being
456 * unreachable.
458 old_child_spte = READ_ONCE(*sptep);
459 if (!is_shadow_present_pte(old_child_spte))
463 * Marking the SPTE as a removed SPTE is not
464 * strictly necessary here as the MMU lock will
465 * stop other threads from concurrently modifying
466 * this SPTE. Using the removed SPTE value keeps
467 * the two branches consistent and simplifies
468 * the function.
470 WRITE_ONCE(*sptep, REMOVED_SPTE);
472 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
473 old_child_spte, REMOVED_SPTE, level,
477 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
481 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
483 * @as_id: the address space of the paging structure the SPTE was a part of
484 * @gfn: the base GFN that was mapped by the SPTE
485 * @old_spte: The value of the SPTE before the change
486 * @new_spte: The value of the SPTE after the change
487 * @level: the level of the PT the SPTE is part of in the paging structure
488 * @shared: This operation may not be running under the exclusive use of
489 * the MMU lock and the operation must synchronize with other
490 * threads that might be modifying SPTEs.
492 * Handle bookkeeping that might result from the modification of a SPTE.
493 * This function must be called for all TDP SPTE modifications.
495 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
496 u64 old_spte, u64 new_spte, int level,
499 bool was_present = is_shadow_present_pte(old_spte);
500 bool is_present = is_shadow_present_pte(new_spte);
501 bool was_leaf = was_present && is_last_spte(old_spte, level);
502 bool is_leaf = is_present && is_last_spte(new_spte, level);
503 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
505 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
506 WARN_ON(level < PG_LEVEL_4K);
507 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
510 * If this warning were to trigger it would indicate that there was a
511 * missing MMU notifier or a race with some notifier handler.
512 * A present, leaf SPTE should never be directly replaced with another
513 * present leaf SPTE pointing to a different PFN. A notifier handler
514 * should be zapping the SPTE before the main MM's page table is
515 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
516 * thread before replacement.
518 if (was_leaf && is_leaf && pfn_changed) {
519 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
520 "SPTE with another present leaf SPTE mapping a\n"
521 "different PFN!\n"
522 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
523 as_id, gfn, old_spte, new_spte, level);
526 * Crash the host to prevent error propagation and guest data
527 * corruption.
529 BUG();
532 if (old_spte == new_spte)
535 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
538 check_spte_writable_invariants(new_spte);
541 * The only times a SPTE should be changed from a non-present to
542 * non-present state is when an MMIO entry is installed/modified/
543 * removed. In that case, there is nothing to do here.
545 if (!was_present && !is_present) {
547 * If this change does not involve a MMIO SPTE or removed SPTE,
548 * it is unexpected. Log the change, though it should not
549 * impact the guest since both the former and current SPTEs
550 * are nonpresent.
552 if (WARN_ON(!is_mmio_spte(old_spte) &&
553 !is_mmio_spte(new_spte) &&
554 !is_removed_spte(new_spte)))
555 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
556 "should not be replaced with another,\n"
557 "different nonpresent SPTE, unless one or both\n"
558 "are MMIO SPTEs, or the new SPTE is\n"
559 "a temporary removed SPTE.\n"
560 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
561 as_id, gfn, old_spte, new_spte, level);
565 if (is_leaf != was_leaf)
566 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
568 if (was_leaf && is_dirty_spte(old_spte) &&
569 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
570 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
573 * Recursively handle child PTs if the change removed a subtree from
574 * the paging structure. Note the WARN on the PFN changing without the
575 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
576 * pages are kernel allocations and should never be migrated.
578 if (was_present && !was_leaf &&
579 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
580 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
583 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
584 u64 old_spte, u64 new_spte, int level,
587 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
589 handle_changed_spte_acc_track(old_spte, new_spte, level);
590 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
595 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
596 * and handle the associated bookkeeping. Do not mark the page dirty
597 * in KVM's dirty bitmaps.
599 * If setting the SPTE fails because it has changed, iter->old_spte will be
600 * refreshed to the current value of the spte.
603 * @iter: a tdp_iter instance currently on the SPTE that should be set
604 * @new_spte: The value the SPTE should be set to
605 * Returns:
606 * * 0 - If the SPTE was set.
607 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
608 * no side-effects other than setting iter->old_spte to the last
609 * known value of the spte.
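/*
 * Illustrative caller pattern (sketch only): code running under mmu_lock
 * held for read typically retries on -EBUSY, as wrprot_gfn_range() and
 * clear_dirty_gfn_range() do later in this file:
 *
 *	retry:
 *		...
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 *
 * Because iter->old_spte is refreshed on failure, the retry recomputes
 * new_spte from fresh data.
 */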
611 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
612 struct tdp_iter *iter,
613 u64 new_spte)
615 u64 *sptep = rcu_dereference(iter->sptep);
616 u64 old_spte;
619 * The caller is responsible for ensuring the old SPTE is not a REMOVED
620 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
621 * and pre-checking before inserting a new SPTE is advantageous as it
622 * avoids unnecessary work.
624 WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
626 lockdep_assert_held_read(&kvm->mmu_lock);
629 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
630 * does not hold the mmu_lock.
632 old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
633 if (old_spte != iter->old_spte) {
635 * The page table entry was modified by a different logical
636 * CPU. Refresh iter->old_spte with the current value so the
637 * caller operates on fresh data, e.g. if it retries
638 * tdp_mmu_set_spte_atomic().
640 iter->old_spte = old_spte;
641 return -EBUSY;
644 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
645 new_spte, iter->level, true);
646 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
648 return 0;
651 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
652 struct tdp_iter *iter)
657 * Freeze the SPTE by setting it to a special,
658 * non-present value. This will stop other threads from
659 * immediately installing a present entry in its place
660 * before the TLBs are flushed.
662 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
663 if (ret)
664 return ret;
666 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
667 KVM_PAGES_PER_HPAGE(iter->level));
670 * No other thread can overwrite the removed SPTE as they
671 * must either wait on the MMU lock or use
672 * tdp_mmu_set_spte_atomic which will not overwrite the
673 * special removed SPTE value. No bookkeeping is needed
674 * here since the SPTE is going from non-present
675 * to non-present.
677 kvm_tdp_mmu_write_spte(iter->sptep, 0);
684 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
686 * @as_id: Address space ID, i.e. regular vs. SMM
687 * @sptep: Pointer to the SPTE
688 * @old_spte: The current value of the SPTE
689 * @new_spte: The new value that will be set for the SPTE
690 * @gfn: The base GFN that was (or will be) mapped by the SPTE
691 * @level: The level _containing_ the SPTE (its parent PT's level)
692 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
693 * of the page. Should be set unless handling an MMU
694 * notifier for access tracking. Leaving record_acc_track
695 * unset in that case prevents page accesses from being
696 * double counted.
697 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
698 * appropriate for the change being made. Should be set
699 * unless performing certain dirty logging operations.
700 * Leaving record_dirty_log unset in that case prevents page
701 * writes from being double counted.
703 static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
704 u64 old_spte, u64 new_spte, gfn_t gfn, int level,
705 bool record_acc_track, bool record_dirty_log)
707 lockdep_assert_held_write(&kvm->mmu_lock);
710 * No thread should be using this function to set SPTEs to or from the
711 * temporary removed SPTE value.
712 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
713 * should be used. If operating under the MMU lock in write mode, the
714 * use of the removed SPTE should not be necessary.
716 WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
718 kvm_tdp_mmu_write_spte(sptep, new_spte);
720 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
722 if (record_acc_track)
723 handle_changed_spte_acc_track(old_spte, new_spte, level);
724 if (record_dirty_log)
725 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
729 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
730 u64 new_spte, bool record_acc_track,
731 bool record_dirty_log)
733 WARN_ON_ONCE(iter->yielded);
735 __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
736 new_spte, iter->gfn, iter->level,
737 record_acc_track, record_dirty_log);
740 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
743 _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
746 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
747 struct tdp_iter *iter,
750 _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
753 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
754 struct tdp_iter *iter,
757 _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
760 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
761 for_each_tdp_pte(_iter, _root, _start, _end)
763 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
764 tdp_root_for_each_pte(_iter, _root, _start, _end) \
765 if (!is_shadow_present_pte(_iter.old_spte) || \
766 !is_last_spte(_iter.old_spte, _iter.level)) \
767 continue; \
768 else
770 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
771 for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
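/*
 * Example walk (illustrative only): kvm_tdp_mmu_get_walk() near the end of
 * this file uses the macro above to record every SPTE on the path to a
 * faulting GPA:
 *
 *	gfn_t gfn = addr >> PAGE_SHIFT;
 *
 *	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
 *		leaf = iter.level;
 *		sptes[leaf] = iter.old_spte;
 *	}
 *
 * The walk starts at the root referenced by mmu->root.hpa and descends only
 * through present, non-leaf SPTEs covering [gfn, gfn + 1).
 */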
774 * Yield if the MMU lock is contended or this thread needs to return control
775 * to the scheduler.
777 * If this function should yield and flush is set, it will perform a remote
778 * TLB flush before yielding.
780 * If this function yields, iter->yielded is set and the caller must skip to
781 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
782 * over the paging structures to allow the iterator to continue its traversal
783 * from the paging structure root.
785 * Returns true if this function yielded.
787 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
788 struct tdp_iter *iter,
789 bool flush, bool shared)
791 WARN_ON(iter->yielded);
793 /* Ensure forward progress has been made before yielding. */
794 if (iter->next_last_level_gfn == iter->yielded_gfn)
797 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
798 if (flush)
799 kvm_flush_remote_tlbs(kvm);
801 rcu_read_unlock();
803 if (shared)
804 cond_resched_rwlock_read(&kvm->mmu_lock);
805 else
806 cond_resched_rwlock_write(&kvm->mmu_lock);
808 rcu_read_lock();
810 WARN_ON(iter->gfn > iter->next_last_level_gfn);
812 iter->yielded = true;
815 return iter->yielded;
818 static inline gfn_t tdp_mmu_max_gfn_host(void)
821 * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
822 * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
823 * and so KVM will never install a SPTE for such addresses.
825 return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
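/*
 * Worked example (hypothetical numbers, for illustration only): with
 * shadow_phys_bits == 46 and PAGE_SHIFT == 12, the walk is bounded at
 * gfn 1ULL << 34, i.e. the first GFN beyond a 64TiB guest-physical
 * address space.
 */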
828 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
829 bool shared, int zap_level)
831 struct tdp_iter iter;
833 gfn_t end = tdp_mmu_max_gfn_host();
834 gfn_t start = 0;
836 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
837 retry:
838 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
839 continue;
841 if (!is_shadow_present_pte(iter.old_spte))
842 continue;
844 if (iter.level > zap_level)
845 continue;
847 if (!shared)
848 tdp_mmu_set_spte(kvm, &iter, 0);
849 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
850 goto retry;
854 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
859 * The root must have an elevated refcount so that it's reachable via
860 * mmu_notifier callbacks, which allows this path to yield and drop
861 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
862 * must drop all references to relevant pages prior to completing the
863 * callback. Dropping mmu_lock with an unreachable root would result
864 * in zapping SPTEs after a relevant mmu_notifier callback completes
865 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
866 * dirty accessed bits to the SPTE's associated struct page.
868 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
870 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
875 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
876 * split the zap into two passes. On the first pass, zap at the 1gb
877 * level, and then zap top-level SPs on the second pass. "1gb" is not
878 * arbitrary, as KVM must be able to zap a 1gb shadow page without
879 * inducing a stall to allow in-place replacement with a 1gb hugepage.
881 * Because zapping a SP recurses on its children, stepping down to
882 * PG_LEVEL_4K in the iterator itself is unnecessary.
884 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
885 __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
890 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
895 * This helper intentionally doesn't allow zapping a root shadow page,
896 * which doesn't have a parent page table and thus no associated entry.
898 if (WARN_ON_ONCE(!sp->ptep))
899 return false;
901 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
902 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
903 return false;
905 __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
906 sp->gfn, sp->role.level + 1, true, true);
908 return true;
912 * Zap leaf SPTEs for the range of gfns, [start, end). Returns true if SPTEs
913 * have been cleared and a TLB flush is needed before releasing the MMU lock.
915 * If can_yield is true, will release the MMU lock and reschedule if the
916 * scheduler needs the CPU or there is contention on the MMU lock. If this
917 * function cannot yield, it will not release the MMU lock or reschedule and
918 * the caller must ensure it does not supply too large a GFN range, or the
919 * operation can cause a soft lockup.
921 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
922 gfn_t start, gfn_t end, bool can_yield, bool flush)
924 struct tdp_iter iter;
926 end = min(end, tdp_mmu_max_gfn_host());
928 lockdep_assert_held_write(&kvm->mmu_lock);
932 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
933 if (can_yield &&
934 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
935 flush = false;
936 continue;
937 }
939 if (!is_shadow_present_pte(iter.old_spte) ||
940 !is_last_spte(iter.old_spte, iter.level))
943 tdp_mmu_set_spte(kvm, &iter, 0);
944 flush = true;
950 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
951 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
957 * Tears down the mappings for the range of gfns, [start, end), and frees the
958 * non-root pages mapping GFNs strictly within that range. Returns true if
959 * SPTEs have been cleared and a TLB flush is needed before releasing the
960 * MMU lock.
962 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
963 bool can_yield, bool flush)
965 struct kvm_mmu_page *root;
967 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
968 flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
970 return flush;
973 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
975 struct kvm_mmu_page *root;
979 * Zap all roots, including invalid roots, as all SPTEs must be dropped
980 * before returning to the caller. Zap directly even if the root is
981 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
982 * all that expensive and mmu_lock is already held, which means the
983 * worker has yielded, i.e. flushing the work instead of zapping here
984 * isn't guaranteed to be any faster.
986 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
987 * is being destroyed or the userspace VMM has exited. In both cases,
988 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
990 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
991 for_each_tdp_mmu_root_yield_safe(kvm, root, i)
992 tdp_mmu_zap_root(kvm, root, false);
997 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
998 * zap" completes.
1000 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1002 flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1006 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1007 * is about to be zapped, e.g. in response to a memslots update. The actual
1008 * zapping is performed asynchronously, so a reference is taken on all roots.
1009 * Using a separate workqueue makes it easy to ensure that the destruction is
1010 * performed before the "fast zap" completes, without keeping a separate list
1011 * of invalidated roots; the list is effectively the list of work items in
1012 * the workqueue.
1014 * Get a reference even if the root is already invalid, the asynchronous worker
1015 * assumes it was gifted a reference to the root it processes. Because mmu_lock
1016 * is held for write, it should be impossible to observe a root with zero refcount,
1017 * i.e. the list of roots cannot be stale.
1019 * This has essentially the same effect for the TDP MMU
1020 * as updating mmu_valid_gen does for the shadow MMU.
1022 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1024 struct kvm_mmu_page *root;
1026 lockdep_assert_held_write(&kvm->mmu_lock);
1027 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1028 if (!root->role.invalid &&
1029 !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1030 root->role.invalid = true;
1031 tdp_mmu_schedule_zap_root(kvm, root);
1037 * Installs a last-level SPTE to handle a TDP page fault.
1038 * (NPT/EPT violation/misconfiguration)
1040 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1041 struct kvm_page_fault *fault,
1042 struct tdp_iter *iter)
1044 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1045 u64 new_spte;
1046 int ret = RET_PF_FIXED;
1047 bool wrprot = false;
1049 WARN_ON(sp->role.level != fault->goal_level);
1050 if (unlikely(!fault->slot))
1051 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1052 else
1053 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1054 fault->pfn, iter->old_spte, fault->prefetch, true,
1055 fault->map_writable, &new_spte);
1057 if (new_spte == iter->old_spte)
1058 ret = RET_PF_SPURIOUS;
1059 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1060 return RET_PF_RETRY;
1061 else if (is_shadow_present_pte(iter->old_spte) &&
1062 !is_last_spte(iter->old_spte, iter->level))
1063 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1064 KVM_PAGES_PER_HPAGE(iter->level + 1));
1067 * If the page fault was caused by a write but the page is write
1068 * protected, emulation is needed. If the emulation was skipped,
1069 * the vCPU would have the same fault again.
1071 if (wrprot) {
1072 if (fault->write)
1073 ret = RET_PF_EMULATE;
1074 }
1076 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1077 if (unlikely(is_mmio_spte(new_spte))) {
1078 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1080 ret = RET_PF_EMULATE;
1082 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1083 rcu_dereference(iter->sptep));
1087 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
1088 * consistent with legacy MMU behavior.
1090 if (ret != RET_PF_SPURIOUS)
1091 vcpu->stat.pf_fixed++;
1097 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1098 * provided page table.
1100 * @kvm: kvm instance
1101 * @iter: a tdp_iter instance currently on the SPTE that should be set
1102 * @sp: The new TDP page table to install.
1103 * @account_nx: True if this page table is being installed to split a
1104 * non-executable huge page.
1105 * @shared: This operation is running under the MMU lock in read mode.
1107 * Returns: 0 if the new page table was installed. Non-0 if the page table
1108 * could not be installed (e.g. the atomic compare-exchange failed).
1110 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1111 struct kvm_mmu_page *sp, bool account_nx,
1114 u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
1118 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1122 tdp_mmu_set_spte(kvm, iter, spte);
1125 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1126 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1127 if (account_nx)
1128 account_huge_nx_page(kvm, sp);
1129 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1135 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1136 * page tables and SPTEs to translate the faulting guest physical address.
1138 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1140 struct kvm_mmu *mmu = vcpu->arch.mmu;
1141 struct tdp_iter iter;
1142 struct kvm_mmu_page *sp;
1145 kvm_mmu_hugepage_adjust(vcpu, fault);
1147 trace_kvm_mmu_spte_requested(fault);
1151 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1152 if (fault->nx_huge_page_workaround_enabled)
1153 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1155 if (iter.level == fault->goal_level)
1159 * If there is an SPTE mapping a large page at a higher level
1160 * than the target, that SPTE must be cleared and replaced
1161 * with a non-leaf SPTE.
1163 if (is_shadow_present_pte(iter.old_spte) &&
1164 is_large_pte(iter.old_spte)) {
1165 if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1169 * The iter must explicitly re-read the spte here
1170 * because the new value informs the !present
1173 iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
1176 if (!is_shadow_present_pte(iter.old_spte)) {
1177 bool account_nx = fault->huge_page_disallowed &&
1178 fault->req_level >= iter.level;
1181 * If SPTE has been frozen by another thread, just
1182 * give up and retry, avoiding unnecessary page table
1183 * allocation and free.
1185 if (is_removed_spte(iter.old_spte))
1188 sp = tdp_mmu_alloc_sp(vcpu);
1189 tdp_mmu_init_child_sp(sp, &iter);
1191 if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
1192 tdp_mmu_free_sp(sp);
1199 * Force the guest to retry the access if the upper level SPTEs aren't
1200 * in place, or if the target leaf SPTE is frozen by another CPU.
1202 if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
1204 return RET_PF_RETRY;
1207 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1213 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1216 return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1217 range->end, range->may_block, flush);
1220 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1221 struct kvm_gfn_range *range);
1223 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1224 struct kvm_gfn_range *range,
1225 tdp_handler_t handler)
1227 struct kvm_mmu_page *root;
1228 struct tdp_iter iter;
1232 * Don't support rescheduling, none of the MMU notifiers that funnel
1233 * into this helper allow blocking; it'd be dead, wasteful code.
1235 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1238 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1239 ret |= handler(kvm, &iter, range);
1248 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return non-zero
1249 * if any of the GFNs in the range have been accessed.
1251 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1252 struct kvm_gfn_range *range)
1256 /* If we have a non-accessed entry we don't need to change the pte. */
1257 if (!is_accessed_spte(iter->old_spte))
1260 new_spte = iter->old_spte;
1262 if (spte_ad_enabled(new_spte)) {
1263 new_spte &= ~shadow_accessed_mask;
1264 } else {
1266 * Capture the dirty status of the page, so that it doesn't get
1267 * lost when the SPTE is marked for access tracking.
1269 if (is_writable_pte(new_spte))
1270 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1272 new_spte = mark_spte_for_access_track(new_spte);
1275 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1280 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1282 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1285 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1286 struct kvm_gfn_range *range)
1288 return is_accessed_spte(iter->old_spte);
1291 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1293 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1296 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1297 struct kvm_gfn_range *range)
1301 /* Huge pages aren't expected to be modified without first being zapped. */
1302 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1304 if (iter->level != PG_LEVEL_4K ||
1305 !is_shadow_present_pte(iter->old_spte))
1309 * Note, when changing a read-only SPTE, it's not strictly necessary to
1310 * zero the SPTE before setting the new PFN, but doing so preserves the
1311 * invariant that the PFN of a present leaf SPTE can never change.
1312 * See __handle_changed_spte().
1314 tdp_mmu_set_spte(kvm, iter, 0);
1316 if (!pte_write(range->pte)) {
1317 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1318 pte_pfn(range->pte));
1320 tdp_mmu_set_spte(kvm, iter, new_spte);
1327 * Handle the changed_pte MMU notifier for the TDP MMU.
1328 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1329 * notifier.
1330 * Returns non-zero if a flush is needed before releasing the MMU lock.
1332 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1335 * No need to handle the remote TLB flush under RCU protection, the
1336 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1337 * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
1339 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1343 * Remove write access from all SPTEs at or above min_level that map GFNs
1344 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1345 * be flushed.
1347 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1348 gfn_t start, gfn_t end, int min_level)
1350 struct tdp_iter iter;
1352 bool spte_set = false;
1356 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1358 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1360 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1363 if (!is_shadow_present_pte(iter.old_spte) ||
1364 !is_last_spte(iter.old_spte, iter.level) ||
1365 !(iter.old_spte & PT_WRITABLE_MASK))
1368 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1370 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1381 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1382 * only affect leaf SPTEs down to min_level.
1383 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1385 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1386 const struct kvm_memory_slot *slot, int min_level)
1388 struct kvm_mmu_page *root;
1389 bool spte_set = false;
1391 lockdep_assert_held_read(&kvm->mmu_lock);
1393 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1394 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1395 slot->base_gfn + slot->npages, min_level);
1400 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1402 struct kvm_mmu_page *sp;
1406 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1410 sp->spt = (void *)__get_free_page(gfp);
1411 if (!sp->spt) {
1412 kmem_cache_free(mmu_page_header_cache, sp);
1413 return NULL;
1414 }
1419 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1420 struct tdp_iter *iter,
1423 struct kvm_mmu_page *sp;
1426 * Since we are allocating while under the MMU lock we have to be
1427 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1428 * reclaim and to avoid making any filesystem callbacks (which can end
1429 * up invoking KVM MMU notifiers, resulting in a deadlock).
1431 * If this allocation fails we drop the lock and retry with reclaim
1432 * allowed.
1434 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1435 if (sp)
1436 return sp;
1440 if (shared)
1441 read_unlock(&kvm->mmu_lock);
1442 else
1443 write_unlock(&kvm->mmu_lock);
1445 iter->yielded = true;
1446 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1448 if (shared)
1449 read_lock(&kvm->mmu_lock);
1450 else
1451 write_lock(&kvm->mmu_lock);
1458 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1459 struct kvm_mmu_page *sp, bool shared)
1461 const u64 huge_spte = iter->old_spte;
1462 const int level = iter->level;
1465 tdp_mmu_init_child_sp(sp, iter);
1468 * No need for atomics when writing to sp->spt since the page table has
1469 * not been linked in yet and thus is not reachable from any other CPU.
1471 for (i = 0; i < PT64_ENT_PER_PAGE; i++)
1472 sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
1475 * Replace the huge spte with a pointer to the populated lower level
1476 * page table. Since we are making this change without a TLB flush vCPUs
1477 * will see a mix of the split mappings and the original huge mapping,
1478 * depending on what's currently in their TLB. This is fine from a
1479 * correctness standpoint since the translation will be the same either
1482 ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
1487 * tdp_mmu_link_sp() will handle subtracting the huge page we
1488 * are overwriting from the page stats. But we have to manually update
1489 * the page stats with the new present child pages.
1491 kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
1494 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1498 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1499 struct kvm_mmu_page *root,
1500 gfn_t start, gfn_t end,
1501 int target_level, bool shared)
1503 struct kvm_mmu_page *sp = NULL;
1504 struct tdp_iter iter;
1510 * Traverse the page table splitting all huge pages above the target
1511 * level into one lower level. For example, if we encounter a 1GB page
1512 * we split it into 512 2MB pages.
1514 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1515 * to visit an SPTE before ever visiting its children, which means we
1516 * will correctly recursively split huge pages that are more than one
1517 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1518 * and then splitting each of those to 512 4KB pages).
1520 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1522 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1525 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1529 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1532 trace_kvm_mmu_split_huge_page(iter.gfn,
1542 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1551 * It's possible to exit the loop having never used the last sp if, for
1552 * example, a vCPU doing HugePage NX splitting wins the race and
1553 * installs its own sp in place of the last sp we tried to split.
1555 if (sp)
1556 tdp_mmu_free_sp(sp);
1563 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1565 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1566 const struct kvm_memory_slot *slot,
1567 gfn_t start, gfn_t end,
1568 int target_level, bool shared)
1570 struct kvm_mmu_page *root;
1573 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1575 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1576 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1577 if (r) {
1578 kvm_tdp_mmu_put_root(kvm, root, shared);
1579 break;
1585 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1586 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1587 * If AD bits are not enabled, this will require clearing the writable bit on
1588 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1589 * be flushed.
1591 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1592 gfn_t start, gfn_t end)
1594 struct tdp_iter iter;
1596 bool spte_set = false;
1600 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1602 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1605 if (!is_shadow_present_pte(iter.old_spte))
1608 if (spte_ad_need_write_protect(iter.old_spte)) {
1609 if (is_writable_pte(iter.old_spte))
1610 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1611 else
1612 continue;
1613 } else {
1614 if (iter.old_spte & shadow_dirty_mask)
1615 new_spte = iter.old_spte & ~shadow_dirty_mask;
1616 else
1617 continue;
1620 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1631 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1632 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1633 * If AD bits are not enabled, this will require clearing the writable bit on
1634 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1635 * be flushed.
1637 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1638 const struct kvm_memory_slot *slot)
1640 struct kvm_mmu_page *root;
1641 bool spte_set = false;
1643 lockdep_assert_held_read(&kvm->mmu_lock);
1645 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1646 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1647 slot->base_gfn + slot->npages);
1653 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1654 * set in mask, starting at gfn. The given memslot is expected to contain all
1655 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1656 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1657 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
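/*
 * Worked example (hypothetical values, for illustration only): with
 * gfn == 0x1000 and mask == 0b101, only the SPTEs mapping GFNs 0x1000 and
 * 0x1002 are cleaned. Each handled GFN clears its bit via
 * mask &= ~(1UL << (iter.gfn - gfn)), so the walk can stop early once mask
 * reaches zero.
 */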
1659 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1660 gfn_t gfn, unsigned long mask, bool wrprot)
1662 struct tdp_iter iter;
1667 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1668 gfn + BITS_PER_LONG) {
1672 if (iter.level > PG_LEVEL_4K ||
1673 !(mask & (1UL << (iter.gfn - gfn))))
1676 mask &= ~(1UL << (iter.gfn - gfn));
1678 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1679 if (is_writable_pte(iter.old_spte))
1680 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1684 if (iter.old_spte & shadow_dirty_mask)
1685 new_spte = iter.old_spte & ~shadow_dirty_mask;
1690 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1697 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1698 * set in mask, starting at gfn. The given memslot is expected to contain all
1699 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1700 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1701 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1703 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1704 struct kvm_memory_slot *slot,
1705 gfn_t gfn, unsigned long mask,
1708 struct kvm_mmu_page *root;
1710 lockdep_assert_held_write(&kvm->mmu_lock);
1711 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1712 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1716 * Clear leaf entries which could be replaced by large mappings, for
1717 * GFNs within the slot.
1719 static void zap_collapsible_spte_range(struct kvm *kvm,
1720 struct kvm_mmu_page *root,
1721 const struct kvm_memory_slot *slot)
1723 gfn_t start = slot->base_gfn;
1724 gfn_t end = start + slot->npages;
1725 struct tdp_iter iter;
1730 tdp_root_for_each_pte(iter, root, start, end) {
1732 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1735 if (!is_shadow_present_pte(iter.old_spte) ||
1736 !is_last_spte(iter.old_spte, iter.level))
1739 pfn = spte_to_pfn(iter.old_spte);
1740 if (kvm_is_reserved_pfn(pfn) ||
1741 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1742 pfn, PG_LEVEL_NUM))
1743 continue;
1745 /* Note, a successful atomic zap also does a remote TLB flush. */
1746 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1754 * Clear leaf entries which could
1755 * be replaced by large mappings, for GFNs within the slot.
1757 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1758 const struct kvm_memory_slot *slot)
1760 struct kvm_mmu_page *root;
1762 lockdep_assert_held_read(&kvm->mmu_lock);
1764 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1765 zap_collapsible_spte_range(kvm, root, slot);
1769 * Removes write access on the last level SPTE mapping this GFN and unsets the
1770 * MMU-writable bit to ensure future writes continue to be intercepted.
1771 * Returns true if an SPTE was set and a TLB flush is needed.
1773 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1774 gfn_t gfn, int min_level)
1776 struct tdp_iter iter;
1778 bool spte_set = false;
1780 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1784 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1785 if (!is_shadow_present_pte(iter.old_spte) ||
1786 !is_last_spte(iter.old_spte, iter.level))
1789 new_spte = iter.old_spte &
1790 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1792 if (new_spte == iter.old_spte)
1793 break;
1795 tdp_mmu_set_spte(kvm, &iter, new_spte);
1796 spte_set = true;
1805 * Removes write access on the last level SPTE mapping this GFN and unsets the
1806 * MMU-writable bit to ensure future writes continue to be intercepted.
1807 * Returns true if an SPTE was set and a TLB flush is needed.
1809 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1810 struct kvm_memory_slot *slot, gfn_t gfn,
1813 struct kvm_mmu_page *root;
1814 bool spte_set = false;
1816 lockdep_assert_held_write(&kvm->mmu_lock);
1817 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1818 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1824 * Return the level of the lowest level SPTE added to sptes.
1825 * That SPTE may be non-present.
1827 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
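/*
 * Illustrative call sequence (sketch only; the real caller lives in mmu.c):
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 *
 * The begin/end pair provides the RCU read-side critical section that keeps
 * the visited page tables from being freed mid-walk.
 */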
1829 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1832 struct tdp_iter iter;
1833 struct kvm_mmu *mmu = vcpu->arch.mmu;
1834 gfn_t gfn = addr >> PAGE_SHIFT;
1835 int leaf = -1;
1837 *root_level = vcpu->arch.mmu->shadow_root_level;
1839 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1840 leaf = iter.level;
1841 sptes[leaf] = iter.old_spte;
1842 }
1844 return leaf;
1848 * Returns the last level spte pointer of the shadow page walk for the given
1849 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1850 * walk could be performed, returns NULL and *spte does not contain valid data.
1852 * Contract:
1853 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1854 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1856 * WARNING: This function is only intended to be called during fast_page_fault.
1858 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1861 struct tdp_iter iter;
1862 struct kvm_mmu *mmu = vcpu->arch.mmu;
1863 gfn_t gfn = addr >> PAGE_SHIFT;
1864 tdp_ptep_t sptep = NULL;
1866 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1867 *spte = iter.old_spte;
1868 sptep = iter.sptep;
1872 * Perform the rcu_dereference to get the raw spte pointer value since
1873 * we are passing it up to fast_page_fault, which is shared with the
1874 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1875 * annotation.
1877 * This is safe since fast_page_fault obeys the contracts of this
1878 * function as well as all TDP MMU contracts around modifying SPTEs
1879 * outside of mmu_lock.
1881 return rcu_dereference(sptep);