arch/x86/kvm/mmu/tdp_mmu.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19         if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20                 return;
21
22         /* This should not be changed for the lifetime of the VM. */
23         kvm->arch.tdp_mmu_enabled = true;
24
25         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26         spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 }
29
30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
31 {
32         if (!kvm->arch.tdp_mmu_enabled)
33                 return;
34
35         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
36
37         /*
38          * Ensure that all the outstanding RCU callbacks to free shadow pages
39          * can run before the VM is torn down.
40          */
41         rcu_barrier();
42 }
43
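/* Drop a reference to @root, freeing it if this was the last reference. */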
44 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
45 {
46         if (kvm_mmu_put_root(kvm, root))
47                 kvm_tdp_mmu_free_root(kvm, root);
48 }
49
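/*
 * Returns true and takes a reference on @root if it is a valid entry on the
 * list of TDP MMU roots; returns false once iteration reaches the list head.
 */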
50 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
51                                            struct kvm_mmu_page *root)
52 {
53         lockdep_assert_held_write(&kvm->mmu_lock);
54
55         if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
56                 return false;
57
58         kvm_mmu_get_root(kvm, root);
59         return true;
60
61 }
62
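/*
 * Advance to the next root on the list and drop the reference the yield-safe
 * iterator held on the current root.
 */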
63 static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
64                                                      struct kvm_mmu_page *root)
65 {
66         struct kvm_mmu_page *next_root;
67
68         next_root = list_next_entry(root, link);
69         tdp_mmu_put_root(kvm, root);
70         return next_root;
71 }
72
73 /*
74  * Note: this iterator gets and puts references to the roots it iterates over.
75  * This makes it safe to release the MMU lock and yield within the loop, but
76  * if exiting the loop early, the caller must drop the reference to the most
77  * recent root. (Unless keeping a live reference is desirable.)
78  */
79 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                           \
80         for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,        \
81                                       typeof(*_root), link);            \
82              tdp_mmu_next_root_valid(_kvm, _root);                      \
83              _root = tdp_mmu_next_root(_kvm, _root))
84
85 #define for_each_tdp_mmu_root(_kvm, _root)                              \
86         list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
87
88 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
89                           gfn_t start, gfn_t end, bool can_yield);
90
91 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
92 {
93         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
94
95         lockdep_assert_held_write(&kvm->mmu_lock);
96
97         WARN_ON(root->root_count);
98         WARN_ON(!root->tdp_mmu_page);
99
100         list_del(&root->link);
101
102         zap_gfn_range(kvm, root, 0, max_gfn, false);
103
104         free_page((unsigned long)root->spt);
105         kmem_cache_free(mmu_page_header_cache, root);
106 }
107
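/* Compute the page role used for TDP MMU pages at the given level. */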
108 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
109                                                    int level)
110 {
111         union kvm_mmu_page_role role;
112
113         role = vcpu->arch.mmu->mmu_role.base;
114         role.level = level;
115         role.direct = true;
116         role.gpte_is_8_bytes = true;
117         role.access = ACC_ALL;
118
119         return role;
120 }
121
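/*
 * Allocate a TDP MMU page table page and its struct kvm_mmu_page from the
 * vCPU's memory caches and initialize the shadow-page metadata.
 */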
122 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
123                                                int level)
124 {
125         struct kvm_mmu_page *sp;
126
127         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
128         sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
129         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
130
131         sp->role.word = page_role_for_level(vcpu, level).word;
132         sp->gfn = gfn;
133         sp->tdp_mmu_page = true;
134
135         trace_kvm_mmu_get_page(sp, true);
136
137         return sp;
138 }
139
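/*
 * Return a root matching the vCPU's current MMU role, taking a reference on
 * an existing root or allocating and linking a new one if none exists.
 */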
140 static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
141 {
142         union kvm_mmu_page_role role;
143         struct kvm *kvm = vcpu->kvm;
144         struct kvm_mmu_page *root;
145
146         role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
147
148         write_lock(&kvm->mmu_lock);
149
150         /* Check for an existing root before allocating a new one. */
151         for_each_tdp_mmu_root(kvm, root) {
152                 if (root->role.word == role.word) {
153                         kvm_mmu_get_root(kvm, root);
154                         write_unlock(&kvm->mmu_lock);
155                         return root;
156                 }
157         }
158
159         root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
160         root->root_count = 1;
161
162         list_add(&root->link, &kvm->arch.tdp_mmu_roots);
163
164         write_unlock(&kvm->mmu_lock);
165
166         return root;
167 }
168
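/*
 * Return the physical address of the vCPU's TDP MMU root, for use as the root
 * of the hardware page tables.
 */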
169 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
170 {
171         struct kvm_mmu_page *root;
172
173         root = get_tdp_mmu_vcpu_root(vcpu);
174         if (!root)
175                 return INVALID_PAGE;
176
177         return __pa(root->spt);
178 }
179
180 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
181 {
182         free_page((unsigned long)sp->spt);
183         kmem_cache_free(mmu_page_header_cache, sp);
184 }
185
186 /*
187  * This is called through call_rcu in order to free TDP page table memory
188  * safely with respect to other kernel threads that may be operating on
189  * the memory.
 190  * Because TDP MMU page table memory is only accessed in an RCU read-side
 191  * critical section and is freed only after an RCU grace period, lockless
 192  * walkers are guaranteed not to use the memory after it has been freed.
193  */
194 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
195 {
196         struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
197                                                rcu_head);
198
199         tdp_mmu_free_sp(sp);
200 }
201
202 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
203                                 u64 old_spte, u64 new_spte, int level,
204                                 bool shared);
205
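/* Return the address space ID (SMM vs. non-SMM) of the given shadow page. */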
206 static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
207 {
208         return sp->role.smm ? 1 : 0;
209 }
210
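/*
 * Mark the old PFN accessed if a present leaf SPTE's accessed state was lost,
 * either because the accessed bit was cleared or the SPTE now maps a new PFN.
 */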
211 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
212 {
213         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
214
215         if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
216                 return;
217
218         if (is_accessed_spte(old_spte) &&
219             (!is_accessed_spte(new_spte) || pfn_changed))
220                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
221 }
222
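/*
 * Mark the GFN dirty in its memslot's dirty bitmap when a 4K SPTE becomes
 * writable, since the guest may then dirty the page without faulting.
 */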
223 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
224                                           u64 old_spte, u64 new_spte, int level)
225 {
226         bool pfn_changed;
227         struct kvm_memory_slot *slot;
228
229         if (level > PG_LEVEL_4K)
230                 return;
231
232         pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
233
234         if ((!is_writable_pte(old_spte) || pfn_changed) &&
235             is_writable_pte(new_spte)) {
236                 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
237                 mark_page_dirty_in_slot(kvm, slot, gfn);
238         }
239 }
240
241 /**
242  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
243  *
244  * @kvm: kvm instance
245  * @sp: the new page
246  * @shared: This operation may not be running under the exclusive use of
247  *          the MMU lock and the operation must synchronize with other
248  *          threads that might be adding or removing pages.
249  * @account_nx: This page replaces a NX large page and should be marked for
250  *              eventual reclaim.
251  */
252 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
253                               bool shared, bool account_nx)
254 {
255         if (shared)
256                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
257         else
258                 lockdep_assert_held_write(&kvm->mmu_lock);
259
260         list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
261         if (account_nx)
262                 account_huge_nx_page(kvm, sp);
263
264         if (shared)
265                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
266 }
267
268 /**
269  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
270  *
271  * @kvm: kvm instance
272  * @sp: the page to be removed
273  * @shared: This operation may not be running under the exclusive use of
274  *          the MMU lock and the operation must synchronize with other
275  *          threads that might be adding or removing pages.
276  */
277 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
278                                 bool shared)
279 {
280         if (shared)
281                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
282         else
283                 lockdep_assert_held_write(&kvm->mmu_lock);
284
285         list_del(&sp->link);
286         if (sp->lpage_disallowed)
287                 unaccount_huge_nx_page(kvm, sp);
288
289         if (shared)
290                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
291 }
292
293 /**
294  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
295  *
296  * @kvm: kvm instance
297  * @pt: the page removed from the paging structure
298  * @shared: This operation may not be running under the exclusive use
299  *          of the MMU lock and the operation must synchronize with other
300  *          threads that might be modifying SPTEs.
301  *
302  * Given a page table that has been removed from the TDP paging structure,
303  * iterates through the page table to clear SPTEs and free child page tables.
304  */
305 static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
306                                         bool shared)
307 {
308         struct kvm_mmu_page *sp = sptep_to_sp(pt);
309         int level = sp->role.level;
310         gfn_t base_gfn = sp->gfn;
311         u64 old_child_spte;
312         u64 *sptep;
313         gfn_t gfn;
314         int i;
315
316         trace_kvm_mmu_prepare_zap_page(sp);
317
318         tdp_mmu_unlink_page(kvm, sp, shared);
319
320         for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
321                 sptep = pt + i;
322                 gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
323
324                 if (shared) {
 325                         /*
 326                          * Set the SPTE to a nonpresent value that other
 327                          * threads will not overwrite. If the SPTE was
 328                          * already marked as removed then another thread
 329                          * handling a page fault could overwrite it, so
 330                          * retry the exchange until the SPTE is changed
 331                          * from some other value to the removed value.
 332                          */
333                         for (;;) {
334                                 old_child_spte = xchg(sptep, REMOVED_SPTE);
335                                 if (!is_removed_spte(old_child_spte))
336                                         break;
337                                 cpu_relax();
338                         }
339                 } else {
340                         /*
341                          * If the SPTE is not MMU-present, there is no backing
342                          * page associated with the SPTE and so no side effects
343                          * that need to be recorded, and exclusive ownership of
344                          * mmu_lock ensures the SPTE can't be made present.
345                          * Note, zapping MMIO SPTEs is also unnecessary as they
346                          * are guarded by the memslots generation, not by being
347                          * unreachable.
348                          */
349                         old_child_spte = READ_ONCE(*sptep);
350                         if (!is_shadow_present_pte(old_child_spte))
351                                 continue;
352
353                         /*
354                          * Marking the SPTE as a removed SPTE is not
355                          * strictly necessary here as the MMU lock will
356                          * stop other threads from concurrently modifying
357                          * this SPTE. Using the removed SPTE value keeps
358                          * the two branches consistent and simplifies
359                          * the function.
360                          */
361                         WRITE_ONCE(*sptep, REMOVED_SPTE);
362                 }
363                 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
364                                     old_child_spte, REMOVED_SPTE, level - 1,
365                                     shared);
366         }
367
368         kvm_flush_remote_tlbs_with_address(kvm, gfn,
369                                            KVM_PAGES_PER_HPAGE(level));
370
371         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
372 }
373
374 /**
375  * handle_changed_spte - handle bookkeeping associated with an SPTE change
376  * @kvm: kvm instance
377  * @as_id: the address space of the paging structure the SPTE was a part of
378  * @gfn: the base GFN that was mapped by the SPTE
379  * @old_spte: The value of the SPTE before the change
380  * @new_spte: The value of the SPTE after the change
381  * @level: the level of the PT the SPTE is part of in the paging structure
382  * @shared: This operation may not be running under the exclusive use of
383  *          the MMU lock and the operation must synchronize with other
384  *          threads that might be modifying SPTEs.
385  *
386  * Handle bookkeeping that might result from the modification of a SPTE.
387  * This function must be called for all TDP SPTE modifications.
388  */
389 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
390                                   u64 old_spte, u64 new_spte, int level,
391                                   bool shared)
392 {
393         bool was_present = is_shadow_present_pte(old_spte);
394         bool is_present = is_shadow_present_pte(new_spte);
395         bool was_leaf = was_present && is_last_spte(old_spte, level);
396         bool is_leaf = is_present && is_last_spte(new_spte, level);
397         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
398
399         WARN_ON(level > PT64_ROOT_MAX_LEVEL);
400         WARN_ON(level < PG_LEVEL_4K);
401         WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
402
403         /*
404          * If this warning were to trigger it would indicate that there was a
405          * missing MMU notifier or a race with some notifier handler.
406          * A present, leaf SPTE should never be directly replaced with another
 407          * present leaf SPTE pointing to a different PFN. A notifier handler
408          * should be zapping the SPTE before the main MM's page table is
409          * changed, or the SPTE should be zeroed, and the TLBs flushed by the
410          * thread before replacement.
411          */
412         if (was_leaf && is_leaf && pfn_changed) {
413                 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
414                        "SPTE with another present leaf SPTE mapping a\n"
415                        "different PFN!\n"
416                        "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
417                        as_id, gfn, old_spte, new_spte, level);
418
419                 /*
420                  * Crash the host to prevent error propagation and guest data
 421                  * corruption.
422                  */
423                 BUG();
424         }
425
426         if (old_spte == new_spte)
427                 return;
428
429         trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
430
431         /*
432          * The only times a SPTE should be changed from a non-present to
433          * non-present state is when an MMIO entry is installed/modified/
434          * removed. In that case, there is nothing to do here.
435          */
436         if (!was_present && !is_present) {
437                 /*
438                  * If this change does not involve a MMIO SPTE or removed SPTE,
439                  * it is unexpected. Log the change, though it should not
440                  * impact the guest since both the former and current SPTEs
441                  * are nonpresent.
442                  */
443                 if (WARN_ON(!is_mmio_spte(old_spte) &&
444                             !is_mmio_spte(new_spte) &&
445                             !is_removed_spte(new_spte)))
446                         pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
447                                "should not be replaced with another,\n"
448                                "different nonpresent SPTE, unless one or both\n"
449                                "are MMIO SPTEs, or the new SPTE is\n"
450                                "a temporary removed SPTE.\n"
451                                "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
452                                as_id, gfn, old_spte, new_spte, level);
453                 return;
454         }
455
456
457         if (was_leaf && is_dirty_spte(old_spte) &&
458             (!is_dirty_spte(new_spte) || pfn_changed))
459                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
460
461         /*
462          * Recursively handle child PTs if the change removed a subtree from
463          * the paging structure.
464          */
465         if (was_present && !was_leaf && (pfn_changed || !is_present))
466                 handle_removed_tdp_mmu_page(kvm,
467                                 spte_to_child_pt(old_spte, level), shared);
468 }
469
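/*
 * Wrapper around __handle_changed_spte() that also updates accessed-state
 * tracking and the dirty log for the change.
 */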
470 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
471                                 u64 old_spte, u64 new_spte, int level,
472                                 bool shared)
473 {
474         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
475                               shared);
476         handle_changed_spte_acc_track(old_spte, new_spte, level);
477         handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
478                                       new_spte, level);
479 }
480
481 /*
482  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
483  * associated bookkeeping
484  *
485  * @kvm: kvm instance
486  * @iter: a tdp_iter instance currently on the SPTE that should be set
487  * @new_spte: The value the SPTE should be set to
488  * Returns: true if the SPTE was set, false if it was not. If false is returned,
489  *          this function will have no side-effects.
490  */
491 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
492                                            struct tdp_iter *iter,
493                                            u64 new_spte)
494 {
495         u64 *root_pt = tdp_iter_root_pt(iter);
496         struct kvm_mmu_page *root = sptep_to_sp(root_pt);
497         int as_id = kvm_mmu_page_as_id(root);
498
499         lockdep_assert_held_read(&kvm->mmu_lock);
500
501         /*
502          * Do not change removed SPTEs. Only the thread that froze the SPTE
503          * may modify it.
504          */
505         if (iter->old_spte == REMOVED_SPTE)
506                 return false;
507
508         if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
509                       new_spte) != iter->old_spte)
510                 return false;
511
512         handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
513                             iter->level, true);
514
515         return true;
516 }
517
518 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
519                                            struct tdp_iter *iter)
520 {
521         /*
522          * Freeze the SPTE by setting it to a special,
523          * non-present value. This will stop other threads from
524          * immediately installing a present entry in its place
525          * before the TLBs are flushed.
526          */
527         if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
528                 return false;
529
530         kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
531                                            KVM_PAGES_PER_HPAGE(iter->level));
532
533         /*
534          * No other thread can overwrite the removed SPTE as they
535          * must either wait on the MMU lock or use
 536          * tdp_mmu_set_spte_atomic which will not overwrite the
537          * special removed SPTE value. No bookkeeping is needed
538          * here since the SPTE is going from non-present
539          * to non-present.
540          */
 541         WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
542
543         return true;
544 }
545
546
547 /*
548  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
549  * @kvm: kvm instance
550  * @iter: a tdp_iter instance currently on the SPTE that should be set
551  * @new_spte: The value the SPTE should be set to
552  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
553  *                    of the page. Should be set unless handling an MMU
554  *                    notifier for access tracking. Leaving record_acc_track
555  *                    unset in that case prevents page accesses from being
556  *                    double counted.
557  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
558  *                    appropriate for the change being made. Should be set
559  *                    unless performing certain dirty logging operations.
560  *                    Leaving record_dirty_log unset in that case prevents page
561  *                    writes from being double counted.
562  */
563 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
564                                       u64 new_spte, bool record_acc_track,
565                                       bool record_dirty_log)
566 {
567         tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
568         struct kvm_mmu_page *root = sptep_to_sp(root_pt);
569         int as_id = kvm_mmu_page_as_id(root);
570
571         lockdep_assert_held_write(&kvm->mmu_lock);
572
573         /*
574          * No thread should be using this function to set SPTEs to the
575          * temporary removed SPTE value.
576          * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
577          * should be used. If operating under the MMU lock in write mode, the
578          * use of the removed SPTE should not be necessary.
579          */
580         WARN_ON(iter->old_spte == REMOVED_SPTE);
581
582         WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
583
584         __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
585                               iter->level, false);
586         if (record_acc_track)
587                 handle_changed_spte_acc_track(iter->old_spte, new_spte,
588                                               iter->level);
589         if (record_dirty_log)
590                 handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
591                                               iter->old_spte, new_spte,
592                                               iter->level);
593 }
594
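/*
 * Wrappers around __tdp_mmu_set_spte() that record both access tracking and
 * dirty logging, skip only access tracking, or skip only dirty logging.
 */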
595 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
596                                     u64 new_spte)
597 {
598         __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
599 }
600
601 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
602                                                  struct tdp_iter *iter,
603                                                  u64 new_spte)
604 {
605         __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
606 }
607
608 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
609                                                  struct tdp_iter *iter,
610                                                  u64 new_spte)
611 {
612         __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
613 }
614
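/*
 * Iterators over the SPTEs mapping GFNs [_start, _end): all SPTEs under a
 * root, only present leaf SPTEs under a root, or all SPTEs under the vCPU's
 * current root.
 */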
615 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
616         for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
617
618 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)  \
619         tdp_root_for_each_pte(_iter, _root, _start, _end)               \
620                 if (!is_shadow_present_pte(_iter.old_spte) ||           \
621                     !is_last_spte(_iter.old_spte, _iter.level))         \
622                         continue;                                       \
623                 else
624
625 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
626         for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
627                          _mmu->shadow_root_level, _start, _end)
628
629 /*
630  * Yield if the MMU lock is contended or this thread needs to return control
631  * to the scheduler.
632  *
633  * If this function should yield and flush is set, it will perform a remote
634  * TLB flush before yielding.
635  *
636  * If this function yields, it will also reset the tdp_iter's walk over the
637  * paging structure and the calling function should skip to the next
638  * iteration to allow the iterator to continue its traversal from the
639  * paging structure root.
640  *
641  * Return true if this function yielded and the iterator's traversal was reset.
642  * Return false if a yield was not needed.
643  */
644 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
645                                              struct tdp_iter *iter, bool flush)
646 {
647         /* Ensure forward progress has been made before yielding. */
648         if (iter->next_last_level_gfn == iter->yielded_gfn)
649                 return false;
650
651         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
652                 rcu_read_unlock();
653
654                 if (flush)
655                         kvm_flush_remote_tlbs(kvm);
656
657                 cond_resched_rwlock_write(&kvm->mmu_lock);
658                 rcu_read_lock();
659
660                 WARN_ON(iter->gfn > iter->next_last_level_gfn);
661
662                 tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
663                                iter->root_level, iter->min_level,
664                                iter->next_last_level_gfn);
665
666                 return true;
667         }
668
669         return false;
670 }
671
672 /*
673  * Tears down the mappings for the range of gfns, [start, end), and frees the
674  * non-root pages mapping GFNs strictly within that range. Returns true if
675  * SPTEs have been cleared and a TLB flush is needed before releasing the
676  * MMU lock.
677  * If can_yield is true, will release the MMU lock and reschedule if the
678  * scheduler needs the CPU or there is contention on the MMU lock. If this
679  * function cannot yield, it will not release the MMU lock or reschedule and
680  * the caller must ensure it does not supply too large a GFN range, or the
681  * operation can cause a soft lockup.
682  */
683 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
684                           gfn_t start, gfn_t end, bool can_yield)
685 {
686         struct tdp_iter iter;
687         bool flush_needed = false;
688
689         rcu_read_lock();
690
691         tdp_root_for_each_pte(iter, root, start, end) {
692                 if (can_yield &&
693                     tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
694                         flush_needed = false;
695                         continue;
696                 }
697
698                 if (!is_shadow_present_pte(iter.old_spte))
699                         continue;
700
701                 /*
702                  * If this is a non-last-level SPTE that covers a larger range
703                  * than should be zapped, continue, and zap the mappings at a
704                  * lower level.
705                  */
706                 if ((iter.gfn < start ||
707                      iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
708                     !is_last_spte(iter.old_spte, iter.level))
709                         continue;
710
711                 tdp_mmu_set_spte(kvm, &iter, 0);
712                 flush_needed = true;
713         }
714
715         rcu_read_unlock();
716         return flush_needed;
717 }
718
719 /*
720  * Tears down the mappings for the range of gfns, [start, end), and frees the
721  * non-root pages mapping GFNs strictly within that range. Returns true if
722  * SPTEs have been cleared and a TLB flush is needed before releasing the
723  * MMU lock.
724  */
725 bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
726 {
727         struct kvm_mmu_page *root;
728         bool flush = false;
729
730         for_each_tdp_mmu_root_yield_safe(kvm, root)
731                 flush |= zap_gfn_range(kvm, root, start, end, true);
732
733         return flush;
734 }
735
736 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
737 {
738         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
739         bool flush;
740
741         flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
742         if (flush)
743                 kvm_flush_remote_tlbs(kvm);
744 }
745
746 /*
747  * Installs a last-level SPTE to handle a TDP page fault.
748  * (NPT/EPT violation/misconfiguration)
749  */
750 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
751                                           int map_writable,
752                                           struct tdp_iter *iter,
753                                           kvm_pfn_t pfn, bool prefault)
754 {
755         u64 new_spte;
756         int ret = 0;
757         int make_spte_ret = 0;
758
759         if (unlikely(is_noslot_pfn(pfn)))
760                 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
761         else
762                 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
763                                          pfn, iter->old_spte, prefault, true,
764                                          map_writable, !shadow_accessed_mask,
765                                          &new_spte);
766
767         if (new_spte == iter->old_spte)
768                 ret = RET_PF_SPURIOUS;
769         else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
770                 return RET_PF_RETRY;
771
772         /*
773          * If the page fault was caused by a write but the page is write
774          * protected, emulation is needed. If the emulation was skipped,
775          * the vCPU would have the same fault again.
776          */
777         if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
778                 if (write)
779                         ret = RET_PF_EMULATE;
780                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
781         }
782
783         /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
784         if (unlikely(is_mmio_spte(new_spte))) {
785                 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
786                                      new_spte);
787                 ret = RET_PF_EMULATE;
788         } else
789                 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
790                                        rcu_dereference(iter->sptep));
791
794         if (!prefault)
795                 vcpu->stat.pf_fixed++;
796
797         return ret;
798 }
799
800 /*
801  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
802  * page tables and SPTEs to translate the faulting guest physical address.
803  */
804 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
805                     int map_writable, int max_level, kvm_pfn_t pfn,
806                     bool prefault)
807 {
808         bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
809         bool write = error_code & PFERR_WRITE_MASK;
810         bool exec = error_code & PFERR_FETCH_MASK;
811         bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
812         struct kvm_mmu *mmu = vcpu->arch.mmu;
813         struct tdp_iter iter;
814         struct kvm_mmu_page *sp;
815         u64 *child_pt;
816         u64 new_spte;
817         int ret;
818         gfn_t gfn = gpa >> PAGE_SHIFT;
819         int level;
820         int req_level;
821
822         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
823                 return RET_PF_RETRY;
824         if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
825                 return RET_PF_RETRY;
826
827         level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
828                                         huge_page_disallowed, &req_level);
829
830         trace_kvm_mmu_spte_requested(gpa, level, pfn);
831
832         rcu_read_lock();
833
834         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
835                 if (nx_huge_page_workaround_enabled)
836                         disallowed_hugepage_adjust(iter.old_spte, gfn,
837                                                    iter.level, &pfn, &level);
838
839                 if (iter.level == level)
840                         break;
841
842                 /*
843                  * If there is an SPTE mapping a large page at a higher level
844                  * than the target, that SPTE must be cleared and replaced
845                  * with a non-leaf SPTE.
846                  */
847                 if (is_shadow_present_pte(iter.old_spte) &&
848                     is_large_pte(iter.old_spte)) {
849                         if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
850                                 break;
851
852                         /*
853                          * The iter must explicitly re-read the spte here
854                          * because the new value informs the !present
855                          * path below.
856                          */
857                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
858                 }
859
860                 if (!is_shadow_present_pte(iter.old_spte)) {
861                         sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
862                         child_pt = sp->spt;
863
864                         new_spte = make_nonleaf_spte(child_pt,
865                                                      !shadow_accessed_mask);
866
867                         if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
868                                                     new_spte)) {
869                                 tdp_mmu_link_page(vcpu->kvm, sp, true,
870                                                   huge_page_disallowed &&
871                                                   req_level >= iter.level);
872
873                                 trace_kvm_mmu_get_page(sp, true);
874                         } else {
875                                 tdp_mmu_free_sp(sp);
876                                 break;
877                         }
878                 }
879         }
880
881         if (iter.level != level) {
882                 rcu_read_unlock();
883                 return RET_PF_RETRY;
884         }
885
886         ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
887                                               pfn, prefault);
888         rcu_read_unlock();
889
890         return ret;
891 }
892
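/*
 * For each TDP MMU root, call @handler on the GFN range of every memslot that
 * overlaps the HVA range [start, end). Used to implement the MMU notifier
 * handlers for the TDP MMU. Returns the OR of the handler return values.
 */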
893 static __always_inline int
894 kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
895                              unsigned long start,
896                              unsigned long end,
897                              unsigned long data,
898                              int (*handler)(struct kvm *kvm,
899                                             struct kvm_memory_slot *slot,
900                                             struct kvm_mmu_page *root,
901                                             gfn_t start,
902                                             gfn_t end,
903                                             unsigned long data))
904 {
905         struct kvm_memslots *slots;
906         struct kvm_memory_slot *memslot;
907         struct kvm_mmu_page *root;
908         int ret = 0;
909         int as_id;
910
911         for_each_tdp_mmu_root_yield_safe(kvm, root) {
912                 as_id = kvm_mmu_page_as_id(root);
913                 slots = __kvm_memslots(kvm, as_id);
914                 kvm_for_each_memslot(memslot, slots) {
915                         unsigned long hva_start, hva_end;
916                         gfn_t gfn_start, gfn_end;
917
918                         hva_start = max(start, memslot->userspace_addr);
919                         hva_end = min(end, memslot->userspace_addr +
920                                       (memslot->npages << PAGE_SHIFT));
921                         if (hva_start >= hva_end)
922                                 continue;
923                         /*
924                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
925                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
926                          */
927                         gfn_start = hva_to_gfn_memslot(hva_start, memslot);
928                         gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
929
930                         ret |= handler(kvm, memslot, root, gfn_start,
931                                        gfn_end, data);
932                 }
933         }
934
935         return ret;
936 }
937
938 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
939                                      struct kvm_memory_slot *slot,
940                                      struct kvm_mmu_page *root, gfn_t start,
941                                      gfn_t end, unsigned long unused)
942 {
943         return zap_gfn_range(kvm, root, start, end, false);
944 }
945
946 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
947                               unsigned long end)
948 {
949         return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
950                                             zap_gfn_range_hva_wrapper);
951 }
952
953 /*
 954  * Mark the SPTEs mapping the range of GFNs [start, end) unaccessed and return
 955  * non-zero if any of the GFNs in the range have been accessed.
956  */
957 static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
958                          struct kvm_mmu_page *root, gfn_t start, gfn_t end,
959                          unsigned long unused)
960 {
961         struct tdp_iter iter;
962         int young = 0;
963         u64 new_spte = 0;
964
965         rcu_read_lock();
966
967         tdp_root_for_each_leaf_pte(iter, root, start, end) {
968                 /*
969                  * If we have a non-accessed entry we don't need to change the
970                  * pte.
971                  */
972                 if (!is_accessed_spte(iter.old_spte))
973                         continue;
974
975                 new_spte = iter.old_spte;
976
977                 if (spte_ad_enabled(new_spte)) {
978                         clear_bit((ffs(shadow_accessed_mask) - 1),
979                                   (unsigned long *)&new_spte);
980                 } else {
981                         /*
982                          * Capture the dirty status of the page, so that it doesn't get
983                          * lost when the SPTE is marked for access tracking.
984                          */
985                         if (is_writable_pte(new_spte))
986                                 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
987
988                         new_spte = mark_spte_for_access_track(new_spte);
989                 }
990                 new_spte &= ~shadow_dirty_mask;
991
992                 tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
993                 young = 1;
994
995                 trace_kvm_age_page(iter.gfn, iter.level, slot, young);
996         }
997
998         rcu_read_unlock();
999
1000         return young;
1001 }
1002
1003 int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
1004                               unsigned long end)
1005 {
1006         return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
1007                                             age_gfn_range);
1008 }
1009
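/* Return 1 if a leaf SPTE mapping the GFN has been accessed, 0 otherwise. */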
1010 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
1011                         struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1012                         unsigned long unused2)
1013 {
1014         struct tdp_iter iter;
1015
1016         tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
1017                 if (is_accessed_spte(iter.old_spte))
1018                         return 1;
1019
1020         return 0;
1021 }
1022
1023 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
1024 {
1025         return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
1026                                             test_age_gfn);
1027 }
1028
1029 /*
1030  * Handle the changed_pte MMU notifier for the TDP MMU.
1031  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1032  * notifier.
 1033  * The function flushes the TLBs itself when needed and always returns 0.
1034  */
1035 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
1036                         struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1037                         unsigned long data)
1038 {
1039         struct tdp_iter iter;
1040         pte_t *ptep = (pte_t *)data;
1041         kvm_pfn_t new_pfn;
1042         u64 new_spte;
1043         int need_flush = 0;
1044
1045         rcu_read_lock();
1046
1047         WARN_ON(pte_huge(*ptep));
1048
1049         new_pfn = pte_pfn(*ptep);
1050
1051         tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
1052                 if (iter.level != PG_LEVEL_4K)
1053                         continue;
1054
1055                 if (!is_shadow_present_pte(iter.old_spte))
1056                         break;
1057
1058                 tdp_mmu_set_spte(kvm, &iter, 0);
1059
1060                 kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
1061
1062                 if (!pte_write(*ptep)) {
1063                         new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1064                                         iter.old_spte, new_pfn);
1065
1066                         tdp_mmu_set_spte(kvm, &iter, new_spte);
1067                 }
1068
1069                 need_flush = 1;
1070         }
1071
1072         if (need_flush)
1073                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1074
1075         rcu_read_unlock();
1076
1077         return 0;
1078 }
1079
1080 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
1081                              pte_t *host_ptep)
1082 {
1083         return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
1084                                             (unsigned long)host_ptep,
1085                                             set_tdp_spte);
1086 }
1087
1088 /*
 1089  * Remove write access from all the SPTEs mapping GFNs [start, end). Only
 1090  * affects leaf SPTEs at or above min_level.
1091  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1092  */
1093 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1094                              gfn_t start, gfn_t end, int min_level)
1095 {
1096         struct tdp_iter iter;
1097         u64 new_spte;
1098         bool spte_set = false;
1099
1100         rcu_read_lock();
1101
1102         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1103
1104         for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1105                                    min_level, start, end) {
1106                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1107                         continue;
1108
1109                 if (!is_shadow_present_pte(iter.old_spte) ||
1110                     !is_last_spte(iter.old_spte, iter.level) ||
1111                     !(iter.old_spte & PT_WRITABLE_MASK))
1112                         continue;
1113
1114                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1115
1116                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1117                 spte_set = true;
1118         }
1119
1120         rcu_read_unlock();
1121         return spte_set;
1122 }
1123
1124 /*
1125  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1126  * only affect leaf SPTEs down to min_level.
1127  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1128  */
1129 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1130                              int min_level)
1131 {
1132         struct kvm_mmu_page *root;
1133         int root_as_id;
1134         bool spte_set = false;
1135
1136         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1137                 root_as_id = kvm_mmu_page_as_id(root);
1138                 if (root_as_id != slot->as_id)
1139                         continue;
1140
1141                 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1142                              slot->base_gfn + slot->npages, min_level);
1143         }
1144
1145         return spte_set;
1146 }
1147
1148 /*
1149  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1150  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1151  * If AD bits are not enabled, this will require clearing the writable bit on
1152  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1153  * be flushed.
1154  */
1155 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1156                            gfn_t start, gfn_t end)
1157 {
1158         struct tdp_iter iter;
1159         u64 new_spte;
1160         bool spte_set = false;
1161
1162         rcu_read_lock();
1163
1164         tdp_root_for_each_leaf_pte(iter, root, start, end) {
1165                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1166                         continue;
1167
1168                 if (spte_ad_need_write_protect(iter.old_spte)) {
1169                         if (is_writable_pte(iter.old_spte))
1170                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1171                         else
1172                                 continue;
1173                 } else {
1174                         if (iter.old_spte & shadow_dirty_mask)
1175                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1176                         else
1177                                 continue;
1178                 }
1179
1180                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1181                 spte_set = true;
1182         }
1183
1184         rcu_read_unlock();
1185         return spte_set;
1186 }
1187
1188 /*
1189  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1190  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1191  * If AD bits are not enabled, this will require clearing the writable bit on
1192  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1193  * be flushed.
1194  */
1195 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1196 {
1197         struct kvm_mmu_page *root;
1198         int root_as_id;
1199         bool spte_set = false;
1200
1201         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1202                 root_as_id = kvm_mmu_page_as_id(root);
1203                 if (root_as_id != slot->as_id)
1204                         continue;
1205
1206                 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1207                                 slot->base_gfn + slot->npages);
1208         }
1209
1210         return spte_set;
1211 }
1212
1213 /*
1214  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1215  * set in mask, starting at gfn. The given memslot is expected to contain all
1216  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1217  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1218  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1219  */
1220 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1221                                   gfn_t gfn, unsigned long mask, bool wrprot)
1222 {
1223         struct tdp_iter iter;
1224         u64 new_spte;
1225
1226         rcu_read_lock();
1227
1228         tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1229                                     gfn + BITS_PER_LONG) {
1230                 if (!mask)
1231                         break;
1232
1233                 if (iter.level > PG_LEVEL_4K ||
1234                     !(mask & (1UL << (iter.gfn - gfn))))
1235                         continue;
1236
1237                 mask &= ~(1UL << (iter.gfn - gfn));
1238
1239                 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1240                         if (is_writable_pte(iter.old_spte))
1241                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1242                         else
1243                                 continue;
1244                 } else {
1245                         if (iter.old_spte & shadow_dirty_mask)
1246                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1247                         else
1248                                 continue;
1249                 }
1250
1251                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1252         }
1253
1254         rcu_read_unlock();
1255 }
1256
1257 /*
1258  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1259  * set in mask, starting at gfn. The given memslot is expected to contain all
1260  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1261  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1262  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1263  */
1264 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1265                                        struct kvm_memory_slot *slot,
1266                                        gfn_t gfn, unsigned long mask,
1267                                        bool wrprot)
1268 {
1269         struct kvm_mmu_page *root;
1270         int root_as_id;
1271
1272         lockdep_assert_held_write(&kvm->mmu_lock);
1273         for_each_tdp_mmu_root(kvm, root) {
1274                 root_as_id = kvm_mmu_page_as_id(root);
1275                 if (root_as_id != slot->as_id)
1276                         continue;
1277
1278                 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1279         }
1280 }
1281
1282 /*
1283  * Clear leaf entries which could be replaced by large mappings, for
1284  * GFNs within the slot.
1285  */
1286 static void zap_collapsible_spte_range(struct kvm *kvm,
1287                                        struct kvm_mmu_page *root,
1288                                        struct kvm_memory_slot *slot)
1289 {
1290         gfn_t start = slot->base_gfn;
1291         gfn_t end = start + slot->npages;
1292         struct tdp_iter iter;
1293         kvm_pfn_t pfn;
1294         bool spte_set = false;
1295
1296         rcu_read_lock();
1297
1298         tdp_root_for_each_pte(iter, root, start, end) {
1299                 if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
1300                         spte_set = false;
1301                         continue;
1302                 }
1303
1304                 if (!is_shadow_present_pte(iter.old_spte) ||
1305                     !is_last_spte(iter.old_spte, iter.level))
1306                         continue;
1307
1308                 pfn = spte_to_pfn(iter.old_spte);
1309                 if (kvm_is_reserved_pfn(pfn) ||
1310                     iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1311                                                             pfn, PG_LEVEL_NUM))
1312                         continue;
1313
1314                 tdp_mmu_set_spte(kvm, &iter, 0);
1315
1316                 spte_set = true;
1317         }
1318
1319         rcu_read_unlock();
1320         if (spte_set)
1321                 kvm_flush_remote_tlbs(kvm);
1322 }
1323
1324 /*
1325  * Clear non-leaf entries (and free associated page tables) which could
1326  * be replaced by large mappings, for GFNs within the slot.
1327  */
1328 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1329                                        struct kvm_memory_slot *slot)
1330 {
1331         struct kvm_mmu_page *root;
1332         int root_as_id;
1333
1334         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1335                 root_as_id = kvm_mmu_page_as_id(root);
1336                 if (root_as_id != slot->as_id)
1337                         continue;
1338
1339                 zap_collapsible_spte_range(kvm, root, slot);
1340         }
1341 }
1342
1343 /*
1344  * Removes write access on the last level SPTE mapping this GFN and unsets the
 1345  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1346  * Returns true if an SPTE was set and a TLB flush is needed.
1347  */
1348 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1349                               gfn_t gfn)
1350 {
1351         struct tdp_iter iter;
1352         u64 new_spte;
1353         bool spte_set = false;
1354
1355         rcu_read_lock();
1356
1357         tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1358                 if (!is_writable_pte(iter.old_spte))
1359                         break;
1360
1361                 new_spte = iter.old_spte &
1362                         ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
1363
1364                 tdp_mmu_set_spte(kvm, &iter, new_spte);
1365                 spte_set = true;
1366         }
1367
1368         rcu_read_unlock();
1369
1370         return spte_set;
1371 }
1372
1373 /*
1374  * Removes write access on the last level SPTE mapping this GFN and unsets the
 1375  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1376  * Returns true if an SPTE was set and a TLB flush is needed.
1377  */
1378 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1379                                    struct kvm_memory_slot *slot, gfn_t gfn)
1380 {
1381         struct kvm_mmu_page *root;
1382         int root_as_id;
1383         bool spte_set = false;
1384
1385         lockdep_assert_held_write(&kvm->mmu_lock);
1386         for_each_tdp_mmu_root(kvm, root) {
1387                 root_as_id = kvm_mmu_page_as_id(root);
1388                 if (root_as_id != slot->as_id)
1389                         continue;
1390
1391                 spte_set |= write_protect_gfn(kvm, root, gfn);
1392         }
1393         return spte_set;
1394 }
1395
1396 /*
1397  * Return the level of the lowest level SPTE added to sptes.
1398  * That SPTE may be non-present.
1399  */
1400 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1401                          int *root_level)
1402 {
1403         struct tdp_iter iter;
1404         struct kvm_mmu *mmu = vcpu->arch.mmu;
1405         gfn_t gfn = addr >> PAGE_SHIFT;
1406         int leaf = -1;
1407
1408         *root_level = vcpu->arch.mmu->shadow_root_level;
1409
1410         rcu_read_lock();
1411
1412         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1413                 leaf = iter.level;
1414                 sptes[leaf] = iter.old_spte;
1415         }
1416
1417         rcu_read_unlock();
1418
1419         return leaf;
1420 }