KVM: x86/mmu: replace root_level with cpu_role.base.level
[linux-2.6-microblaze.git] / arch / x86 / kvm / mmu / mmu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * This module enables machines with Intel VT-x extensions to run virtual
6  * machines without emulation or binary translation.
7  *
8  * MMU support
9  *
10  * Copyright (C) 2006 Qumranet, Inc.
11  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12  *
13  * Authors:
14  *   Yaniv Kamay  <yaniv@qumranet.com>
15  *   Avi Kivity   <avi@qumranet.com>
16  */
17
18 #include "irq.h"
19 #include "ioapic.h"
20 #include "mmu.h"
21 #include "mmu_internal.h"
22 #include "tdp_mmu.h"
23 #include "x86.h"
24 #include "kvm_cache_regs.h"
25 #include "kvm_emulate.h"
26 #include "cpuid.h"
27 #include "spte.h"
28
29 #include <linux/kvm_host.h>
30 #include <linux/types.h>
31 #include <linux/string.h>
32 #include <linux/mm.h>
33 #include <linux/highmem.h>
34 #include <linux/moduleparam.h>
35 #include <linux/export.h>
36 #include <linux/swap.h>
37 #include <linux/hugetlb.h>
38 #include <linux/compiler.h>
39 #include <linux/srcu.h>
40 #include <linux/slab.h>
41 #include <linux/sched/signal.h>
42 #include <linux/uaccess.h>
43 #include <linux/hash.h>
44 #include <linux/kern_levels.h>
45 #include <linux/kthread.h>
46
47 #include <asm/page.h>
48 #include <asm/memtype.h>
49 #include <asm/cmpxchg.h>
50 #include <asm/io.h>
51 #include <asm/set_memory.h>
52 #include <asm/vmx.h>
53 #include <asm/kvm_page_track.h>
54 #include "trace.h"
55
56 #include "paging.h"
57
58 extern bool itlb_multihit_kvm_mitigation;
59
60 int __read_mostly nx_huge_pages = -1;
61 static uint __read_mostly nx_huge_pages_recovery_period_ms;
62 #ifdef CONFIG_PREEMPT_RT
63 /* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
64 static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
65 #else
66 static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
67 #endif
68
69 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
70 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
71
72 static const struct kernel_param_ops nx_huge_pages_ops = {
73         .set = set_nx_huge_pages,
74         .get = param_get_bool,
75 };
76
77 static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
78         .set = set_nx_huge_pages_recovery_param,
79         .get = param_get_uint,
80 };
81
82 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
83 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
84 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
85                 &nx_huge_pages_recovery_ratio, 0644);
86 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
87 module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
88                 &nx_huge_pages_recovery_period_ms, 0644);
89 __MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
90
91 static bool __read_mostly force_flush_and_sync_on_reuse;
92 module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
93
94 /*
95  * Setting this variable to true enables Two-Dimensional Paging (TDP),
96  * where the hardware walks two page tables:
97  * 1. the guest-virtual to guest-physical translation
98  * 2. while doing 1., the guest-physical to host-physical translation
99  * If the hardware supports this, shadow paging is not needed.
100  */
101 bool tdp_enabled = false;
102
103 static int max_huge_page_level __read_mostly;
104 static int tdp_root_level __read_mostly;
105 static int max_tdp_level __read_mostly;
106
107 #ifdef MMU_DEBUG
108 bool dbg = 0;
109 module_param(dbg, bool, 0644);
110 #endif
111
112 #define PTE_PREFETCH_NUM                8
113
114 #define PT32_LEVEL_BITS 10
115
116 #define PT32_LEVEL_SHIFT(level) \
117                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
118
119 #define PT32_LVL_OFFSET_MASK(level) \
120         (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
121                                                 * PT32_LEVEL_BITS))) - 1))
122
123 #define PT32_INDEX(address, level)\
124         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
125
126
127 #define PT32_BASE_ADDR_MASK PAGE_MASK
128 #define PT32_DIR_BASE_ADDR_MASK \
129         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
130 #define PT32_LVL_ADDR_MASK(level) \
131         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
132                                             * PT32_LEVEL_BITS))) - 1))
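/*
 * Worked example for the PT32 macros above, with PAGE_SHIFT == 12 and
 * PT32_LEVEL_BITS == 10: the 32-bit address 0xc0a81234 decomposes as
 *
 *	PT32_INDEX(0xc0a81234, 2) = (0xc0a81234 >> 22) & 0x3ff = 0x302
 *	PT32_INDEX(0xc0a81234, 1) = (0xc0a81234 >> 12) & 0x3ff = 0x281
 *	page offset               =  0xc0a81234 & ~PAGE_MASK   = 0x234
 *
 * i.e. page-directory entry 0x302, page-table entry 0x281, byte 0x234
 * within the page.
 */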
133
134 #include <trace/events/kvm.h>
135
136 /* make pte_list_desc fit well in cache lines */
137 #define PTE_LIST_EXT 14
138
139 /*
140  * Slight optimization of cacheline layout, by putting `more' and `spte_count'
141  * at the start; accessing the descriptor then touches only a single cacheline
142  * both in the full case (entries == PTE_LIST_EXT) and when entries <= 6.
143  */
144 struct pte_list_desc {
145         struct pte_list_desc *more;
146         /*
147          * The number of entries stored in this pte_list_desc.  No need to be
148          * u64, but u64 makes alignment easier.  A value of PTE_LIST_EXT means full.
149          */
150         u64 spte_count;
151         u64 *sptes[PTE_LIST_EXT];
152 };
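/*
 * For the layout above (assuming 64-byte cachelines):
 * sizeof(struct pte_list_desc) = 8 ('more') + 8 ('spte_count') + 14 * 8
 * ('sptes') = 128 bytes, i.e. exactly two cachelines, and the first
 * cacheline covers 'more', 'spte_count' and sptes[0..5], which is where
 * the "entries <= 6" case in the comment above comes from.  A compile-time
 * sanity check for this could look like, e.g.:
 *
 *	BUILD_BUG_ON(sizeof(struct pte_list_desc) != 128);
 */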
153
154 struct kvm_shadow_walk_iterator {
155         u64 addr;
156         hpa_t shadow_addr;
157         u64 *sptep;
158         int level;
159         unsigned index;
160 };
161
162 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
163         for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
164                                          (_root), (_addr));                \
165              shadow_walk_okay(&(_walker));                                 \
166              shadow_walk_next(&(_walker)))
167
168 #define for_each_shadow_entry(_vcpu, _addr, _walker)            \
169         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
170              shadow_walk_okay(&(_walker));                      \
171              shadow_walk_next(&(_walker)))
172
173 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)     \
174         for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
175              shadow_walk_okay(&(_walker)) &&                            \
176                 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
177              __shadow_walk_next(&(_walker), spte))
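/*
 * Illustrative usage sketch of the walker macros above (hypothetical
 * snippet; the real users appear further down in this file).  The lockless
 * variant must be wrapped in walk_shadow_page_lockless_begin()/end():
 *
 *	struct kvm_shadow_walk_iterator it;
 *	u64 spte;
 *
 *	walk_shadow_page_lockless_begin(vcpu);
 *	for_each_shadow_entry_lockless(vcpu, addr, it, spte) {
 *		if (!is_shadow_present_pte(spte))
 *			break;
 *		(inspect it.level, it.sptep and spte here)
 *	}
 *	walk_shadow_page_lockless_end(vcpu);
 */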
178
179 static struct kmem_cache *pte_list_desc_cache;
180 struct kmem_cache *mmu_page_header_cache;
181 static struct percpu_counter kvm_total_used_mmu_pages;
182
183 static void mmu_spte_set(u64 *sptep, u64 spte);
184
185 struct kvm_mmu_role_regs {
186         const unsigned long cr0;
187         const unsigned long cr4;
188         const u64 efer;
189 };
190
191 #define CREATE_TRACE_POINTS
192 #include "mmutrace.h"
193
194 /*
195  * Yes, lots of underscores.  They're a hint that you probably shouldn't be
196  * reading from the role_regs.  Once the root_role is constructed, it becomes
197  * the single source of truth for the MMU's state.
198  */
199 #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)                   \
200 static inline bool __maybe_unused                                       \
201 ____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)             \
202 {                                                                       \
203         return !!(regs->reg & flag);                                    \
204 }
205 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
206 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
207 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
208 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
209 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
210 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
211 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
212 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
213 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
214 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
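/*
 * For example, BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG) above
 * expands to:
 *
 *	static inline bool __maybe_unused
 *	____is_cr0_pg(const struct kvm_mmu_role_regs *regs)
 *	{
 *		return !!(regs->cr0 & X86_CR0_PG);
 *	}
 */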
215
216 /*
217  * The MMU itself (with a valid role) is the single source of truth for the
218  * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
219  * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
220  * and the vCPU may be incorrect/irrelevant.
221  */
222 #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)         \
223 static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)        \
224 {                                                               \
225         return !!(mmu->cpu_role. base_or_ext . reg##_##name);   \
226 }
227 BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
228 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
229 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
230 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
231 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
232 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
233 BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
234 BUILD_MMU_ROLE_ACCESSOR(ext,  efer, lma);
235
236 static inline bool is_cr0_pg(struct kvm_mmu *mmu)
237 {
238         return mmu->cpu_role.base.level > 0;
239 }
240
241 static inline bool is_cr4_pae(struct kvm_mmu *mmu)
242 {
243         return !mmu->cpu_role.base.has_4_byte_gpte;
244 }
245
246 static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
247 {
248         struct kvm_mmu_role_regs regs = {
249                 .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
250                 .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
251                 .efer = vcpu->arch.efer,
252         };
253
254         return regs;
255 }
256
257 static inline bool kvm_available_flush_tlb_with_range(void)
258 {
259         return kvm_x86_ops.tlb_remote_flush_with_range;
260 }
261
262 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
263                 struct kvm_tlb_range *range)
264 {
265         int ret = -ENOTSUPP;
266
267         if (range && kvm_x86_ops.tlb_remote_flush_with_range)
268                 ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
269
270         if (ret)
271                 kvm_flush_remote_tlbs(kvm);
272 }
273
274 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
275                 u64 start_gfn, u64 pages)
276 {
277         struct kvm_tlb_range range;
278
279         range.start_gfn = start_gfn;
280         range.pages = pages;
281
282         kvm_flush_remote_tlbs_with_range(kvm, &range);
283 }
284
285 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
286                            unsigned int access)
287 {
288         u64 spte = make_mmio_spte(vcpu, gfn, access);
289
290         trace_mark_mmio_spte(sptep, gfn, spte);
291         mmu_spte_set(sptep, spte);
292 }
293
294 static gfn_t get_mmio_spte_gfn(u64 spte)
295 {
296         u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
297
298         gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
299                & shadow_nonpresent_or_rsvd_mask;
300
301         return gpa >> PAGE_SHIFT;
302 }
303
304 static unsigned get_mmio_spte_access(u64 spte)
305 {
306         return spte & shadow_mmio_access_mask;
307 }
308
309 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
310 {
311         u64 kvm_gen, spte_gen, gen;
312
313         gen = kvm_vcpu_memslots(vcpu)->generation;
314         if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
315                 return false;
316
317         kvm_gen = gen & MMIO_SPTE_GEN_MASK;
318         spte_gen = get_mmio_spte_generation(spte);
319
320         trace_check_mmio_spte(spte, kvm_gen, spte_gen);
321         return likely(kvm_gen == spte_gen);
322 }
323
324 static int is_cpuid_PSE36(void)
325 {
326         return 1;
327 }
328
329 static gfn_t pse36_gfn_delta(u32 gpte)
330 {
331         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
332
333         return (gpte & PT32_DIR_PSE36_MASK) << shift;
334 }
335
336 #ifdef CONFIG_X86_64
337 static void __set_spte(u64 *sptep, u64 spte)
338 {
339         WRITE_ONCE(*sptep, spte);
340 }
341
342 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
343 {
344         WRITE_ONCE(*sptep, spte);
345 }
346
347 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
348 {
349         return xchg(sptep, spte);
350 }
351
352 static u64 __get_spte_lockless(u64 *sptep)
353 {
354         return READ_ONCE(*sptep);
355 }
356 #else
357 union split_spte {
358         struct {
359                 u32 spte_low;
360                 u32 spte_high;
361         };
362         u64 spte;
363 };
364
365 static void count_spte_clear(u64 *sptep, u64 spte)
366 {
367         struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
368
369         if (is_shadow_present_pte(spte))
370                 return;
371
372         /* Ensure the spte is completely set before we increase the count */
373         smp_wmb();
374         sp->clear_spte_count++;
375 }
376
377 static void __set_spte(u64 *sptep, u64 spte)
378 {
379         union split_spte *ssptep, sspte;
380
381         ssptep = (union split_spte *)sptep;
382         sspte = (union split_spte)spte;
383
384         ssptep->spte_high = sspte.spte_high;
385
386         /*
387          * If we map the spte from nonpresent to present, we should store
388          * the high bits first, then set the present bit, so the CPU cannot
389          * fetch this spte while we are setting it.
390          */
391         smp_wmb();
392
393         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
394 }
395
396 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
397 {
398         union split_spte *ssptep, sspte;
399
400         ssptep = (union split_spte *)sptep;
401         sspte = (union split_spte)spte;
402
403         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
404
405         /*
406          * If we map the spte from present to nonpresent, we should clear the
407          * present bit first to avoid the vCPU fetching the old high bits.
408          */
409         smp_wmb();
410
411         ssptep->spte_high = sspte.spte_high;
412         count_spte_clear(sptep, spte);
413 }
414
415 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
416 {
417         union split_spte *ssptep, sspte, orig;
418
419         ssptep = (union split_spte *)sptep;
420         sspte = (union split_spte)spte;
421
422         /* xchg acts as a barrier before the setting of the high bits */
423         orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
424         orig.spte_high = ssptep->spte_high;
425         ssptep->spte_high = sspte.spte_high;
426         count_spte_clear(sptep, spte);
427
428         return orig.spte;
429 }
430
431 /*
432  * The idea of using a lightweight way to get the spte on x86_32 guests
433  * comes from gup_get_pte (mm/gup.c).
434  *
435  * An spte TLB flush may be pending, because kvm_set_pte_rmapp
436  * coalesces them and we run outside of the MMU lock.  Therefore
437  * we need to protect against in-progress updates of the spte.
438  *
439  * Reading the spte while an update is in progress may get the old value
440  * for the high part of the spte.  The race is fine for a present->non-present
441  * change (because the high part of the spte is ignored for non-present spte),
442  * but for a present->present change we must reread the spte.
443  *
444  * All such changes are done in two steps (present->non-present and
445  * non-present->present), hence it is enough to count the number of
446  * present->non-present updates: if it changed while reading the spte,
447  * we might have hit the race.  This is done using clear_spte_count.
448  */
449 static u64 __get_spte_lockless(u64 *sptep)
450 {
451         struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
452         union split_spte spte, *orig = (union split_spte *)sptep;
453         int count;
454
455 retry:
456         count = sp->clear_spte_count;
457         smp_rmb();
458
459         spte.spte_low = orig->spte_low;
460         smp_rmb();
461
462         spte.spte_high = orig->spte_high;
463         smp_rmb();
464
465         if (unlikely(spte.spte_low != orig->spte_low ||
466               count != sp->clear_spte_count))
467                 goto retry;
468
469         return spte.spte;
470 }
471 #endif
472
473 static bool spte_has_volatile_bits(u64 spte)
474 {
475         if (!is_shadow_present_pte(spte))
476                 return false;
477
478         /*
479          * Always atomically update the spte if it can be updated
480          * outside of the mmu-lock; this ensures the dirty bit is not lost
481          * and also gives us a stable is_writable_pte() so that a needed
482          * TLB flush is not missed.
483          */
484         if (spte_can_locklessly_be_made_writable(spte) ||
485             is_access_track_spte(spte))
486                 return true;
487
488         if (spte_ad_enabled(spte)) {
489                 if ((spte & shadow_accessed_mask) == 0 ||
490                     (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
491                         return true;
492         }
493
494         return false;
495 }
496
497 /* Rules for using mmu_spte_set:
498  * Set the sptep from nonpresent to present.
499  * Note: the sptep being assigned *must* be either not present
500  * or in a state where the hardware will not attempt to update
501  * the spte.
502  */
503 static void mmu_spte_set(u64 *sptep, u64 new_spte)
504 {
505         WARN_ON(is_shadow_present_pte(*sptep));
506         __set_spte(sptep, new_spte);
507 }
508
509 /*
510  * Update the SPTE (excluding the PFN), but do not track changes in its
511  * accessed/dirty status.
512  */
513 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
514 {
515         u64 old_spte = *sptep;
516
517         WARN_ON(!is_shadow_present_pte(new_spte));
518         check_spte_writable_invariants(new_spte);
519
520         if (!is_shadow_present_pte(old_spte)) {
521                 mmu_spte_set(sptep, new_spte);
522                 return old_spte;
523         }
524
525         if (!spte_has_volatile_bits(old_spte))
526                 __update_clear_spte_fast(sptep, new_spte);
527         else
528                 old_spte = __update_clear_spte_slow(sptep, new_spte);
529
530         WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
531
532         return old_spte;
533 }
534
535 /* Rules for using mmu_spte_update:
536  * Update the state bits; the mapped pfn is not changed.
537  *
538  * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
539  * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
540  * spte, even though the writable spte might be cached on a CPU's TLB.
541  *
542  * Returns true if the TLB needs to be flushed
543  */
544 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
545 {
546         bool flush = false;
547         u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
548
549         if (!is_shadow_present_pte(old_spte))
550                 return false;
551
552         /*
553          * Updating the spte outside of the mmu-lock is safe, since
554          * we always update it atomically; see the comments in
555          * spte_has_volatile_bits().
556          */
557         if (spte_can_locklessly_be_made_writable(old_spte) &&
558               !is_writable_pte(new_spte))
559                 flush = true;
560
561         /*
562          * Flush TLB when accessed/dirty states are changed in the page tables,
563          * to guarantee consistency between TLB and page tables.
564          */
565
566         if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
567                 flush = true;
568                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
569         }
570
571         if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
572                 flush = true;
573                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
574         }
575
576         return flush;
577 }
578
579 /*
580  * Rules for using mmu_spte_clear_track_bits:
581  * It sets the sptep from present to nonpresent and tracks the
582  * state bits; it is used to clear a last-level sptep.
583  * Returns the old PTE.
584  */
585 static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
586 {
587         kvm_pfn_t pfn;
588         u64 old_spte = *sptep;
589         int level = sptep_to_sp(sptep)->role.level;
590
591         if (!spte_has_volatile_bits(old_spte))
592                 __update_clear_spte_fast(sptep, 0ull);
593         else
594                 old_spte = __update_clear_spte_slow(sptep, 0ull);
595
596         if (!is_shadow_present_pte(old_spte))
597                 return old_spte;
598
599         kvm_update_page_stats(kvm, level, -1);
600
601         pfn = spte_to_pfn(old_spte);
602
603         /*
604          * KVM does not hold a refcount on the page used by the
605          * KVM MMU; before reclaiming the page, we should
606          * unmap it from the MMU first.
607          */
608         WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
609
610         if (is_accessed_spte(old_spte))
611                 kvm_set_pfn_accessed(pfn);
612
613         if (is_dirty_spte(old_spte))
614                 kvm_set_pfn_dirty(pfn);
615
616         return old_spte;
617 }
618
619 /*
620  * Rules for using mmu_spte_clear_no_track:
621  * Directly clear the spte without caring about the state bits of the sptep;
622  * it is used to clear an upper-level spte.
623  */
624 static void mmu_spte_clear_no_track(u64 *sptep)
625 {
626         __update_clear_spte_fast(sptep, 0ull);
627 }
628
629 static u64 mmu_spte_get_lockless(u64 *sptep)
630 {
631         return __get_spte_lockless(sptep);
632 }
633
634 /* Returns the Accessed status of the PTE and resets it at the same time. */
635 static bool mmu_spte_age(u64 *sptep)
636 {
637         u64 spte = mmu_spte_get_lockless(sptep);
638
639         if (!is_accessed_spte(spte))
640                 return false;
641
642         if (spte_ad_enabled(spte)) {
643                 clear_bit((ffs(shadow_accessed_mask) - 1),
644                           (unsigned long *)sptep);
645         } else {
646                 /*
647                  * Capture the dirty status of the page, so that it doesn't get
648                  * lost when the SPTE is marked for access tracking.
649                  */
650                 if (is_writable_pte(spte))
651                         kvm_set_pfn_dirty(spte_to_pfn(spte));
652
653                 spte = mark_spte_for_access_track(spte);
654                 mmu_spte_update_no_track(sptep, spte);
655         }
656
657         return true;
658 }
659
660 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
661 {
662         if (is_tdp_mmu(vcpu->arch.mmu)) {
663                 kvm_tdp_mmu_walk_lockless_begin();
664         } else {
665                 /*
666                  * Prevent page table teardown by making any free-er wait during
667                  * kvm_flush_remote_tlbs() IPI to all active vcpus.
668                  */
669                 local_irq_disable();
670
671                 /*
672                  * Make sure a following spte read is not reordered ahead of the write
673                  * to vcpu->mode.
674                  */
675                 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
676         }
677 }
678
679 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
680 {
681         if (is_tdp_mmu(vcpu->arch.mmu)) {
682                 kvm_tdp_mmu_walk_lockless_end();
683         } else {
684                 /*
685                  * Make sure the write to vcpu->mode is not reordered in front of
686          * reads to sptes.  If it is, kvm_mmu_commit_zap_page() can see us
687                  * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
688                  */
689                 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
690                 local_irq_enable();
691         }
692 }
693
694 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
695 {
696         int r;
697
698         /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
699         r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
700                                        1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
701         if (r)
702                 return r;
703         r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
704                                        PT64_ROOT_MAX_LEVEL);
705         if (r)
706                 return r;
707         if (maybe_indirect) {
708                 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
709                                                PT64_ROOT_MAX_LEVEL);
710                 if (r)
711                         return r;
712         }
713         return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
714                                           PT64_ROOT_MAX_LEVEL);
715 }
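/*
 * For scale: assuming PT64_ROOT_MAX_LEVEL is 5, the pte_list_desc cache is
 * topped up to 1 + 5 + 8 = 14 objects (1 rmap, one parent PTE per level,
 * plus PTE_PREFETCH_NUM prefetched rmaps), and each of the other three
 * caches to PT64_ROOT_MAX_LEVEL objects.
 */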
716
717 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
718 {
719         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
720         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
721         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
722         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
723 }
724
725 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
726 {
727         return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
728 }
729
730 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
731 {
732         kmem_cache_free(pte_list_desc_cache, pte_list_desc);
733 }
734
735 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
736 {
737         if (!sp->role.direct)
738                 return sp->gfns[index];
739
740         return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
741 }
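/*
 * Example for the direct case above, assuming PT64_LEVEL_BITS is 9 (512
 * entries per table): for a direct shadow page with role.level == 2 and
 * sp->gfn == 0x10000, index 3 maps gfn 0x10000 + (3 << 9) = 0x10600.
 */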
742
743 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
744 {
745         if (!sp->role.direct) {
746                 sp->gfns[index] = gfn;
747                 return;
748         }
749
750         if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
751                 pr_err_ratelimited("gfn mismatch under direct page %llx "
752                                    "(expected %llx, got %llx)\n",
753                                    sp->gfn,
754                                    kvm_mmu_page_get_gfn(sp, index), gfn);
755 }
756
757 /*
758  * Return the pointer to the large page information for a given gfn,
759  * handling slots that are not large page aligned.
760  */
761 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
762                 const struct kvm_memory_slot *slot, int level)
763 {
764         unsigned long idx;
765
766         idx = gfn_to_index(gfn, slot->base_gfn, level);
767         return &slot->arch.lpage_info[level - 2][idx];
768 }
769
770 static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
771                                             gfn_t gfn, int count)
772 {
773         struct kvm_lpage_info *linfo;
774         int i;
775
776         for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
777                 linfo = lpage_info_slot(gfn, slot, i);
778                 linfo->disallow_lpage += count;
779                 WARN_ON(linfo->disallow_lpage < 0);
780         }
781 }
782
783 void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
784 {
785         update_gfn_disallow_lpage_count(slot, gfn, 1);
786 }
787
788 void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
789 {
790         update_gfn_disallow_lpage_count(slot, gfn, -1);
791 }
792
793 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
794 {
795         struct kvm_memslots *slots;
796         struct kvm_memory_slot *slot;
797         gfn_t gfn;
798
799         kvm->arch.indirect_shadow_pages++;
800         gfn = sp->gfn;
801         slots = kvm_memslots_for_spte_role(kvm, sp->role);
802         slot = __gfn_to_memslot(slots, gfn);
803
804         /* The non-leaf shadow pages are kept read-only. */
805         if (sp->role.level > PG_LEVEL_4K)
806                 return kvm_slot_page_track_add_page(kvm, slot, gfn,
807                                                     KVM_PAGE_TRACK_WRITE);
808
809         kvm_mmu_gfn_disallow_lpage(slot, gfn);
810 }
811
812 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
813 {
814         if (sp->lpage_disallowed)
815                 return;
816
817         ++kvm->stat.nx_lpage_splits;
818         list_add_tail(&sp->lpage_disallowed_link,
819                       &kvm->arch.lpage_disallowed_mmu_pages);
820         sp->lpage_disallowed = true;
821 }
822
823 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
824 {
825         struct kvm_memslots *slots;
826         struct kvm_memory_slot *slot;
827         gfn_t gfn;
828
829         kvm->arch.indirect_shadow_pages--;
830         gfn = sp->gfn;
831         slots = kvm_memslots_for_spte_role(kvm, sp->role);
832         slot = __gfn_to_memslot(slots, gfn);
833         if (sp->role.level > PG_LEVEL_4K)
834                 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
835                                                        KVM_PAGE_TRACK_WRITE);
836
837         kvm_mmu_gfn_allow_lpage(slot, gfn);
838 }
839
840 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
841 {
842         --kvm->stat.nx_lpage_splits;
843         sp->lpage_disallowed = false;
844         list_del(&sp->lpage_disallowed_link);
845 }
846
847 static struct kvm_memory_slot *
848 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
849                             bool no_dirty_log)
850 {
851         struct kvm_memory_slot *slot;
852
853         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
854         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
855                 return NULL;
856         if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
857                 return NULL;
858
859         return slot;
860 }
861
862 /*
863  * About rmap_head encoding:
864  *
865  * If the bit zero of rmap_head->val is clear, then it points to the only spte
866  * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
867  * pte_list_desc containing more mappings.
868  */
869
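/*
 * Minimal sketch of decoding the encoding described above (hypothetical
 * helpers shown for illustration only; the functions below open-code
 * these checks):
 */
static inline bool rmap_head_is_many(struct kvm_rmap_head *rmap_head)
{
	/* Bit zero set: val points to a pte_list_desc chain. */
	return rmap_head->val & 1;
}

static inline struct pte_list_desc *rmap_head_to_desc(struct kvm_rmap_head *rmap_head)
{
	/* Only meaningful when rmap_head_is_many() returns true. */
	return (struct pte_list_desc *)(rmap_head->val & ~1ul);
}
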
870 /*
871  * Returns the number of pointers in the rmap chain, not counting the new one.
872  */
873 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
874                         struct kvm_rmap_head *rmap_head)
875 {
876         struct pte_list_desc *desc;
877         int count = 0;
878
879         if (!rmap_head->val) {
880                 rmap_printk("%p %llx 0->1\n", spte, *spte);
881                 rmap_head->val = (unsigned long)spte;
882         } else if (!(rmap_head->val & 1)) {
883                 rmap_printk("%p %llx 1->many\n", spte, *spte);
884                 desc = mmu_alloc_pte_list_desc(vcpu);
885                 desc->sptes[0] = (u64 *)rmap_head->val;
886                 desc->sptes[1] = spte;
887                 desc->spte_count = 2;
888                 rmap_head->val = (unsigned long)desc | 1;
889                 ++count;
890         } else {
891                 rmap_printk("%p %llx many->many\n", spte, *spte);
892                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
893                 while (desc->spte_count == PTE_LIST_EXT) {
894                         count += PTE_LIST_EXT;
895                         if (!desc->more) {
896                                 desc->more = mmu_alloc_pte_list_desc(vcpu);
897                                 desc = desc->more;
898                                 desc->spte_count = 0;
899                                 break;
900                         }
901                         desc = desc->more;
902                 }
903                 count += desc->spte_count;
904                 desc->sptes[desc->spte_count++] = spte;
905         }
906         return count;
907 }
908
909 static void
910 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
911                            struct pte_list_desc *desc, int i,
912                            struct pte_list_desc *prev_desc)
913 {
914         int j = desc->spte_count - 1;
915
916         desc->sptes[i] = desc->sptes[j];
917         desc->sptes[j] = NULL;
918         desc->spte_count--;
919         if (desc->spte_count)
920                 return;
921         if (!prev_desc && !desc->more)
922                 rmap_head->val = 0;
923         else
924                 if (prev_desc)
925                         prev_desc->more = desc->more;
926                 else
927                         rmap_head->val = (unsigned long)desc->more | 1;
928         mmu_free_pte_list_desc(desc);
929 }
930
931 static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
932 {
933         struct pte_list_desc *desc;
934         struct pte_list_desc *prev_desc;
935         int i;
936
937         if (!rmap_head->val) {
938                 pr_err("%s: %p 0->BUG\n", __func__, spte);
939                 BUG();
940         } else if (!(rmap_head->val & 1)) {
941                 rmap_printk("%p 1->0\n", spte);
942                 if ((u64 *)rmap_head->val != spte) {
943                         pr_err("%s:  %p 1->BUG\n", __func__, spte);
944                         BUG();
945                 }
946                 rmap_head->val = 0;
947         } else {
948                 rmap_printk("%p many->many\n", spte);
949                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
950                 prev_desc = NULL;
951                 while (desc) {
952                         for (i = 0; i < desc->spte_count; ++i) {
953                                 if (desc->sptes[i] == spte) {
954                                         pte_list_desc_remove_entry(rmap_head,
955                                                         desc, i, prev_desc);
956                                         return;
957                                 }
958                         }
959                         prev_desc = desc;
960                         desc = desc->more;
961                 }
962                 pr_err("%s: %p many->many\n", __func__, spte);
963                 BUG();
964         }
965 }
966
967 static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
968                             u64 *sptep)
969 {
970         mmu_spte_clear_track_bits(kvm, sptep);
971         __pte_list_remove(sptep, rmap_head);
972 }
973
974 /* Return true if rmap existed, false otherwise */
975 static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
976 {
977         struct pte_list_desc *desc, *next;
978         int i;
979
980         if (!rmap_head->val)
981                 return false;
982
983         if (!(rmap_head->val & 1)) {
984                 mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
985                 goto out;
986         }
987
988         desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
989
990         for (; desc; desc = next) {
991                 for (i = 0; i < desc->spte_count; i++)
992                         mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
993                 next = desc->more;
994                 mmu_free_pte_list_desc(desc);
995         }
996 out:
997         /* rmap_head is meaningless now, remember to reset it */
998         rmap_head->val = 0;
999         return true;
1000 }
1001
1002 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
1003 {
1004         struct pte_list_desc *desc;
1005         unsigned int count = 0;
1006
1007         if (!rmap_head->val)
1008                 return 0;
1009         else if (!(rmap_head->val & 1))
1010                 return 1;
1011
1012         desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1013
1014         while (desc) {
1015                 count += desc->spte_count;
1016                 desc = desc->more;
1017         }
1018
1019         return count;
1020 }
1021
1022 static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
1023                                          const struct kvm_memory_slot *slot)
1024 {
1025         unsigned long idx;
1026
1027         idx = gfn_to_index(gfn, slot->base_gfn, level);
1028         return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
1029 }
1030
1031 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1032 {
1033         struct kvm_mmu_memory_cache *mc;
1034
1035         mc = &vcpu->arch.mmu_pte_list_desc_cache;
1036         return kvm_mmu_memory_cache_nr_free_objects(mc);
1037 }
1038
1039 static void rmap_remove(struct kvm *kvm, u64 *spte)
1040 {
1041         struct kvm_memslots *slots;
1042         struct kvm_memory_slot *slot;
1043         struct kvm_mmu_page *sp;
1044         gfn_t gfn;
1045         struct kvm_rmap_head *rmap_head;
1046
1047         sp = sptep_to_sp(spte);
1048         gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1049
1050         /*
1051          * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
1052          * so we have to determine which memslots to use based on context
1053          * information in sp->role.
1054          */
1055         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1056
1057         slot = __gfn_to_memslot(slots, gfn);
1058         rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1059
1060         __pte_list_remove(spte, rmap_head);
1061 }
1062
1063 /*
1064  * Used by the following functions to iterate through the sptes linked by a
1065  * rmap.  All fields are private and not assumed to be used outside.
1066  */
1067 struct rmap_iterator {
1068         /* private fields */
1069         struct pte_list_desc *desc;     /* holds the sptep if not NULL */
1070         int pos;                        /* index of the sptep */
1071 };
1072
1073 /*
1074  * Iteration must be started by this function.  This should also be used after
1075  * removing/dropping sptes from the rmap link because in such cases the
1076  * information in the iterator may not be valid.
1077  *
1078  * Returns sptep if found, NULL otherwise.
1079  */
1080 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1081                            struct rmap_iterator *iter)
1082 {
1083         u64 *sptep;
1084
1085         if (!rmap_head->val)
1086                 return NULL;
1087
1088         if (!(rmap_head->val & 1)) {
1089                 iter->desc = NULL;
1090                 sptep = (u64 *)rmap_head->val;
1091                 goto out;
1092         }
1093
1094         iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1095         iter->pos = 0;
1096         sptep = iter->desc->sptes[iter->pos];
1097 out:
1098         BUG_ON(!is_shadow_present_pte(*sptep));
1099         return sptep;
1100 }
1101
1102 /*
1103  * Must be used with a valid iterator: e.g. after rmap_get_first().
1104  *
1105  * Returns sptep if found, NULL otherwise.
1106  */
1107 static u64 *rmap_get_next(struct rmap_iterator *iter)
1108 {
1109         u64 *sptep;
1110
1111         if (iter->desc) {
1112                 if (iter->pos < PTE_LIST_EXT - 1) {
1113                         ++iter->pos;
1114                         sptep = iter->desc->sptes[iter->pos];
1115                         if (sptep)
1116                                 goto out;
1117                 }
1118
1119                 iter->desc = iter->desc->more;
1120
1121                 if (iter->desc) {
1122                         iter->pos = 0;
1123                         /* desc->sptes[0] cannot be NULL */
1124                         sptep = iter->desc->sptes[iter->pos];
1125                         goto out;
1126                 }
1127         }
1128
1129         return NULL;
1130 out:
1131         BUG_ON(!is_shadow_present_pte(*sptep));
1132         return sptep;
1133 }
1134
1135 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)                 \
1136         for (_spte_ = rmap_get_first(_rmap_head_, _iter_);              \
1137              _spte_; _spte_ = rmap_get_next(_iter_))
1138
1139 static void drop_spte(struct kvm *kvm, u64 *sptep)
1140 {
1141         u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
1142
1143         if (is_shadow_present_pte(old_spte))
1144                 rmap_remove(kvm, sptep);
1145 }
1146
1147
1148 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1149 {
1150         if (is_large_pte(*sptep)) {
1151                 WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
1152                 drop_spte(kvm, sptep);
1153                 return true;
1154         }
1155
1156         return false;
1157 }
1158
1159 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1160 {
1161         if (__drop_large_spte(vcpu->kvm, sptep)) {
1162                 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
1163
1164                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1165                         KVM_PAGES_PER_HPAGE(sp->role.level));
1166         }
1167 }
1168
1169 /*
1170  * Write-protect on the specified @sptep, @pt_protect indicates whether
1171  * spte write-protection is caused by protecting shadow page table.
1172  *
1173  * Note: write protection differs between dirty logging and spte
1174  * protection:
1175  * - for dirty logging, the spte can be set to writable at anytime if
1176  *   its dirty bitmap is properly set.
1177  * - for spte protection, the spte can be writable only after unsync-ing
1178  *   shadow page.
1179  *
1180  * Return true if the TLB needs to be flushed.
1181  */
1182 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1183 {
1184         u64 spte = *sptep;
1185
1186         if (!is_writable_pte(spte) &&
1187               !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1188                 return false;
1189
1190         rmap_printk("spte %p %llx\n", sptep, *sptep);
1191
1192         if (pt_protect)
1193                 spte &= ~shadow_mmu_writable_mask;
1194         spte = spte & ~PT_WRITABLE_MASK;
1195
1196         return mmu_spte_update(sptep, spte);
1197 }
1198
1199 static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
1200                                bool pt_protect)
1201 {
1202         u64 *sptep;
1203         struct rmap_iterator iter;
1204         bool flush = false;
1205
1206         for_each_rmap_spte(rmap_head, &iter, sptep)
1207                 flush |= spte_write_protect(sptep, pt_protect);
1208
1209         return flush;
1210 }
1211
1212 static bool spte_clear_dirty(u64 *sptep)
1213 {
1214         u64 spte = *sptep;
1215
1216         rmap_printk("spte %p %llx\n", sptep, *sptep);
1217
1218         MMU_WARN_ON(!spte_ad_enabled(spte));
1219         spte &= ~shadow_dirty_mask;
1220         return mmu_spte_update(sptep, spte);
1221 }
1222
1223 static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1224 {
1225         bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1226                                                (unsigned long *)sptep);
1227         if (was_writable && !spte_ad_enabled(*sptep))
1228                 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1229
1230         return was_writable;
1231 }
1232
1233 /*
1234  * Gets the GFN ready for another round of dirty logging by clearing the
1235  *      - D bit on ad-enabled SPTEs, and
1236  *      - W bit on ad-disabled SPTEs.
1237  * Returns true iff any D or W bits were cleared.
1238  */
1239 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1240                                const struct kvm_memory_slot *slot)
1241 {
1242         u64 *sptep;
1243         struct rmap_iterator iter;
1244         bool flush = false;
1245
1246         for_each_rmap_spte(rmap_head, &iter, sptep)
1247                 if (spte_ad_need_write_protect(*sptep))
1248                         flush |= spte_wrprot_for_clear_dirty(sptep);
1249                 else
1250                         flush |= spte_clear_dirty(sptep);
1251
1252         return flush;
1253 }
1254
1255 /**
1256  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1257  * @kvm: kvm instance
1258  * @slot: slot to protect
1259  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1260  * @mask: indicates which pages we should protect
1261  *
1262  * Used when we do not need to care about huge page mappings.
1263  */
1264 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1265                                      struct kvm_memory_slot *slot,
1266                                      gfn_t gfn_offset, unsigned long mask)
1267 {
1268         struct kvm_rmap_head *rmap_head;
1269
1270         if (is_tdp_mmu_enabled(kvm))
1271                 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1272                                 slot->base_gfn + gfn_offset, mask, true);
1273
1274         if (!kvm_memslots_have_rmaps(kvm))
1275                 return;
1276
1277         while (mask) {
1278                 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1279                                         PG_LEVEL_4K, slot);
1280                 rmap_write_protect(rmap_head, false);
1281
1282                 /* clear the first set bit */
1283                 mask &= mask - 1;
1284         }
1285 }
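/*
 * Example: with gfn_offset == 0 and mask == 0x5, the loop above write
 * protects the 4K mappings of slot->base_gfn + 0 and slot->base_gfn + 2,
 * consuming one set bit of the mask per iteration (mask &= mask - 1).
 */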
1286
1287 /**
1288  * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1289  * protect the page if the D-bit isn't supported.
1290  * @kvm: kvm instance
1291  * @slot: slot to clear D-bit
1292  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1293  * @mask: indicates which pages we should clear D-bit
1294  *
1295  * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1296  */
1297 static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1298                                          struct kvm_memory_slot *slot,
1299                                          gfn_t gfn_offset, unsigned long mask)
1300 {
1301         struct kvm_rmap_head *rmap_head;
1302
1303         if (is_tdp_mmu_enabled(kvm))
1304                 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1305                                 slot->base_gfn + gfn_offset, mask, false);
1306
1307         if (!kvm_memslots_have_rmaps(kvm))
1308                 return;
1309
1310         while (mask) {
1311                 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1312                                         PG_LEVEL_4K, slot);
1313                 __rmap_clear_dirty(kvm, rmap_head, slot);
1314
1315                 /* clear the first set bit */
1316                 mask &= mask - 1;
1317         }
1318 }
1319
1320 /**
1321  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1322  * PT level pages.
1323  *
1324  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1325  * enable dirty logging for them.
1326  *
1327  * We need to care about huge page mappings: e.g. during dirty logging we may
1328  * have such mappings.
1329  */
1330 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1331                                 struct kvm_memory_slot *slot,
1332                                 gfn_t gfn_offset, unsigned long mask)
1333 {
1334         /*
1335          * Huge pages are NOT write protected when we start dirty logging in
1336          * initially-all-set mode; must write protect them here so that they
1337          * are split to 4K on the first write.
1338          *
1339          * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
1340          * of memslot has no such restriction, so the range can cross two large
1341          * pages.
1342          */
1343         if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
1344                 gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
1345                 gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
1346
1347                 if (READ_ONCE(eager_page_split))
1348                         kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
1349
1350                 kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
1351
1352                 /* Cross two large pages? */
1353                 if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
1354                     ALIGN(end << PAGE_SHIFT, PMD_SIZE))
1355                         kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
1356                                                        PG_LEVEL_2M);
1357         }
1358
1359         /* Now handle 4K PTEs.  */
1360         if (kvm_x86_ops.cpu_dirty_log_size)
1361                 kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
1362         else
1363                 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1364 }
1365
1366 int kvm_cpu_dirty_log_size(void)
1367 {
1368         return kvm_x86_ops.cpu_dirty_log_size;
1369 }
1370
1371 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1372                                     struct kvm_memory_slot *slot, u64 gfn,
1373                                     int min_level)
1374 {
1375         struct kvm_rmap_head *rmap_head;
1376         int i;
1377         bool write_protected = false;
1378
1379         if (kvm_memslots_have_rmaps(kvm)) {
1380                 for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
1381                         rmap_head = gfn_to_rmap(gfn, i, slot);
1382                         write_protected |= rmap_write_protect(rmap_head, true);
1383                 }
1384         }
1385
1386         if (is_tdp_mmu_enabled(kvm))
1387                 write_protected |=
1388                         kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
1389
1390         return write_protected;
1391 }
1392
1393 static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
1394 {
1395         struct kvm_memory_slot *slot;
1396
1397         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1398         return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
1399 }
1400
1401 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1402                           const struct kvm_memory_slot *slot)
1403 {
1404         return pte_list_destroy(kvm, rmap_head);
1405 }
1406
1407 static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1408                             struct kvm_memory_slot *slot, gfn_t gfn, int level,
1409                             pte_t unused)
1410 {
1411         return kvm_zap_rmapp(kvm, rmap_head, slot);
1412 }
1413
1414 static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1415                               struct kvm_memory_slot *slot, gfn_t gfn, int level,
1416                               pte_t pte)
1417 {
1418         u64 *sptep;
1419         struct rmap_iterator iter;
1420         bool need_flush = false;
1421         u64 new_spte;
1422         kvm_pfn_t new_pfn;
1423
1424         WARN_ON(pte_huge(pte));
1425         new_pfn = pte_pfn(pte);
1426
1427 restart:
1428         for_each_rmap_spte(rmap_head, &iter, sptep) {
1429                 rmap_printk("spte %p %llx gfn %llx (%d)\n",
1430                             sptep, *sptep, gfn, level);
1431
1432                 need_flush = true;
1433
1434                 if (pte_write(pte)) {
1435                         pte_list_remove(kvm, rmap_head, sptep);
1436                         goto restart;
1437                 } else {
1438                         new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1439                                         *sptep, new_pfn);
1440
1441                         mmu_spte_clear_track_bits(kvm, sptep);
1442                         mmu_spte_set(sptep, new_spte);
1443                 }
1444         }
1445
1446         if (need_flush && kvm_available_flush_tlb_with_range()) {
1447                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1448                 return false;
1449         }
1450
1451         return need_flush;
1452 }
1453
1454 struct slot_rmap_walk_iterator {
1455         /* input fields. */
1456         const struct kvm_memory_slot *slot;
1457         gfn_t start_gfn;
1458         gfn_t end_gfn;
1459         int start_level;
1460         int end_level;
1461
1462         /* output fields. */
1463         gfn_t gfn;
1464         struct kvm_rmap_head *rmap;
1465         int level;
1466
1467         /* private field. */
1468         struct kvm_rmap_head *end_rmap;
1469 };
1470
1471 static void
1472 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1473 {
1474         iterator->level = level;
1475         iterator->gfn = iterator->start_gfn;
1476         iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
1477         iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
1478 }
1479
1480 static void
1481 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1482                     const struct kvm_memory_slot *slot, int start_level,
1483                     int end_level, gfn_t start_gfn, gfn_t end_gfn)
1484 {
1485         iterator->slot = slot;
1486         iterator->start_level = start_level;
1487         iterator->end_level = end_level;
1488         iterator->start_gfn = start_gfn;
1489         iterator->end_gfn = end_gfn;
1490
1491         rmap_walk_init_level(iterator, iterator->start_level);
1492 }
1493
1494 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1495 {
1496         return !!iterator->rmap;
1497 }
1498
1499 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1500 {
1501         if (++iterator->rmap <= iterator->end_rmap) {
1502                 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1503                 return;
1504         }
1505
1506         if (++iterator->level > iterator->end_level) {
1507                 iterator->rmap = NULL;
1508                 return;
1509         }
1510
1511         rmap_walk_init_level(iterator, iterator->level);
1512 }
1513
1514 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,    \
1515            _start_gfn, _end_gfn, _iter_)                                \
1516         for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,         \
1517                                  _end_level_, _start_gfn, _end_gfn);    \
1518              slot_rmap_walk_okay(_iter_);                               \
1519              slot_rmap_walk_next(_iter_))
1520
1521 typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1522                                struct kvm_memory_slot *slot, gfn_t gfn,
1523                                int level, pte_t pte);
1524
1525 static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
1526                                                  struct kvm_gfn_range *range,
1527                                                  rmap_handler_t handler)
1528 {
1529         struct slot_rmap_walk_iterator iterator;
1530         bool ret = false;
1531
1532         for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
1533                                  range->start, range->end - 1, &iterator)
1534                 ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
1535                                iterator.level, range->pte);
1536
1537         return ret;
1538 }
1539
1540 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1541 {
1542         bool flush = false;
1543
1544         if (kvm_memslots_have_rmaps(kvm))
1545                 flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
1546
1547         if (is_tdp_mmu_enabled(kvm))
1548                 flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
1549
1550         return flush;
1551 }
1552
1553 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1554 {
1555         bool flush = false;
1556
1557         if (kvm_memslots_have_rmaps(kvm))
1558                 flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
1559
1560         if (is_tdp_mmu_enabled(kvm))
1561                 flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
1562
1563         return flush;
1564 }
1565
1566 static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1567                           struct kvm_memory_slot *slot, gfn_t gfn, int level,
1568                           pte_t unused)
1569 {
1570         u64 *sptep;
1571         struct rmap_iterator iter;
1572         int young = 0;
1573
1574         for_each_rmap_spte(rmap_head, &iter, sptep)
1575                 young |= mmu_spte_age(sptep);
1576
1577         return young;
1578 }
1579
1580 static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1581                                struct kvm_memory_slot *slot, gfn_t gfn,
1582                                int level, pte_t unused)
1583 {
1584         u64 *sptep;
1585         struct rmap_iterator iter;
1586
1587         for_each_rmap_spte(rmap_head, &iter, sptep)
1588                 if (is_accessed_spte(*sptep))
1589                         return true;
1590         return false;
1591 }
1592
1593 #define RMAP_RECYCLE_THRESHOLD 1000
1594
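/*
 * Record the gfn backing the new SPTE and add the SPTE to the gfn's rmap.
 * If the rmap grows beyond RMAP_RECYCLE_THRESHOLD, zap the existing entries
 * and flush TLBs to keep the rmap chain bounded.
 */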
1595 static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
1596                      u64 *spte, gfn_t gfn)
1597 {
1598         struct kvm_mmu_page *sp;
1599         struct kvm_rmap_head *rmap_head;
1600         int rmap_count;
1601
1602         sp = sptep_to_sp(spte);
1603         kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1604         rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1605         rmap_count = pte_list_add(vcpu, spte, rmap_head);
1606
1607         if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
1608                 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
1609                 kvm_flush_remote_tlbs_with_address(
1610                                 vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
1611         }
1612 }
1613
1614 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1615 {
1616         bool young = false;
1617
1618         if (kvm_memslots_have_rmaps(kvm))
1619                 young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
1620
1621         if (is_tdp_mmu_enabled(kvm))
1622                 young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
1623
1624         return young;
1625 }
1626
1627 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1628 {
1629         bool young = false;
1630
1631         if (kvm_memslots_have_rmaps(kvm))
1632                 young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
1633
1634         if (is_tdp_mmu_enabled(kvm))
1635                 young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
1636
1637         return young;
1638 }
1639
1640 #ifdef MMU_DEBUG
1641 static int is_empty_shadow_page(u64 *spt)
1642 {
1643         u64 *pos;
1644         u64 *end;
1645
1646         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
1647                 if (is_shadow_present_pte(*pos)) {
1648                         printk(KERN_ERR "%s: %p %llx\n", __func__,
1649                                pos, *pos);
1650                         return 0;
1651                 }
1652         return 1;
1653 }
1654 #endif
1655
1656 /*
1657  * This value is the sum of all of the kvm instances'
1658  * kvm->arch.n_used_mmu_pages values.  We need a global,
1659  * aggregate version in order to make the slab shrinker
1660  * faster.
1661  */
1662 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
1663 {
1664         kvm->arch.n_used_mmu_pages += nr;
1665         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1666 }
1667
1668 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1669 {
1670         MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
1671         hlist_del(&sp->hash_link);
1672         list_del(&sp->link);
1673         free_page((unsigned long)sp->spt);
1674         if (!sp->role.direct)
1675                 free_page((unsigned long)sp->gfns);
1676         kmem_cache_free(mmu_page_header_cache, sp);
1677 }
1678
1679 static unsigned kvm_page_table_hashfn(gfn_t gfn)
1680 {
1681         return hash_64(gfn, KVM_MMU_HASH_SHIFT);
1682 }
1683
1684 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1685                                     struct kvm_mmu_page *sp, u64 *parent_pte)
1686 {
1687         if (!parent_pte)
1688                 return;
1689
1690         pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
1691 }
1692
1693 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1694                                        u64 *parent_pte)
1695 {
1696         __pte_list_remove(parent_pte, &sp->parent_ptes);
1697 }
1698
1699 static void drop_parent_pte(struct kvm_mmu_page *sp,
1700                             u64 *parent_pte)
1701 {
1702         mmu_page_remove_parent_pte(sp, parent_pte);
1703         mmu_spte_clear_no_track(parent_pte);
1704 }
1705
1706 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
1707 {
1708         struct kvm_mmu_page *sp;
1709
1710         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
1711         sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
1712         if (!direct)
1713                 sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
1714         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1715
1716         /*
1717          * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
1718          * depends on valid pages being added to the head of the list.  See
1719          * comments in kvm_zap_obsolete_pages().
1720          */
1721         sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
1722         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1723         kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1724         return sp;
1725 }
1726
1727 static void mark_unsync(u64 *spte);
1728 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1729 {
1730         u64 *sptep;
1731         struct rmap_iterator iter;
1732
1733         for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
1734                 mark_unsync(sptep);
1735         }
1736 }
1737
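/*
 * Flag the child page referenced by the SPTE as unsync in its parent's
 * unsync_child_bitmap and propagate the unsync state up through all parent
 * SPTEs.  The walk stops at pages that already track unsync children.
 */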
1738 static void mark_unsync(u64 *spte)
1739 {
1740         struct kvm_mmu_page *sp;
1741         unsigned int index;
1742
1743         sp = sptep_to_sp(spte);
1744         index = spte - sp->spt;
1745         if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1746                 return;
1747         if (sp->unsync_children++)
1748                 return;
1749         kvm_mmu_mark_parents_unsync(sp);
1750 }
1751
1752 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1753                                struct kvm_mmu_page *sp)
1754 {
1755         return -1;
1756 }
1757
1758 #define KVM_PAGE_ARRAY_NR 16
1759
1760 struct kvm_mmu_pages {
1761         struct mmu_page_and_offset {
1762                 struct kvm_mmu_page *sp;
1763                 unsigned int idx;
1764         } page[KVM_PAGE_ARRAY_NR];
1765         unsigned int nr;
1766 };
1767
1768 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1769                          int idx)
1770 {
1771         int i;
1772
1773         if (sp->unsync)
1774                 for (i = 0; i < pvec->nr; i++)
1775                         if (pvec->page[i].sp == sp)
1776                                 return 0;
1777
1778         pvec->page[pvec->nr].sp = sp;
1779         pvec->page[pvec->nr].idx = idx;
1780         pvec->nr++;
1781         return (pvec->nr == KVM_PAGE_ARRAY_NR);
1782 }
1783
1784 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
1785 {
1786         --sp->unsync_children;
1787         WARN_ON((int)sp->unsync_children < 0);
1788         __clear_bit(idx, sp->unsync_child_bitmap);
1789 }
1790
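/*
 * Recursively walk sp's unsync_child_bitmap and collect unsync leaf pages
 * (and the intermediate pages leading to them) into pvec.  Returns the
 * number of unsync leaves found, or -ENOSPC if pvec is full.  Bits for
 * children that are no longer present or unsync are cleared along the way.
 */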
1791 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1792                            struct kvm_mmu_pages *pvec)
1793 {
1794         int i, ret, nr_unsync_leaf = 0;
1795
1796         for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
1797                 struct kvm_mmu_page *child;
1798                 u64 ent = sp->spt[i];
1799
1800                 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
1801                         clear_unsync_child_bit(sp, i);
1802                         continue;
1803                 }
1804
1805                 child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
1806
1807                 if (child->unsync_children) {
1808                         if (mmu_pages_add(pvec, child, i))
1809                                 return -ENOSPC;
1810
1811                         ret = __mmu_unsync_walk(child, pvec);
1812                         if (!ret) {
1813                                 clear_unsync_child_bit(sp, i);
1814                                 continue;
1815                         } else if (ret > 0) {
1816                                 nr_unsync_leaf += ret;
1817                         } else
1818                                 return ret;
1819                 } else if (child->unsync) {
1820                         nr_unsync_leaf++;
1821                         if (mmu_pages_add(pvec, child, i))
1822                                 return -ENOSPC;
1823                 } else
1824                         clear_unsync_child_bit(sp, i);
1825         }
1826
1827         return nr_unsync_leaf;
1828 }
1829
1830 #define INVALID_INDEX (-1)
1831
1832 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1833                            struct kvm_mmu_pages *pvec)
1834 {
1835         pvec->nr = 0;
1836         if (!sp->unsync_children)
1837                 return 0;
1838
1839         mmu_pages_add(pvec, sp, INVALID_INDEX);
1840         return __mmu_unsync_walk(sp, pvec);
1841 }
1842
1843 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1844 {
1845         WARN_ON(!sp->unsync);
1846         trace_kvm_mmu_sync_page(sp);
1847         sp->unsync = 0;
1848         --kvm->stat.mmu_unsync;
1849 }
1850
1851 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1852                                      struct list_head *invalid_list);
1853 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1854                                     struct list_head *invalid_list);
1855
1856 #define for_each_valid_sp(_kvm, _sp, _list)                             \
1857         hlist_for_each_entry(_sp, _list, hash_link)                     \
1858                 if (is_obsolete_sp((_kvm), (_sp))) {                    \
1859                 } else
1860
1861 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                 \
1862         for_each_valid_sp(_kvm, _sp,                                    \
1863           &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])     \
1864                 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
1865
1866 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1867                          struct list_head *invalid_list)
1868 {
1869         int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
1870
1871         if (ret < 0)
1872                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1873         return ret;
1874 }
1875
1876 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
1877                                         struct list_head *invalid_list,
1878                                         bool remote_flush)
1879 {
1880         if (!remote_flush && list_empty(invalid_list))
1881                 return false;
1882
1883         if (!list_empty(invalid_list))
1884                 kvm_mmu_commit_zap_page(kvm, invalid_list);
1885         else
1886                 kvm_flush_remote_tlbs(kvm);
1887         return true;
1888 }
1889
1890 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
1891 {
1892         if (sp->role.invalid)
1893                 return true;
1894
1895         /* TDP MMU pages do not use the MMU generation. */
1896         return !sp->tdp_mmu_page &&
1897                unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
1898 }
1899
1900 struct mmu_page_path {
1901         struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
1902         unsigned int idx[PT64_ROOT_MAX_LEVEL];
1903 };
1904
1905 #define for_each_sp(pvec, sp, parents, i)                       \
1906                 for (i = mmu_pages_first(&pvec, &parents);      \
1907                         i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
1908                         i = mmu_pages_next(&pvec, &parents, i))
1909
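/*
 * Advance to the next leaf (4K level) page in pvec, filling in the
 * mmu_page_path along the way: each visited page records its index within
 * its parent, and non-leaf pages are recorded as the parent of the next
 * lower level.
 */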
1910 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1911                           struct mmu_page_path *parents,
1912                           int i)
1913 {
1914         int n;
1915
1916         for (n = i+1; n < pvec->nr; n++) {
1917                 struct kvm_mmu_page *sp = pvec->page[n].sp;
1918                 unsigned idx = pvec->page[n].idx;
1919                 int level = sp->role.level;
1920
1921                 parents->idx[level-1] = idx;
1922                 if (level == PG_LEVEL_4K)
1923                         break;
1924
1925                 parents->parent[level-2] = sp;
1926         }
1927
1928         return n;
1929 }
1930
1931 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
1932                            struct mmu_page_path *parents)
1933 {
1934         struct kvm_mmu_page *sp;
1935         int level;
1936
1937         if (pvec->nr == 0)
1938                 return 0;
1939
1940         WARN_ON(pvec->page[0].idx != INVALID_INDEX);
1941
1942         sp = pvec->page[0].sp;
1943         level = sp->role.level;
1944         WARN_ON(level == PG_LEVEL_4K);
1945
1946         parents->parent[level-2] = sp;
1947
1948         /* Also set up a sentinel.  Further entries in pvec are all
1949          * children of sp, so this element is never overwritten.
1950          */
1951         parents->parent[level-1] = NULL;
1952         return mmu_pages_next(pvec, parents, 0);
1953 }
1954
1955 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1956 {
1957         struct kvm_mmu_page *sp;
1958         unsigned int level = 0;
1959
1960         do {
1961                 unsigned int idx = parents->idx[level];
1962                 sp = parents->parent[level];
1963                 if (!sp)
1964                         return;
1965
1966                 WARN_ON(idx == INVALID_INDEX);
1967                 clear_unsync_child_bit(sp, idx);
1968                 level++;
1969         } while (!sp->unsync_children);
1970 }
1971
1972 static int mmu_sync_children(struct kvm_vcpu *vcpu,
1973                              struct kvm_mmu_page *parent, bool can_yield)
1974 {
1975         int i;
1976         struct kvm_mmu_page *sp;
1977         struct mmu_page_path parents;
1978         struct kvm_mmu_pages pages;
1979         LIST_HEAD(invalid_list);
1980         bool flush = false;
1981
1982         while (mmu_unsync_walk(parent, &pages)) {
1983                 bool protected = false;
1984
1985                 for_each_sp(pages, sp, parents, i)
1986                         protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
1987
1988                 if (protected) {
1989                         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
1990                         flush = false;
1991                 }
1992
1993                 for_each_sp(pages, sp, parents, i) {
1994                         kvm_unlink_unsync_page(vcpu->kvm, sp);
1995                         flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
1996                         mmu_pages_clear_parents(&parents);
1997                 }
1998                 if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
1999                         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
2000                         if (!can_yield) {
2001                                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2002                                 return -EINTR;
2003                         }
2004
2005                         cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
2006                         flush = false;
2007                 }
2008         }
2009
2010         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
2011         return 0;
2012 }
2013
2014 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2015 {
2016         atomic_set(&sp->write_flooding_count, 0);
2017 }
2018
2019 static void clear_sp_write_flooding_count(u64 *spte)
2020 {
2021         __clear_sp_write_flooding_count(sptep_to_sp(spte));
2022 }
2023
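/*
 * Find a shadow page for the given gfn and role in the page hash, syncing a
 * matching unsync page before reusing it and zapping unsync pages for the
 * gfn when a conflicting upper-level page is being created.  On a miss,
 * allocate a new shadow page, insert it into the hash and, for indirect
 * pages, account the shadowed gfn and, for 4K pages, write-protect it.
 */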
2024 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2025                                              gfn_t gfn,
2026                                              gva_t gaddr,
2027                                              unsigned level,
2028                                              int direct,
2029                                              unsigned int access)
2030 {
2031         bool direct_mmu = vcpu->arch.mmu->direct_map;
2032         union kvm_mmu_page_role role;
2033         struct hlist_head *sp_list;
2034         unsigned quadrant;
2035         struct kvm_mmu_page *sp;
2036         int ret;
2037         int collisions = 0;
2038         LIST_HEAD(invalid_list);
2039
2040         role = vcpu->arch.mmu->root_role;
2041         role.level = level;
2042         role.direct = direct;
2043         role.access = access;
2044         if (role.has_4_byte_gpte) {
2045                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2046                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2047                 role.quadrant = quadrant;
2048         }
2049
2050         sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
2051         for_each_valid_sp(vcpu->kvm, sp, sp_list) {
2052                 if (sp->gfn != gfn) {
2053                         collisions++;
2054                         continue;
2055                 }
2056
2057                 if (sp->role.word != role.word) {
2058                         /*
2059                          * If the guest is creating an upper-level page, zap
2060                          * unsync pages for the same gfn.  While it's possible
2061                          * the guest is using recursive page tables, in all
2062                          * likelihood the guest has stopped using the unsync
2063                          * page and is installing a completely unrelated page.
2064                          * Unsync pages must not be left as is, because the new
2065                          * upper-level page will be write-protected.
2066                          */
2067                         if (level > PG_LEVEL_4K && sp->unsync)
2068                                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2069                                                          &invalid_list);
2070                         continue;
2071                 }
2072
2073                 if (direct_mmu)
2074                         goto trace_get_page;
2075
2076                 if (sp->unsync) {
2077                         /*
2078                          * The page is good, but is stale.  kvm_sync_page does
2079                          * get the latest guest state, but (unlike mmu_sync_children)
2080                          * it doesn't write-protect the page or mark it synchronized!
2081                          * This way the validity of the mapping is ensured, but the
2082                          * overhead of write protection is not incurred until the
2083                          * guest invalidates the TLB mapping.  This allows multiple
2084                          * SPs for a single gfn to be unsync.
2085                          *
2086                          * If the sync fails, the page is zapped.  If so, break
2087                          * in order to rebuild it.
2088                          */
2089                         ret = kvm_sync_page(vcpu, sp, &invalid_list);
2090                         if (ret < 0)
2091                                 break;
2092
2093                         WARN_ON(!list_empty(&invalid_list));
2094                         if (ret > 0)
2095                                 kvm_flush_remote_tlbs(vcpu->kvm);
2096                 }
2097
2098                 __clear_sp_write_flooding_count(sp);
2099
2100 trace_get_page:
2101                 trace_kvm_mmu_get_page(sp, false);
2102                 goto out;
2103         }
2104
2105         ++vcpu->kvm->stat.mmu_cache_miss;
2106
2107         sp = kvm_mmu_alloc_page(vcpu, direct);
2108
2109         sp->gfn = gfn;
2110         sp->role = role;
2111         hlist_add_head(&sp->hash_link, sp_list);
2112         if (!direct) {
2113                 account_shadowed(vcpu->kvm, sp);
2114                 if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
2115                         kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2116         }
2117         trace_kvm_mmu_get_page(sp, true);
2118 out:
2119         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2120
2121         if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2122                 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2123         return sp;
2124 }
2125
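/*
 * Initialize a shadow page table walk for the given address, starting at
 * the given root.  For PAE-format roots (including 64-bit hosts shadowing a
 * 32-bit guest), the walk starts one level down at the pae_root entry
 * selected by bits 31:30 of the address.
 */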
2126 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2127                                         struct kvm_vcpu *vcpu, hpa_t root,
2128                                         u64 addr)
2129 {
2130         iterator->addr = addr;
2131         iterator->shadow_addr = root;
2132         iterator->level = vcpu->arch.mmu->root_role.level;
2133
2134         if (iterator->level >= PT64_ROOT_4LEVEL &&
2135             vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
2136             !vcpu->arch.mmu->direct_map)
2137                 iterator->level = PT32E_ROOT_LEVEL;
2138
2139         if (iterator->level == PT32E_ROOT_LEVEL) {
2140                 /*
2141                  * prev_root is currently only used for 64-bit hosts. So only
2142                  * the active root_hpa is valid here.
2143                  */
2144                 BUG_ON(root != vcpu->arch.mmu->root.hpa);
2145
2146                 iterator->shadow_addr
2147                         = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2148                 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2149                 --iterator->level;
2150                 if (!iterator->shadow_addr)
2151                         iterator->level = 0;
2152         }
2153 }
2154
2155 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2156                              struct kvm_vcpu *vcpu, u64 addr)
2157 {
2158         shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
2159                                     addr);
2160 }
2161
2162 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2163 {
2164         if (iterator->level < PG_LEVEL_4K)
2165                 return false;
2166
2167         iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2168         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2169         return true;
2170 }
2171
2172 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2173                                u64 spte)
2174 {
2175         if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
2176                 iterator->level = 0;
2177                 return;
2178         }
2179
2180         iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2181         --iterator->level;
2182 }
2183
2184 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2185 {
2186         __shadow_walk_next(iterator, *iterator->sptep);
2187 }
2188
2189 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2190                              struct kvm_mmu_page *sp)
2191 {
2192         u64 spte;
2193
2194         BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2195
2196         spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
2197
2198         mmu_spte_set(sptep, spte);
2199
2200         mmu_page_add_parent_pte(vcpu, sp, sptep);
2201
2202         if (sp->unsync_children || sp->unsync)
2203                 mark_unsync(sptep);
2204 }
2205
2206 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2207                                    unsigned direct_access)
2208 {
2209         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2210                 struct kvm_mmu_page *child;
2211
2212                 /*
2213                  * For the direct sp, if the guest pte's dirty bit
2214                  * changed from clean to dirty, it will corrupt the
2215                  * sp's access, i.e. allow writes through the read-only sp,
2216                  * so we should update the spte at this point to get
2217                  * a new sp with the correct access.
2218                  */
2219                 child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
2220                 if (child->role.access == direct_access)
2221                         return;
2222
2223                 drop_parent_pte(child, sptep);
2224                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2225         }
2226 }
2227
2228 /* Returns the number of zapped non-leaf child shadow pages. */
2229 static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2230                             u64 *spte, struct list_head *invalid_list)
2231 {
2232         u64 pte;
2233         struct kvm_mmu_page *child;
2234
2235         pte = *spte;
2236         if (is_shadow_present_pte(pte)) {
2237                 if (is_last_spte(pte, sp->role.level)) {
2238                         drop_spte(kvm, spte);
2239                 } else {
2240                         child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
2241                         drop_parent_pte(child, spte);
2242
2243                         /*
2244                          * Recursively zap nested TDP SPs, as parentless SPs are
2245                          * unlikely to be used again in the near future.  This
2246                          * avoids retaining a large number of stale nested SPs.
2247                          */
2248                         if (tdp_enabled && invalid_list &&
2249                             child->role.guest_mode && !child->parent_ptes.val)
2250                                 return kvm_mmu_prepare_zap_page(kvm, child,
2251                                                                 invalid_list);
2252                 }
2253         } else if (is_mmio_spte(pte)) {
2254                 mmu_spte_clear_no_track(spte);
2255         }
2256         return 0;
2257 }
2258
2259 static int kvm_mmu_page_unlink_children(struct kvm *kvm,
2260                                         struct kvm_mmu_page *sp,
2261                                         struct list_head *invalid_list)
2262 {
2263         int zapped = 0;
2264         unsigned i;
2265
2266         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2267                 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
2268
2269         return zapped;
2270 }
2271
2272 static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
2273 {
2274         u64 *sptep;
2275         struct rmap_iterator iter;
2276
2277         while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2278                 drop_parent_pte(sp, sptep);
2279 }
2280
2281 static int mmu_zap_unsync_children(struct kvm *kvm,
2282                                    struct kvm_mmu_page *parent,
2283                                    struct list_head *invalid_list)
2284 {
2285         int i, zapped = 0;
2286         struct mmu_page_path parents;
2287         struct kvm_mmu_pages pages;
2288
2289         if (parent->role.level == PG_LEVEL_4K)
2290                 return 0;
2291
2292         while (mmu_unsync_walk(parent, &pages)) {
2293                 struct kvm_mmu_page *sp;
2294
2295                 for_each_sp(pages, sp, parents, i) {
2296                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2297                         mmu_pages_clear_parents(&parents);
2298                         zapped++;
2299                 }
2300         }
2301
2302         return zapped;
2303 }
2304
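/*
 * Unlink a shadow page from the MMU: zap its unsync children, unlink its
 * own children and parent SPTEs, and move it to invalid_list.  In-use roots
 * are instead removed from the active list and freed once their root_count
 * drops to zero.  Returns true if active_mmu_pages was perturbed, i.e. the
 * caller may need to restart its iteration.
 */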
2305 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2306                                        struct kvm_mmu_page *sp,
2307                                        struct list_head *invalid_list,
2308                                        int *nr_zapped)
2309 {
2310         bool list_unstable, zapped_root = false;
2311
2312         trace_kvm_mmu_prepare_zap_page(sp);
2313         ++kvm->stat.mmu_shadow_zapped;
2314         *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2315         *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
2316         kvm_mmu_unlink_parents(sp);
2317
2318         /* Zapping children means active_mmu_pages has become unstable. */
2319         list_unstable = *nr_zapped;
2320
2321         if (!sp->role.invalid && !sp->role.direct)
2322                 unaccount_shadowed(kvm, sp);
2323
2324         if (sp->unsync)
2325                 kvm_unlink_unsync_page(kvm, sp);
2326         if (!sp->root_count) {
2327                 /* Count self */
2328                 (*nr_zapped)++;
2329
2330                 /*
2331                  * Already invalid pages (previously active roots) are not on
2332                  * the active page list.  See list_del() in the "else" case of
2333                  * !sp->root_count.
2334                  */
2335                 if (sp->role.invalid)
2336                         list_add(&sp->link, invalid_list);
2337                 else
2338                         list_move(&sp->link, invalid_list);
2339                 kvm_mod_used_mmu_pages(kvm, -1);
2340         } else {
2341                 /*
2342                  * Remove the active root from the active page list, the root
2343                  * will be explicitly freed when the root_count hits zero.
2344                  */
2345                 list_del(&sp->link);
2346
2347                 /*
2348                  * Obsolete pages cannot be used on any vCPUs, see the comment
2349                  * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
2350                  * treats invalid shadow pages as being obsolete.
2351                  */
2352                 zapped_root = !is_obsolete_sp(kvm, sp);
2353         }
2354
2355         if (sp->lpage_disallowed)
2356                 unaccount_huge_nx_page(kvm, sp);
2357
2358         sp->role.invalid = 1;
2359
2360         /*
2361          * Make the request to free obsolete roots after marking the root
2362          * invalid, otherwise other vCPUs may not see it as invalid.
2363          */
2364         if (zapped_root)
2365                 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
2366         return list_unstable;
2367 }
2368
2369 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2370                                      struct list_head *invalid_list)
2371 {
2372         int nr_zapped;
2373
2374         __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2375         return nr_zapped;
2376 }
2377
2378 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2379                                     struct list_head *invalid_list)
2380 {
2381         struct kvm_mmu_page *sp, *nsp;
2382
2383         if (list_empty(invalid_list))
2384                 return;
2385
2386         /*
2387          * We need to make sure everyone sees our modifications to
2388          * the page tables and that we see changes to vcpu->mode here. The
2389          * barrier in kvm_flush_remote_tlbs() achieves this. This pairs
2390          * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2391          *
2392          * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2393          * guest mode and/or lockless shadow page table walks.
2394          */
2395         kvm_flush_remote_tlbs(kvm);
2396
2397         list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2398                 WARN_ON(!sp->role.invalid || sp->root_count);
2399                 kvm_mmu_free_page(sp);
2400         }
2401 }
2402
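/*
 * Zap up to nr_to_zap shadow pages, starting with the oldest entries on the
 * tail of active_mmu_pages and skipping in-use roots.  Returns the number
 * of pages zapped, including zapped children.
 */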
2403 static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
2404                                                   unsigned long nr_to_zap)
2405 {
2406         unsigned long total_zapped = 0;
2407         struct kvm_mmu_page *sp, *tmp;
2408         LIST_HEAD(invalid_list);
2409         bool unstable;
2410         int nr_zapped;
2411
2412         if (list_empty(&kvm->arch.active_mmu_pages))
2413                 return 0;
2414
2415 restart:
2416         list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
2417                 /*
2418                  * Don't zap active root pages, the page itself can't be freed
2419                  * and zapping it will just force vCPUs to realloc and reload.
2420                  */
2421                 if (sp->root_count)
2422                         continue;
2423
2424                 unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
2425                                                       &nr_zapped);
2426                 total_zapped += nr_zapped;
2427                 if (total_zapped >= nr_to_zap)
2428                         break;
2429
2430                 if (unstable)
2431                         goto restart;
2432         }
2433
2434         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2435
2436         kvm->stat.mmu_recycled += total_zapped;
2437         return total_zapped;
2438 }
2439
2440 static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
2441 {
2442         if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
2443                 return kvm->arch.n_max_mmu_pages -
2444                         kvm->arch.n_used_mmu_pages;
2445
2446         return 0;
2447 }
2448
2449 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2450 {
2451         unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
2452
2453         if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
2454                 return 0;
2455
2456         kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
2457
2458         /*
2459          * Note, this check is intentionally soft, it only guarantees that one
2460          * page is available, while the caller may end up allocating as many as
2461          * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
2462          * exceeding the (arbitrary by default) limit will not harm the host,
2463          * being too aggressive may unnecessarily kill the guest, and getting an
2464          * exact count is far more trouble than it's worth, especially in the
2465          * page fault paths.
2466          */
2467         if (!kvm_mmu_available_pages(vcpu->kvm))
2468                 return -ENOSPC;
2469         return 0;
2470 }
2471
2472 /*
2473  * Changing the number of mmu pages allocated to the VM.
2474  * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
2475  */
2476 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2477 {
2478         write_lock(&kvm->mmu_lock);
2479
2480         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2481                 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
2482                                                   goal_nr_mmu_pages);
2483
2484                 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2485         }
2486
2487         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2488
2489         write_unlock(&kvm->mmu_lock);
2490 }
2491
2492 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2493 {
2494         struct kvm_mmu_page *sp;
2495         LIST_HEAD(invalid_list);
2496         int r;
2497
2498         pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2499         r = 0;
2500         write_lock(&kvm->mmu_lock);
2501         for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2502                 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2503                          sp->role.word);
2504                 r = 1;
2505                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2506         }
2507         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2508         write_unlock(&kvm->mmu_lock);
2509
2510         return r;
2511 }
2512
2513 static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2514 {
2515         gpa_t gpa;
2516         int r;
2517
2518         if (vcpu->arch.mmu->direct_map)
2519                 return 0;
2520
2521         gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2522
2523         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2524
2525         return r;
2526 }
2527
2528 static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2529 {
2530         trace_kvm_mmu_unsync_page(sp);
2531         ++kvm->stat.mmu_unsync;
2532         sp->unsync = 1;
2533
2534         kvm_mmu_mark_parents_unsync(sp);
2535 }
2536
2537 /*
2538  * Attempt to unsync any shadow pages that can be reached by the specified gfn,
2539  * as KVM is creating a writable mapping for said gfn.  Returns 0 if all pages
2540  * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
2541  * be write-protected.
2542  */
2543 int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
2544                             gfn_t gfn, bool can_unsync, bool prefetch)
2545 {
2546         struct kvm_mmu_page *sp;
2547         bool locked = false;
2548
2549         /*
2550          * Force write-protection if the page is being tracked.  Note, the page
2551          * track machinery is used to write-protect upper-level shadow pages,
2552          * i.e. this guards the role.level == 4K assertion below!
2553          */
2554         if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
2555                 return -EPERM;
2556
2557         /*
2558          * The page is not write-tracked, mark existing shadow pages unsync
2559          * unless KVM is synchronizing an unsync SP (can_unsync = false).  In
2560          * that case, KVM must complete emulation of the guest TLB flush before
2561          * allowing shadow pages to become unsync (writable by the guest).
2562          */
2563         for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2564                 if (!can_unsync)
2565                         return -EPERM;
2566
2567                 if (sp->unsync)
2568                         continue;
2569
2570                 if (prefetch)
2571                         return -EEXIST;
2572
2573                 /*
2574                  * TDP MMU page faults require an additional spinlock as they
2575                  * run with mmu_lock held for read, not write, and the unsync
2576                  * logic is not thread safe.  Take the spinlock regardless of
2577                  * the MMU type to avoid extra conditionals/parameters, there's
2578                  * no meaningful penalty if mmu_lock is held for write.
2579                  */
2580                 if (!locked) {
2581                         locked = true;
2582                         spin_lock(&kvm->arch.mmu_unsync_pages_lock);
2583
2584                         /*
2585                          * Recheck after taking the spinlock, a different vCPU
2586                          * may have since marked the page unsync.  A false
2587                          * positive on the unprotected check above is not
2588                          * possible as clearing sp->unsync _must_ hold mmu_lock
2589                          * for write, i.e. unsync cannot transition from 0->1
2590                          * while this CPU holds mmu_lock for read (or write).
2591                          */
2592                         if (READ_ONCE(sp->unsync))
2593                                 continue;
2594                 }
2595
2596                 WARN_ON(sp->role.level != PG_LEVEL_4K);
2597                 kvm_unsync_page(kvm, sp);
2598         }
2599         if (locked)
2600                 spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
2601
2602         /*
2603          * We need to ensure that the marking of unsync pages is visible
2604          * before the SPTE is updated to allow writes because
2605          * kvm_mmu_sync_roots() checks the unsync flags without holding
2606          * the MMU lock and so can race with this. If the SPTE was updated
2607          * before the page had been marked as unsync-ed, something like the
2608          * following could happen:
2609          *
2610          * CPU 1                    CPU 2
2611          * ---------------------------------------------------------------------
2612          * 1.2 Host updates SPTE
2613          *     to be writable
2614          *                      2.1 Guest writes a GPTE for GVA X.
2615          *                          (GPTE being in the guest page table shadowed
2616          *                           by the SP from CPU 1.)
2617          *                          This reads SPTE during the page table walk.
2618          *                          Since SPTE.W is read as 1, there is no
2619          *                          fault.
2620          *
2621          *                      2.2 Guest issues TLB flush.
2622          *                          That causes a VM Exit.
2623          *
2624          *                      2.3 Walking of unsync pages sees sp->unsync is
2625          *                          false and skips the page.
2626          *
2627          *                      2.4 Guest accesses GVA X.
2628          *                          Since the mapping in the SP was not updated,
2629          *                          the old mapping for GVA X incorrectly
2630          *                          gets used.
2631          * 1.1 Host marks SP
2632          *     as unsync
2633          *     (sp->unsync = true)
2634          *
2635          * The write barrier below ensures that 1.1 happens before 1.2 and thus
2636          * the situation in 2.4 does not arise.  It pairs with the read barrier
2637          * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
2638          */
2639         smp_wmb();
2640
2641         return 0;
2642 }
2643
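/*
 * Create or update the leaf SPTE for the given gfn/pfn.  Returns a RET_PF_*
 * value: RET_PF_FIXED on success, RET_PF_SPURIOUS if the SPTE was already
 * up to date, and RET_PF_EMULATE for MMIO or for write faults that hit a
 * write-protected gfn.  New mappings are added to the rmap and accounted in
 * the page stats.
 */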
2644 static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
2645                         u64 *sptep, unsigned int pte_access, gfn_t gfn,
2646                         kvm_pfn_t pfn, struct kvm_page_fault *fault)
2647 {
2648         struct kvm_mmu_page *sp = sptep_to_sp(sptep);
2649         int level = sp->role.level;
2650         int was_rmapped = 0;
2651         int ret = RET_PF_FIXED;
2652         bool flush = false;
2653         bool wrprot;
2654         u64 spte;
2655
2656         /* Prefetching always gets a writable pfn.  */
2657         bool host_writable = !fault || fault->map_writable;
2658         bool prefetch = !fault || fault->prefetch;
2659         bool write_fault = fault && fault->write;
2660
2661         pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
2662                  *sptep, write_fault, gfn);
2663
2664         if (unlikely(is_noslot_pfn(pfn))) {
2665                 mark_mmio_spte(vcpu, sptep, gfn, pte_access);
2666                 return RET_PF_EMULATE;
2667         }
2668
2669         if (is_shadow_present_pte(*sptep)) {
2670                 /*
2671                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2672                  * the parent of the now unreachable PTE.
2673                  */
2674                 if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
2675                         struct kvm_mmu_page *child;
2676                         u64 pte = *sptep;
2677
2678                         child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
2679                         drop_parent_pte(child, sptep);
2680                         flush = true;
2681                 } else if (pfn != spte_to_pfn(*sptep)) {
2682                         pgprintk("hfn old %llx new %llx\n",
2683                                  spte_to_pfn(*sptep), pfn);
2684                         drop_spte(vcpu->kvm, sptep);
2685                         flush = true;
2686                 } else
2687                         was_rmapped = 1;
2688         }
2689
2690         wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
2691                            true, host_writable, &spte);
2692
2693         if (*sptep == spte) {
2694                 ret = RET_PF_SPURIOUS;
2695         } else {
2696                 flush |= mmu_spte_update(sptep, spte);
2697                 trace_kvm_mmu_set_spte(level, gfn, sptep);
2698         }
2699
2700         if (wrprot) {
2701                 if (write_fault)
2702                         ret = RET_PF_EMULATE;
2703         }
2704
2705         if (flush)
2706                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
2707                                 KVM_PAGES_PER_HPAGE(level));
2708
2709         pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2710
2711         if (!was_rmapped) {
2712                 WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
2713                 kvm_update_page_stats(vcpu->kvm, level, 1);
2714                 rmap_add(vcpu, slot, sptep, gfn);
2715         }
2716
2717         return ret;
2718 }
2719
2720 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2721                                     struct kvm_mmu_page *sp,
2722                                     u64 *start, u64 *end)
2723 {
2724         struct page *pages[PTE_PREFETCH_NUM];
2725         struct kvm_memory_slot *slot;
2726         unsigned int access = sp->role.access;
2727         int i, ret;
2728         gfn_t gfn;
2729
2730         gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2731         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
2732         if (!slot)
2733                 return -1;
2734
2735         ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
2736         if (ret <= 0)
2737                 return -1;
2738
2739         for (i = 0; i < ret; i++, gfn++, start++) {
2740                 mmu_set_spte(vcpu, slot, start, access, gfn,
2741                              page_to_pfn(pages[i]), NULL);
2742                 put_page(pages[i]);
2743         }
2744
2745         return 0;
2746 }
2747
2748 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2749                                   struct kvm_mmu_page *sp, u64 *sptep)
2750 {
2751         u64 *spte, *start = NULL;
2752         int i;
2753
2754         WARN_ON(!sp->role.direct);
2755
2756         i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2757         spte = sp->spt + i;
2758
2759         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2760                 if (is_shadow_present_pte(*spte) || spte == sptep) {
2761                         if (!start)
2762                                 continue;
2763                         if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2764                                 return;
2765                         start = NULL;
2766                 } else if (!start)
2767                         start = spte;
2768         }
2769         if (start)
2770                 direct_pte_prefetch_many(vcpu, sp, start, spte);
2771 }
2772
2773 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2774 {
2775         struct kvm_mmu_page *sp;
2776
2777         sp = sptep_to_sp(sptep);
2778
2779         /*
2780          * Without accessed bits, there's no way to distinguish between
2781          * actually accessed translations and prefetched, so disable pte
2782          * prefetch if accessed bits aren't available.
2783          */
2784         if (sp_ad_disabled(sp))
2785                 return;
2786
2787         if (sp->role.level > PG_LEVEL_4K)
2788                 return;
2789
2790         /*
2791          * If addresses are being invalidated, skip prefetching to avoid
2792          * accidentally prefetching those addresses.
2793          */
2794         if (unlikely(vcpu->kvm->mmu_notifier_count))
2795                 return;
2796
2797         __direct_pte_prefetch(vcpu, sp, sptep);
2798 }
2799
2800 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
2801                                   const struct kvm_memory_slot *slot)
2802 {
2803         unsigned long hva;
2804         unsigned long flags;
2805         int level = PG_LEVEL_4K;
2806         pgd_t pgd;
2807         p4d_t p4d;
2808         pud_t pud;
2809         pmd_t pmd;
2810
2811         if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
2812                 return PG_LEVEL_4K;
2813
2814         /*
2815          * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
2816          * is not solely for performance, it's also necessary to avoid the
2817          * "writable" check in __gfn_to_hva_many(), which will always fail on
2818          * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
2819          * page fault steps have already verified the guest isn't writing a
2820          * read-only memslot.
2821          */
2822         hva = __gfn_to_hva_memslot(slot, gfn);
2823
2824         /*
2825          * Lookup the mapping level in the current mm.  The information
2826          * may become stale soon, but it is safe to use as long as
2827          * 1) mmu_notifier_retry was checked after taking mmu_lock, and
2828          * 2) mmu_lock is taken now.
2829          *
2830          * We still need to disable IRQs to prevent concurrent tear down
2831          * of page tables.
2832          */
2833         local_irq_save(flags);
2834
2835         pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
2836         if (pgd_none(pgd))
2837                 goto out;
2838
2839         p4d = READ_ONCE(*p4d_offset(&pgd, hva));
2840         if (p4d_none(p4d) || !p4d_present(p4d))
2841                 goto out;
2842
2843         pud = READ_ONCE(*pud_offset(&p4d, hva));
2844         if (pud_none(pud) || !pud_present(pud))
2845                 goto out;
2846
2847         if (pud_large(pud)) {
2848                 level = PG_LEVEL_1G;
2849                 goto out;
2850         }
2851
2852         pmd = READ_ONCE(*pmd_offset(&pud, hva));
2853         if (pmd_none(pmd) || !pmd_present(pmd))
2854                 goto out;
2855
2856         if (pmd_large(pmd))
2857                 level = PG_LEVEL_2M;
2858
2859 out:
2860         local_irq_restore(flags);
2861         return level;
2862 }
2863
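/*
 * Return the maximum level at which the gfn can be mapped, bounded by the
 * requested max_level, the global huge page limit, the memslot's
 * disallow_lpage tracking and the level at which the host maps the pfn.
 */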
2864 int kvm_mmu_max_mapping_level(struct kvm *kvm,
2865                               const struct kvm_memory_slot *slot, gfn_t gfn,
2866                               kvm_pfn_t pfn, int max_level)
2867 {
2868         struct kvm_lpage_info *linfo;
2869         int host_level;
2870
2871         max_level = min(max_level, max_huge_page_level);
2872         for ( ; max_level > PG_LEVEL_4K; max_level--) {
2873                 linfo = lpage_info_slot(gfn, slot, max_level);
2874                 if (!linfo->disallow_lpage)
2875                         break;
2876         }
2877
2878         if (max_level == PG_LEVEL_4K)
2879                 return PG_LEVEL_4K;
2880
2881         host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
2882         return min(host_level, max_level);
2883 }
2884
2885 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
2886 {
2887         struct kvm_memory_slot *slot = fault->slot;
2888         kvm_pfn_t mask;
2889
2890         fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
2891
2892         if (unlikely(fault->max_level == PG_LEVEL_4K))
2893                 return;
2894
2895         if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
2896                 return;
2897
2898         if (kvm_slot_dirty_track_enabled(slot))
2899                 return;
2900
2901         /*
2902          * Enforce the iTLB multihit workaround after capturing the requested
2903          * level, which will be used to do precise, accurate accounting.
2904          */
2905         fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
2906                                                      fault->gfn, fault->pfn,
2907                                                      fault->max_level);
2908         if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
2909                 return;
2910
2911         /*
2912          * mmu_notifier_retry() was successful and mmu_lock is held, so
2913          * the pmd can't be split from under us.
2914          */
2915         fault->goal_level = fault->req_level;
2916         mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
2917         VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
2918         fault->pfn &= ~mask;
2919 }
2920
2921 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
2922 {
2923         if (cur_level > PG_LEVEL_4K &&
2924             cur_level == fault->goal_level &&
2925             is_shadow_present_pte(spte) &&
2926             !is_large_pte(spte)) {
2927                 /*
2928                  * A small SPTE exists for this pfn, but FNAME(fetch)
2929                  * and __direct_map would like to create a large PTE
2930                  * instead: just force them to go down another level,
2931                  * patching the next 9 bits of the address back into
2932                  * pfn for them.
2933                  */
2934                 u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
2935                                 KVM_PAGES_PER_HPAGE(cur_level - 1);
2936                 fault->pfn |= fault->gfn & page_mask;
2937                 fault->goal_level--;
2938         }
2939 }
2940
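/*
 * Walk the shadow page tables for the faulting address, allocating and
 * linking non-leaf shadow pages as needed, then install the leaf SPTE at
 * fault->goal_level.  The hugepage level is adjusted up front and demoted
 * on the way down if an NX huge page must not be created.
 */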
2941 static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
2942 {
2943         struct kvm_shadow_walk_iterator it;
2944         struct kvm_mmu_page *sp;
2945         int ret;
2946         gfn_t base_gfn = fault->gfn;
2947
2948         kvm_mmu_hugepage_adjust(vcpu, fault);
2949
2950         trace_kvm_mmu_spte_requested(fault);
2951         for_each_shadow_entry(vcpu, fault->addr, it) {
2952                 /*
2953                  * We cannot overwrite existing page tables with an NX
2954                  * large page, as the leaf could be executable.
2955                  */
2956                 if (fault->nx_huge_page_workaround_enabled)
2957                         disallowed_hugepage_adjust(fault, *it.sptep, it.level);
2958
2959                 base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
2960                 if (it.level == fault->goal_level)
2961                         break;
2962
2963                 drop_large_spte(vcpu, it.sptep);
2964                 if (is_shadow_present_pte(*it.sptep))
2965                         continue;
2966
2967                 sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
2968                                       it.level - 1, true, ACC_ALL);
2969
2970                 link_shadow_page(vcpu, it.sptep, sp);
2971                 if (fault->is_tdp && fault->huge_page_disallowed &&
2972                     fault->req_level >= it.level)
2973                         account_huge_nx_page(vcpu->kvm, sp);
2974         }
2975
2976         if (WARN_ON_ONCE(it.level != fault->goal_level))
2977                 return -EFAULT;
2978
2979         ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
2980                            base_gfn, fault->pfn, fault);
2981         if (ret == RET_PF_SPURIOUS)
2982                 return ret;
2983
2984         direct_pte_prefetch(vcpu, it.sptep);
2985         ++vcpu->stat.pf_fixed;
2986         return ret;
2987 }
2988
2989 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2990 {
2991         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
2992 }
2993
2994 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
2995 {
2996         /*
2997          * Do not cache the mmio info caused by writing the readonly gfn
2998          * into the spte, otherwise read access on the readonly gfn can
2999          * also cause an mmio page fault and be treated as mmio access.
3000          */
3001         if (pfn == KVM_PFN_ERR_RO_FAULT)
3002                 return RET_PF_EMULATE;
3003
3004         if (pfn == KVM_PFN_ERR_HWPOISON) {
3005                 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3006                 return RET_PF_RETRY;
3007         }
3008
3009         return -EFAULT;
3010 }
3011
3012 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3013                                 unsigned int access, int *ret_val)
3014 {
3015         /* The pfn is invalid, report the error! */
3016         if (unlikely(is_error_pfn(fault->pfn))) {
3017                 *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
3018                 return true;
3019         }
3020
3021         if (unlikely(!fault->slot)) {
3022                 gva_t gva = fault->is_tdp ? 0 : fault->addr;
3023
3024                 vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
3025                                      access & shadow_mmio_access_mask);
3026                 /*
3027                  * If MMIO caching is disabled, emulate immediately without
3028                  * touching the shadow page tables as attempting to install an
3029                  * MMIO SPTE will just be an expensive nop.  Do not cache MMIO
3030                  * whose gfn is greater than host.MAXPHYADDR, as any guest that
3031                  * generates such gfns is running nested and is being tricked
3032                  * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
3033                  * and only if L1's MAXPHYADDR is inaccurate with respect to
3034                  * the hardware's).
3035                  */
3036                 if (unlikely(!enable_mmio_caching) ||
3037                     unlikely(fault->gfn > kvm_mmu_max_gfn())) {
3038                         *ret_val = RET_PF_EMULATE;
3039                         return true;
3040                 }
3041         }
3042
3043         return false;
3044 }
3045
3046 static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
3047 {
3048         /*
3049          * Do not fix an mmio spte with an invalid generation number; it
3050          * needs to be updated by the slow page fault path.
3051          */
3052         if (fault->rsvd)
3053                 return false;
3054
3055         /* See if the page fault is due to an NX violation */
3056         if (unlikely(fault->exec && fault->present))
3057                 return false;
3058
3059         /*
3060          * #PF can be fast if:
3061          * 1. The shadow page table entry is not present, which could mean that
3062          *    the fault is potentially caused by access tracking (if enabled).
3063          * 2. The shadow page table entry is present and the fault is
3064          *    caused by write-protect, which means we just need to change the
3065          *    W bit of the spte, which can be done out of mmu-lock.
3066          *
3067          * However, if access tracking is disabled we know that a non-present
3068          * page must be a genuine page fault where we have to create a new SPTE.
3069          * So, if access tracking is disabled, we return true only for write
3070          * accesses to a present page.
3071          */
3072
3073         return shadow_acc_track_mask != 0 || (fault->write && fault->present);
3074 }
3075
3076 /*
3077  * Returns true if the SPTE was fixed successfully. Otherwise,
3078  * someone else modified the SPTE from its original value.
3079  */
3080 static bool
3081 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3082                         u64 *sptep, u64 old_spte, u64 new_spte)
3083 {
3084         /*
3085          * Theoretically we could also set dirty bit (and flush TLB) here in
3086          * order to eliminate unnecessary PML logging. See comments in
3087          * set_spte. But fast_page_fault is very unlikely to happen with PML
3088          * enabled, so we do not do this. This might result in the same GPA
3089          * being logged in the PML buffer again when the write really happens,
3090          * and mark_page_dirty eventually being called twice for it, but that
3091          * is harmless. This also avoids the TLB flush needed after setting the
3092          * dirty bit, so non-PML cases won't be impacted.
3093          *
3094          * Compare with set_spte where instead shadow_dirty_mask is set.
3095          */
3096         if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3097                 return false;
3098
3099         if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
3100                 mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
3101
3102         return true;
3103 }
3104
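/*
 * Check whether the SPTE already grants the access that triggered the
 * fault: execute, write, or simply present for a read access.
 */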
3105 static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
3106 {
3107         if (fault->exec)
3108                 return is_executable_pte(spte);
3109
3110         if (fault->write)
3111                 return is_writable_pte(spte);
3112
3113         /* Fault was on Read access */
3114         return spte & PT_PRESENT_MASK;
3115 }
3116
3117 /*
3118  * Returns the last level spte pointer of the shadow page walk for the given
3119  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
3120  * walk could be performed, returns NULL and *spte does not contain valid data.
3121  *
3122  * Contract:
3123  *  - Must be called between walk_shadow_page_lockless_{begin,end}.
3124  *  - The returned sptep must not be used after walk_shadow_page_lockless_end.
3125  */
3126 static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
3127 {
3128         struct kvm_shadow_walk_iterator iterator;
3129         u64 old_spte;
3130         u64 *sptep = NULL;
3131
3132         for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
3133                 sptep = iterator.sptep;
3134                 *spte = old_spte;
3135         }
3136
3137         return sptep;
3138 }
3139
3140 /*
3141  * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
3142  */
3143 static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3144 {
3145         struct kvm_mmu_page *sp;
3146         int ret = RET_PF_INVALID;
3147         u64 spte = 0ull;
3148         u64 *sptep = NULL;
3149         uint retry_count = 0;
3150
3151         if (!page_fault_can_be_fast(fault))
3152                 return ret;
3153
3154         walk_shadow_page_lockless_begin(vcpu);
3155
3156         do {
3157                 u64 new_spte;
3158
3159                 if (is_tdp_mmu(vcpu->arch.mmu))
3160                         sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3161                 else
3162                         sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3163
3164                 if (!is_shadow_present_pte(spte))
3165                         break;
3166
3167                 sp = sptep_to_sp(sptep);
3168                 if (!is_last_spte(spte, sp->role.level))
3169                         break;
3170
3171                 /*
3172                  * Check whether the memory access that caused the fault would
3173                  * still cause it if it were to be performed right now. If not,
3174                  * then this is a spurious fault caused by TLB lazily flushed,
3175                  * or some other CPU has already fixed the PTE after the
3176                  * current CPU took the fault.
3177                  *
3178                  * Need not check the access of upper level table entries since
3179                  * they are always ACC_ALL.
3180                  */
3181                 if (is_access_allowed(fault, spte)) {
3182                         ret = RET_PF_SPURIOUS;
3183                         break;
3184                 }
3185
3186                 new_spte = spte;
3187
3188                 if (is_access_track_spte(spte))
3189                         new_spte = restore_acc_track_spte(new_spte);
3190
3191                 /*
3192                  * Currently, to simplify the code, write-protection can
3193                  * be removed in the fast path only if the SPTE was
3194                  * write-protected for dirty-logging or access tracking.
3195                  */
3196                 if (fault->write &&
3197                     spte_can_locklessly_be_made_writable(spte)) {
3198                         new_spte |= PT_WRITABLE_MASK;
3199
3200                         /*
3201                          * Do not fix write-permission on the large spte when
3202                          * dirty logging is enabled. Since fast_pf_fix_direct_spte()
3203                          * only marks the first page in the dirty bitmap, the
3204                          * other pages covered by the large spte would be
3205                          * missed.
3206                          *
3207                          * Instead, we let the slow page fault path create a
3208                          * normal spte to fix the access.
3209                          */
3210                         if (sp->role.level > PG_LEVEL_4K &&
3211                             kvm_slot_dirty_track_enabled(fault->slot))
3212                                 break;
3213                 }
3214
3215                 /* Verify that the fault can be handled in the fast path */
3216                 if (new_spte == spte ||
3217                     !is_access_allowed(fault, new_spte))
3218                         break;
3219
3220                 /*
3221                  * Currently, fast page fault only works for direct mapping
3222                  * since the gfn is not stable for indirect shadow page. See
3223                  * Documentation/virt/kvm/locking.rst to get more detail.
3224                  */
3225                 if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
3226                         ret = RET_PF_FIXED;
3227                         break;
3228                 }
3229
3230                 if (++retry_count > 4) {
3231                         printk_once(KERN_WARNING
3232                                 "kvm: Fast #PF retrying more than 4 times.\n");
3233                         break;
3234                 }
3235
3236         } while (true);
3237
3238         trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
3239         walk_shadow_page_lockless_end(vcpu);
3240
3241         return ret;
3242 }
3243
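/*
 * Drop a reference on the root pointed at by *root_hpa and invalidate it.
 * TDP MMU roots are put directly; a shadow MMU root is queued for zapping
 * on @invalid_list once its last reference is dropped and it has been
 * marked invalid.
 */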
3244 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3245                                struct list_head *invalid_list)
3246 {
3247         struct kvm_mmu_page *sp;
3248
3249         if (!VALID_PAGE(*root_hpa))
3250                 return;
3251
3252         sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
3253         if (WARN_ON(!sp))
3254                 return;
3255
3256         if (is_tdp_mmu_page(sp))
3257                 kvm_tdp_mmu_put_root(kvm, sp, false);
3258         else if (!--sp->root_count && sp->role.invalid)
3259                 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3260
3261         *root_hpa = INVALID_PAGE;
3262 }
3263
3264 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3265 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
3266                         ulong roots_to_free)
3267 {
3268         int i;
3269         LIST_HEAD(invalid_list);
3270         bool free_active_root;
3271
3272         BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3273
3274         /* Before acquiring the MMU lock, see if we need to do any real work. */
3275         free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
3276                 && VALID_PAGE(mmu->root.hpa);
3277
3278         if (!free_active_root) {
3279                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3280                         if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3281                             VALID_PAGE(mmu->prev_roots[i].hpa))
3282                                 break;
3283
3284                 if (i == KVM_MMU_NUM_PREV_ROOTS)
3285                         return;
3286         }
3287
3288         write_lock(&kvm->mmu_lock);
3289
3290         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3291                 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3292                         mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
3293                                            &invalid_list);
3294
3295         if (free_active_root) {
3296                 if (to_shadow_page(mmu->root.hpa)) {
3297                         mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
3298                 } else if (mmu->pae_root) {
3299                         for (i = 0; i < 4; ++i) {
3300                                 if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
3301                                         continue;
3302
3303                                 mmu_free_root_page(kvm, &mmu->pae_root[i],
3304                                                    &invalid_list);
3305                                 mmu->pae_root[i] = INVALID_PAE_ROOT;
3306                         }
3307                 }
3308                 mmu->root.hpa = INVALID_PAGE;
3309                 mmu->root.pgd = 0;
3310         }
3311
3312         kvm_mmu_commit_zap_page(kvm, &invalid_list);
3313         write_unlock(&kvm->mmu_lock);
3314 }
3315 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3316
3317 void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
3318 {
3319         unsigned long roots_to_free = 0;
3320         hpa_t root_hpa;
3321         int i;
3322
3323         /*
3324          * This should not be called while L2 is active; L2 can't invalidate
3325          * _only_ its own roots, e.g. INVVPID unconditionally exits.
3326          */
3327         WARN_ON_ONCE(mmu->root_role.guest_mode);
3328
3329         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
3330                 root_hpa = mmu->prev_roots[i].hpa;
3331                 if (!VALID_PAGE(root_hpa))
3332                         continue;
3333
3334                 if (!to_shadow_page(root_hpa) ||
3335                         to_shadow_page(root_hpa)->role.guest_mode)
3336                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3337         }
3338
3339         kvm_mmu_free_roots(kvm, mmu, roots_to_free);
3340 }
3341 EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
3342
3343
3344 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3345 {
3346         int ret = 0;
3347
3348         if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
3349                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3350                 ret = 1;
3351         }
3352
3353         return ret;
3354 }
3355
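/*
 * Get (or create) the shadow page that will back a root and take a
 * reference on it so that it can't be zapped while it is in use as a root.
 */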
3356 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
3357                             u8 level, bool direct)
3358 {
3359         struct kvm_mmu_page *sp;
3360
3361         sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
3362         ++sp->root_count;
3363
3364         return __pa(sp->spt);
3365 }
3366
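/*
 * Allocate the root(s) for a direct MMU: a TDP MMU root, a single shadow
 * root for a 4- or 5-level shadow page table, or four PAE roots when the
 * root level is PT32E_ROOT_LEVEL.
 */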
3367 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3368 {
3369         struct kvm_mmu *mmu = vcpu->arch.mmu;
3370         u8 shadow_root_level = mmu->root_role.level;
3371         hpa_t root;
3372         unsigned i;
3373         int r;
3374
3375         write_lock(&vcpu->kvm->mmu_lock);
3376         r = make_mmu_pages_available(vcpu);
3377         if (r < 0)
3378                 goto out_unlock;
3379
3380         if (is_tdp_mmu_enabled(vcpu->kvm)) {
3381                 root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
3382                 mmu->root.hpa = root;
3383         } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
3384                 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
3385                 mmu->root.hpa = root;
3386         } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
3387                 if (WARN_ON_ONCE(!mmu->pae_root)) {
3388                         r = -EIO;
3389                         goto out_unlock;
3390                 }
3391
3392                 for (i = 0; i < 4; ++i) {
3393                         WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3394
3395                         root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
3396                                               i << 30, PT32_ROOT_LEVEL, true);
3397                         mmu->pae_root[i] = root | PT_PRESENT_MASK |
3398                                            shadow_me_mask;
3399                 }
3400                 mmu->root.hpa = __pa(mmu->pae_root);
3401         } else {
3402                 WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
3403                 r = -EIO;
3404                 goto out_unlock;
3405         }
3406
3407         /* root.pgd is ignored for direct MMUs. */
3408         mmu->root.pgd = 0;
3409 out_unlock:
3410         write_unlock(&vcpu->kvm->mmu_lock);
3411         return r;
3412 }
3413
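/*
 * Allocate the per-memslot metadata (rmaps and write-tracking counts) that
 * shadow paging needs, if this is the first shadow root created for the VM.
 */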
3414 static int mmu_first_shadow_root_alloc(struct kvm *kvm)
3415 {
3416         struct kvm_memslots *slots;
3417         struct kvm_memory_slot *slot;
3418         int r = 0, i, bkt;
3419
3420         /*
3421          * Check if this is the first shadow root being allocated before
3422          * taking the lock.
3423          */
3424         if (kvm_shadow_root_allocated(kvm))
3425                 return 0;
3426
3427         mutex_lock(&kvm->slots_arch_lock);
3428
3429         /* Recheck, under the lock, whether this is the first shadow root. */
3430         if (kvm_shadow_root_allocated(kvm))
3431                 goto out_unlock;
3432
3433         /*
3434          * Check if anything actually needs to be allocated, e.g. all metadata
3435          * will be allocated upfront if TDP is disabled.
3436          */
3437         if (kvm_memslots_have_rmaps(kvm) &&
3438             kvm_page_track_write_tracking_enabled(kvm))
3439                 goto out_success;
3440
3441         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
3442                 slots = __kvm_memslots(kvm, i);
3443                 kvm_for_each_memslot(slot, bkt, slots) {
3444                         /*
3445                          * Both of these functions are no-ops if the target is
3446                          * already allocated, so unconditionally calling both
3447                          * is safe.  Intentionally do NOT free allocations on
3448                          * failure to avoid having to track which allocations
3449                          * were made now versus when the memslot was created.
3450                          * The metadata is guaranteed to be freed when the slot
3451                          * is freed, and will be kept/used if userspace retries
3452                          * KVM_RUN instead of killing the VM.
3453                          */
3454                         r = memslot_rmap_alloc(slot, slot->npages);
3455                         if (r)
3456                                 goto out_unlock;
3457                         r = kvm_page_track_write_tracking_alloc(slot);
3458                         if (r)
3459                                 goto out_unlock;
3460                 }
3461         }
3462
3463         /*
3464          * Ensure that shadow_root_allocated becomes true strictly after
3465          * all the related pointers are set.
3466          */
3467 out_success:
3468         smp_store_release(&kvm->arch.shadow_root_allocated, true);
3469
3470 out_unlock:
3471         mutex_unlock(&kvm->slots_arch_lock);
3472         return r;
3473 }
3474
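/*
 * Allocate the root(s) needed to shadow the guest's page tables: a single
 * root when the guest uses 4- or 5-level paging, or four PAE roots (plus
 * dummy PML4/PML5 roots as needed) when the guest uses 32-bit or PAE
 * paging.
 */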
3475 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3476 {
3477         struct kvm_mmu *mmu = vcpu->arch.mmu;
3478         u64 pdptrs[4], pm_mask;
3479         gfn_t root_gfn, root_pgd;
3480         hpa_t root;
3481         unsigned i;
3482         int r;
3483
3484         root_pgd = mmu->get_guest_pgd(vcpu);
3485         root_gfn = root_pgd >> PAGE_SHIFT;
3486
3487         if (mmu_check_root(vcpu, root_gfn))
3488                 return 1;
3489
3490         /*
3491          * On SVM, reading PDPTRs might access guest memory, which might fault
3492          * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
3493          */
3494         if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3495                 for (i = 0; i < 4; ++i) {
3496                         pdptrs[i] = mmu->get_pdptr(vcpu, i);
3497                         if (!(pdptrs[i] & PT_PRESENT_MASK))
3498                                 continue;
3499
3500                         if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
3501                                 return 1;
3502                 }
3503         }
3504
3505         r = mmu_first_shadow_root_alloc(vcpu->kvm);
3506         if (r)
3507                 return r;
3508
3509         write_lock(&vcpu->kvm->mmu_lock);
3510         r = make_mmu_pages_available(vcpu);
3511         if (r < 0)
3512                 goto out_unlock;
3513
3514         /*
3515          * Do we shadow a long mode page table? If so we need to
3516          * write-protect the guest's page table root.
3517          */
3518         if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
3519                 root = mmu_alloc_root(vcpu, root_gfn, 0,
3520                                       mmu->root_role.level, false);
3521                 mmu->root.hpa = root;
3522                 goto set_root_pgd;
3523         }
3524
3525         if (WARN_ON_ONCE(!mmu->pae_root)) {
3526                 r = -EIO;
3527                 goto out_unlock;
3528         }
3529
3530         /*
3531          * We shadow a 32-bit page table. This may be a legacy 2-level
3532          * or a PAE 3-level page table. In either case we need to be aware that
3533          * the shadow page table may be a PAE or a long mode page table.
3534          */
3535         pm_mask = PT_PRESENT_MASK | shadow_me_mask;
3536         if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
3537                 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3538
3539                 if (WARN_ON_ONCE(!mmu->pml4_root)) {
3540                         r = -EIO;
3541                         goto out_unlock;
3542                 }
3543                 mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
3544
3545                 if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
3546                         if (WARN_ON_ONCE(!mmu->pml5_root)) {
3547                                 r = -EIO;
3548                                 goto out_unlock;
3549                         }
3550                         mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
3551                 }
3552         }
3553
3554         for (i = 0; i < 4; ++i) {
3555                 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3556
3557                 if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3558                         if (!(pdptrs[i] & PT_PRESENT_MASK)) {
3559                                 mmu->pae_root[i] = INVALID_PAE_ROOT;
3560                                 continue;
3561                         }
3562                         root_gfn = pdptrs[i] >> PAGE_SHIFT;
3563                 }
3564
3565                 root = mmu_alloc_root(vcpu, root_gfn, i << 30,
3566                                       PT32_ROOT_LEVEL, false);
3567                 mmu->pae_root[i] = root | pm_mask;
3568         }
3569
3570         if (mmu->root_role.level == PT64_ROOT_5LEVEL)
3571                 mmu->root.hpa = __pa(mmu->pml5_root);
3572         else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
3573                 mmu->root.hpa = __pa(mmu->pml4_root);
3574         else
3575                 mmu->root.hpa = __pa(mmu->pae_root);
3576
3577 set_root_pgd:
3578         mmu->root.pgd = root_pgd;
3579 out_unlock:
3580         write_unlock(&vcpu->kvm->mmu_lock);
3581
3582         return r;
3583 }
3584
3585 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
3586 {
3587         struct kvm_mmu *mmu = vcpu->arch.mmu;
3588         bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
3589         u64 *pml5_root = NULL;
3590         u64 *pml4_root = NULL;
3591         u64 *pae_root;
3592
3593         /*
3594          * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
3595          * tables are allocated and initialized at root creation as there is no
3596          * equivalent level in the guest's NPT to shadow.  Allocate the tables
3597          * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
3598          */
3599         if (mmu->direct_map || mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
3600             mmu->root_role.level < PT64_ROOT_4LEVEL)
3601                 return 0;
3602
3603         /*
3604          * NPT, the only paging mode that uses this horror, uses a fixed number
3605          * of levels for the shadow page tables, e.g. all MMUs are 4-level or
3606          * all MMus are 5-level.  Thus, this can safely require that pml5_root
3607  * all MMUs are 5-level.  Thus, this can safely require that pml5_root
3608          * prior MMU would also have required pml5.
3609          */
3610         if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
3611                 return 0;
3612
3613         /*
3614          * The special roots should always be allocated in concert.  Yell and
3615          * bail if KVM ends up in a state where only one of the roots is valid.
3616          */
3617         if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
3618                          (need_pml5 && mmu->pml5_root)))
3619                 return -EIO;
3620
3621         /*
3622          * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
3623          * doesn't need to be decrypted.
3624          */
3625         pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3626         if (!pae_root)
3627                 return -ENOMEM;
3628
3629 #ifdef CONFIG_X86_64
3630         pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3631         if (!pml4_root)
3632                 goto err_pml4;
3633
3634         if (need_pml5) {
3635                 pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3636                 if (!pml5_root)
3637                         goto err_pml5;
3638         }
3639 #endif
3640
3641         mmu->pae_root = pae_root;
3642         mmu->pml4_root = pml4_root;
3643         mmu->pml5_root = pml5_root;
3644
3645         return 0;
3646
3647 #ifdef CONFIG_X86_64
3648 err_pml5:
3649         free_page((unsigned long)pml4_root);
3650 err_pml4:
3651         free_page((unsigned long)pae_root);
3652         return -ENOMEM;
3653 #endif
3654 }
3655
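/*
 * Return true if the shadow page backing @root, or any of its children, is
 * marked unsync, i.e. the root may contain stale entries that need to be
 * synchronized with the guest's page tables.
 */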
3656 static bool is_unsync_root(hpa_t root)
3657 {
3658         struct kvm_mmu_page *sp;
3659
3660         if (!VALID_PAGE(root))
3661                 return false;
3662
3663         /*
3664          * The read barrier orders the CPU's read of SPTE.W during the page table
3665          * walk before the reads of sp->unsync/sp->unsync_children here.
3666          *
3667          * Even if another CPU was marking the SP as unsync-ed simultaneously,
3668          * any guest page table changes are not guaranteed to be visible anyway
3669          * until this VCPU issues a TLB flush strictly after those changes are
3670          * made.  We only need to ensure that the other CPU sets these flags
3671          * before any actual changes to the page tables are made.  The comments
3672          * in mmu_try_to_unsync_pages() describe what could go wrong if this
3673          * requirement isn't satisfied.
3674          */
3675         smp_rmb();
3676         sp = to_shadow_page(root);
3677
3678         /*
3679          * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
3680          * PDPTEs for a given PAE root need to be synchronized individually.
3681          */
3682         if (WARN_ON_ONCE(!sp))
3683                 return false;
3684
3685         if (sp->unsync || sp->unsync_children)
3686                 return true;
3687
3688         return false;
3689 }
3690
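/*
 * Synchronize the current root's unsync shadow pages with the guest's page
 * tables.  A 64-bit root is synced starting from its backing shadow page;
 * for PAE roots, each of the four roots is synced individually.
 */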
3691 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3692 {
3693         int i;
3694         struct kvm_mmu_page *sp;
3695
3696         if (vcpu->arch.mmu->direct_map)
3697                 return;
3698
3699         if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
3700                 return;
3701
3702         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3703
3704         if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
3705                 hpa_t root = vcpu->arch.mmu->root.hpa;
3706                 sp = to_shadow_page(root);
3707
3708                 if (!is_unsync_root(root))
3709                         return;
3710
3711                 write_lock(&vcpu->kvm->mmu_lock);
3712                 mmu_sync_children(vcpu, sp, true);
3713                 write_unlock(&vcpu->kvm->mmu_lock);
3714                 return;
3715         }
3716
3717         write_lock(&vcpu->kvm->mmu_lock);
3718
3719         for (i = 0; i < 4; ++i) {
3720                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3721
3722                 if (IS_VALID_PAE_ROOT(root)) {
3723                         root &= PT64_BASE_ADDR_MASK;
3724                         sp = to_shadow_page(root);
3725                         mmu_sync_children(vcpu, sp, true);
3726                 }
3727         }
3728
3729         write_unlock(&vcpu->kvm->mmu_lock);
3730 }
3731
3732 void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
3733 {
3734         unsigned long roots_to_free = 0;
3735         int i;
3736
3737         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3738                 if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
3739                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3740
3741         /* sync prev_roots by simply freeing them */
3742         kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
3743 }
3744
3745 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3746                                   gpa_t vaddr, u64 access,
3747                                   struct x86_exception *exception)
3748 {
3749         if (exception)
3750                 exception->error_code = 0;
3751         return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
3752 }
3753
3754 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3755 {
3756         /*
3757          * A nested guest cannot use the MMIO cache if it is using nested
3758          * page tables, because cr2 is a nGPA while the cache stores GPAs.
3759          */
3760         if (mmu_is_nested(vcpu))
3761                 return false;
3762
3763         if (direct)
3764                 return vcpu_match_mmio_gpa(vcpu, addr);
3765
3766         return vcpu_match_mmio_gva(vcpu, addr);
3767 }
3768
3769 /*
3770  * Return the level of the lowest level SPTE added to sptes.
3771  * That SPTE may be non-present.
3772  *
3773  * Must be called between walk_shadow_page_lockless_{begin,end}.
3774  */
3775 static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
3776 {
3777         struct kvm_shadow_walk_iterator iterator;
3778         int leaf = -1;
3779         u64 spte;
3780
3781         for (shadow_walk_init(&iterator, vcpu, addr),
3782              *root_level = iterator.level;
3783              shadow_walk_okay(&iterator);
3784              __shadow_walk_next(&iterator, spte)) {
3785                 leaf = iterator.level;
3786                 spte = mmu_spte_get_lockless(iterator.sptep);
3787
3788                 sptes[leaf] = spte;
3789         }
3790
3791         return leaf;
3792 }
3793
3794 /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
3795 static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3796 {
3797         u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
3798         struct rsvd_bits_validate *rsvd_check;
3799         int root, leaf, level;
3800         bool reserved = false;
3801
3802         walk_shadow_page_lockless_begin(vcpu);
3803
3804         if (is_tdp_mmu(vcpu->arch.mmu))
3805                 leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
3806         else
3807                 leaf = get_walk(vcpu, addr, sptes, &root);
3808
3809         walk_shadow_page_lockless_end(vcpu);
3810
3811         if (unlikely(leaf < 0)) {
3812                 *sptep = 0ull;
3813                 return reserved;
3814         }
3815
3816         *sptep = sptes[leaf];
3817
3818         /*
3819          * Skip reserved bits checks on the terminal leaf if it's not a valid
3820          * SPTE.  Note, this also (intentionally) skips MMIO SPTEs, which, by
3821          * design, always have reserved bits set.  The purpose of the checks is
3822  * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
3823          */
3824         if (!is_shadow_present_pte(sptes[leaf]))
3825                 leaf++;
3826
3827         rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
3828
3829         for (level = root; level >= leaf; level--)
3830                 reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
3831
3832         if (reserved) {
3833                 pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
3834                        __func__, addr);
3835                 for (level = root; level >= leaf; level--)
3836                         pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
3837                                sptes[level], level,
3838                                get_rsvd_bits(rsvd_check, sptes[level], level));
3839         }
3840
3841         return reserved;
3842 }
3843
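/*
 * Handle a fault that may have been caused by an MMIO SPTE: emulate the
 * access if cached MMIO info or a valid MMIO SPTE is found, return
 * RET_PF_INVALID if the MMIO generation is stale, retry if the SPTE was
 * zapped, and fail with -EINVAL if reserved bits are set on a regular SPTE.
 */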
3844 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3845 {
3846         u64 spte;
3847         bool reserved;
3848
3849         if (mmio_info_in_cache(vcpu, addr, direct))
3850                 return RET_PF_EMULATE;
3851
3852         reserved = get_mmio_spte(vcpu, addr, &spte);
3853         if (WARN_ON(reserved))
3854                 return -EINVAL;
3855
3856         if (is_mmio_spte(spte)) {
3857                 gfn_t gfn = get_mmio_spte_gfn(spte);
3858                 unsigned int access = get_mmio_spte_access(spte);
3859
3860                 if (!check_mmio_spte(vcpu, spte))
3861                         return RET_PF_INVALID;
3862
3863                 if (direct)
3864                         addr = 0;
3865
3866                 trace_handle_mmio_page_fault(addr, gfn, access);
3867                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3868                 return RET_PF_EMULATE;
3869         }
3870
3871         /*
3872          * If the page table was zapped by another CPU, let the CPU fault
3873          * again on the address.
3874          */
3875         return RET_PF_RETRY;
3876 }
3877
3878 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
3879                                          struct kvm_page_fault *fault)
3880 {
3881         if (unlikely(fault->rsvd))
3882                 return false;
3883
3884         if (!fault->present || !fault->write)
3885                 return false;
3886
3887         /*
3888          * The guest is writing a page that is write-tracked, which cannot
3889          * be fixed by the page fault handler.
3890          */
3891         if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
3892                 return true;
3893
3894         return false;
3895 }
3896
3897 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
3898 {
3899         struct kvm_shadow_walk_iterator iterator;
3900         u64 spte;
3901
3902         walk_shadow_page_lockless_begin(vcpu);
3903         for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
3904                 clear_sp_write_flooding_count(iterator.sptep);
3905         walk_shadow_page_lockless_end(vcpu);
3906 }
3907
3908 static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
3909 {
3910         /* make sure the token value is not 0 */
3911         u32 id = vcpu->arch.apf.id;
3912
3913         if (id << 12 == 0)
3914                 vcpu->arch.apf.id = 1;
3915
3916         return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
3917 }
3918
3919 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
3920                                     gfn_t gfn)
3921 {
3922         struct kvm_arch_async_pf arch;
3923
3924         arch.token = alloc_apf_token(vcpu);
3925         arch.gfn = gfn;
3926         arch.direct_map = vcpu->arch.mmu->direct_map;
3927         arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
3928
3929         return kvm_setup_async_pf(vcpu, cr2_or_gpa,
3930                                   kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
3931 }
3932
3933 static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r)
3934 {
3935         struct kvm_memory_slot *slot = fault->slot;
3936         bool async;
3937
3938         /*
3939          * Retry the page fault if the gfn hit a memslot that is being deleted
3940          * or moved.  This ensures any existing SPTEs for the old memslot will
3941          * be zapped before KVM inserts a new MMIO SPTE for the gfn.
3942          */
3943         if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
3944                 goto out_retry;
3945
3946         if (!kvm_is_visible_memslot(slot)) {
3947                 /* Don't expose private memslots to L2. */
3948                 if (is_guest_mode(vcpu)) {
3949                         fault->slot = NULL;
3950                         fault->pfn = KVM_PFN_NOSLOT;
3951                         fault->map_writable = false;
3952                         return false;
3953                 }
3954                 /*
3955                  * If the APIC access page exists but is disabled, go directly
3956                  * to emulation without caching the MMIO access or creating a
3957                  * MMIO SPTE.  That way the cache doesn't need to be purged
3958                  * when the AVIC is re-enabled.
3959                  */
3960                 if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
3961                     !kvm_apicv_activated(vcpu->kvm)) {
3962                         *r = RET_PF_EMULATE;
3963                         return true;
3964                 }
3965         }
3966
3967         async = false;
3968         fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
3969                                           fault->write, &fault->map_writable,
3970                                           &fault->hva);
3971         if (!async)
3972                 return false; /* fault->pfn already holds the correct page */
3973
3974         if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
3975                 trace_kvm_try_async_get_page(fault->addr, fault->gfn);
3976                 if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
3977                         trace_kvm_async_pf_doublefault(fault->addr, fault->gfn);
3978                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
3979                         goto out_retry;
3980                 } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn))
3981                         goto out_retry;
3982         }
3983
3984         fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
3985                                           fault->write, &fault->map_writable,
3986                                           &fault->hva);
3987         return false;
3988
3989 out_retry:
3990         *r = RET_PF_RETRY;
3991         return true;
3992 }
3993
3994 /*
3995  * Returns true if the page fault is stale and needs to be retried, i.e. if the
3996  * root was invalidated by a memslot update or a relevant mmu_notifier fired.
3997  */
3998 static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
3999                                 struct kvm_page_fault *fault, int mmu_seq)
4000 {
4001         struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
4002
4003         /* Special roots, e.g. pae_root, are not backed by shadow pages. */
4004         if (sp && is_obsolete_sp(vcpu->kvm, sp))
4005                 return true;
4006
4007         /*
4008          * Roots without an associated shadow page are considered invalid if
4009          * there is a pending request to free obsolete roots.  The request is
4010          * only a hint that the current root _may_ be obsolete and needs to be
4011          * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
4012          * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
4013          * to reload even if no vCPU is actively using the root.
4014          */
4015         if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
4016                 return true;
4017
4018         return fault->slot &&
4019                mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
4020 }
4021
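/*
 * Common page fault handler for direct MMUs: try the lockless fast path,
 * then resolve the pfn and map it under the MMU lock (taken for read for
 * TDP MMU faults, for write otherwise).
 */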
4022 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4023 {
4024         bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
4025
4026         unsigned long mmu_seq;
4027         int r;
4028
4029         fault->gfn = fault->addr >> PAGE_SHIFT;
4030         fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
4031
4032         if (page_fault_handle_page_track(vcpu, fault))
4033                 return RET_PF_EMULATE;
4034
4035         r = fast_page_fault(vcpu, fault);
4036         if (r != RET_PF_INVALID)
4037                 return r;
4038
4039         r = mmu_topup_memory_caches(vcpu, false);
4040         if (r)
4041                 return r;
4042
4043         mmu_seq = vcpu->kvm->mmu_notifier_seq;
4044         smp_rmb();
4045
4046         if (kvm_faultin_pfn(vcpu, fault, &r))
4047                 return r;
4048
4049         if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r))
4050                 return r;
4051
4052         r = RET_PF_RETRY;
4053
4054         if (is_tdp_mmu_fault)
4055                 read_lock(&vcpu->kvm->mmu_lock);
4056         else
4057                 write_lock(&vcpu->kvm->mmu_lock);
4058
4059         if (is_page_fault_stale(vcpu, fault, mmu_seq))
4060                 goto out_unlock;
4061
4062         r = make_mmu_pages_available(vcpu);
4063         if (r)
4064                 goto out_unlock;
4065
4066         if (is_tdp_mmu_fault)
4067                 r = kvm_tdp_mmu_map(vcpu, fault);
4068         else
4069                 r = __direct_map(vcpu, fault);
4070
4071 out_unlock:
4072         if (is_tdp_mmu_fault)
4073                 read_unlock(&vcpu->kvm->mmu_lock);
4074         else
4075                 write_unlock(&vcpu->kvm->mmu_lock);
4076         kvm_release_pfn_clean(fault->pfn);
4077         return r;
4078 }
4079
4080 static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
4081                                 struct kvm_page_fault *fault)
4082 {
4083         pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
4084
4085         /* This path builds a PAE pagetable, so we can map 2MB pages at most. */
4086         fault->max_level = PG_LEVEL_2M;
4087         return direct_page_fault(vcpu, fault);
4088 }
4089
4090 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4091                                 u64 fault_address, char *insn, int insn_len)
4092 {
4093         int r = 1;
4094         u32 flags = vcpu->arch.apf.host_apf_flags;
4095
4096 #ifndef CONFIG_X86_64
4097         /* A 64-bit CR2 should be impossible on 32-bit KVM. */
4098         if (WARN_ON_ONCE(fault_address >> 32))
4099                 return -EFAULT;
4100 #endif
4101
4102         vcpu->arch.l1tf_flush_l1d = true;
4103         if (!flags) {
4104                 trace_kvm_page_fault(fault_address, error_code);
4105
4106                 if (kvm_event_needs_reinjection(vcpu))
4107                         kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4108                 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4109                                 insn_len);
4110         } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
4111                 vcpu->arch.apf.host_apf_flags = 0;
4112                 local_irq_disable();
4113                 kvm_async_pf_task_wait_schedule(fault_address);
4114                 local_irq_enable();
4115         } else {
4116                 WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
4117         }
4118
4119         return r;
4120 }
4121 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4122
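/*
 * TDP page fault handler: reduce the maximum mapping level until the guest
 * MTRRs are consistent across the whole range that would be mapped, then
 * hand off to the common direct page fault handler.
 */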
4123 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4124 {
4125         while (fault->max_level > PG_LEVEL_4K) {
4126                 int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
4127                 gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
4128
4129                 if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
4130                         break;
4131
4132                 --fault->max_level;
4133         }
4134
4135         return direct_page_fault(vcpu, fault);
4136 }
4137
4138 static void nonpaging_init_context(struct kvm_mmu *context)
4139 {
4140         context->page_fault = nonpaging_page_fault;
4141         context->gva_to_gpa = nonpaging_gva_to_gpa;
4142         context->sync_page = nonpaging_sync_page;
4143         context->invlpg = NULL;
4144         context->direct_map = true;
4145 }
4146
4147 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
4148                                   union kvm_mmu_page_role role)
4149 {
4150         return (role.direct || pgd == root->pgd) &&
4151                VALID_PAGE(root->hpa) &&
4152                role.word == to_shadow_page(root->hpa)->role.word;
4153 }
4154
4155 /*
4156  * Find out if a previously cached root matching the new pgd/role is available,
4157  * and insert the current root as the MRU in the cache.
4158  * If a matching root is found, it is assigned to kvm_mmu->root and
4159  * true is returned.
4160  * If no match is found, kvm_mmu->root is left invalid, the LRU root is
4161  * evicted to make room for the current root, and false is returned.
4162  */
4163 static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
4164                                               gpa_t new_pgd,
4165                                               union kvm_mmu_page_role new_role)
4166 {
4167         uint i;
4168
4169         if (is_root_usable(&mmu->root, new_pgd, new_role))
4170                 return true;
4171
4172         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4173                 /*
4174                  * The swaps end up rotating the cache like this:
4175                  *   C   0 1 2 3   (on entry to the function)
4176                  *   0   C 1 2 3
4177                  *   1   C 0 2 3
4178                  *   2   C 0 1 3
4179                  *   3   C 0 1 2   (on exit from the loop)
4180                  */
4181                 swap(mmu->root, mmu->prev_roots[i]);
4182                 if (is_root_usable(&mmu->root, new_pgd, new_role))
4183                         return true;
4184         }
4185
4186         kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4187         return false;
4188 }
4189
4190 /*
4191  * Find out if a previously cached root matching the new pgd/role is available.
4192  * On entry, mmu->root is invalid.
4193  * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
4194  * of the cache becomes invalid, and true is returned.
4195  * If no match is found, kvm_mmu->root is left invalid and false is returned.
4196  */
4197 static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
4198                                              gpa_t new_pgd,
4199                                              union kvm_mmu_page_role new_role)
4200 {
4201         uint i;
4202
4203         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4204                 if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
4205                         goto hit;
4206
4207         return false;
4208
4209 hit:
4210         swap(mmu->root, mmu->prev_roots[i]);
4211         /* Bubble up the remaining roots.  */
4212         for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
4213                 mmu->prev_roots[i] = mmu->prev_roots[i + 1];
4214         mmu->prev_roots[i].hpa = INVALID_PAGE;
4215         return true;
4216 }
4217
4218 static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
4219                             gpa_t new_pgd, union kvm_mmu_page_role new_role)
4220 {
4221         /*
4222          * For now, limit the caching to 64-bit hosts+VMs in order to avoid
4223          * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4224          * later if necessary.
4225          */
4226         if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
4227                 kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4228
4229         if (VALID_PAGE(mmu->root.hpa))
4230                 return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
4231         else
4232                 return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
4233 }
4234
4235 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
4236 {
4237         struct kvm_mmu *mmu = vcpu->arch.mmu;
4238         union kvm_mmu_page_role new_role = mmu->root_role;
4239
4240         if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
4241                 /* kvm_mmu_ensure_valid_pgd will set up a new root.  */
4242                 return;
4243         }
4244
4245         /*
4246          * It's possible that the cached previous root page is obsolete because
4247          * of a change in the MMU generation number. However, changing the
4248          * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
4249          * which will free the root set here and allocate a new one.
4250          */
4251         kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
4252
4253         if (force_flush_and_sync_on_reuse) {
4254                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4255                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
4256         }
4257
4258         /*
4259          * The last MMIO access's GVA and GPA are cached in the VCPU. When
4260          * switching to a new CR3, that GVA->GPA mapping may no longer be
4261          * valid. So clear any cached MMIO info even when we don't need to sync
4262          * the shadow page tables.
4263          */
4264         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4265
4266         /*
4267          * If this is a direct root page, it doesn't have a write flooding
4268          * count. Otherwise, clear the write flooding count.
4269          */
4270         if (!new_role.direct)
4271                 __clear_sp_write_flooding_count(
4272                                 to_shadow_page(vcpu->arch.mmu->root.hpa));
4273 }
4274 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
4275
4276 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4277 {
4278         return kvm_read_cr3(vcpu);
4279 }
4280
4281 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4282                            unsigned int access)
4283 {
4284         if (unlikely(is_mmio_spte(*sptep))) {
4285                 if (gfn != get_mmio_spte_gfn(*sptep)) {
4286                         mmu_spte_clear_no_track(sptep);
4287                         return true;
4288                 }
4289
4290                 mark_mmio_spte(vcpu, sptep, gfn, access);
4291                 return true;
4292         }
4293
4294         return false;
4295 }
4296
4297 #define PTTYPE_EPT 18 /* arbitrary */
4298 #define PTTYPE PTTYPE_EPT
4299 #include "paging_tmpl.h"
4300 #undef PTTYPE
4301
4302 #define PTTYPE 64
4303 #include "paging_tmpl.h"
4304 #undef PTTYPE
4305
4306 #define PTTYPE 32
4307 #include "paging_tmpl.h"
4308 #undef PTTYPE
4309
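/*
 * Compute the per-level reserved bit masks for a non-EPT paging mode based
 * on the reserved physical address bits and on whether NX, 1GB pages, PSE
 * and the AMD-specific bit 8 quirk apply.
 */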
4310 static void
4311 __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
4312                         u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
4313                         bool pse, bool amd)
4314 {
4315         u64 gbpages_bit_rsvd = 0;
4316         u64 nonleaf_bit8_rsvd = 0;
4317         u64 high_bits_rsvd;
4318
4319         rsvd_check->bad_mt_xwr = 0;
4320
4321         if (!gbpages)
4322                 gbpages_bit_rsvd = rsvd_bits(7, 7);
4323
4324         if (level == PT32E_ROOT_LEVEL)
4325                 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
4326         else
4327                 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4328
4329         /* Note, NX doesn't exist in PDPTEs, this is handled below. */
4330         if (!nx)
4331                 high_bits_rsvd |= rsvd_bits(63, 63);
4332
4333         /*
4334          * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4335          * leaf entries) on AMD CPUs only.
4336          */
4337         if (amd)
4338                 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4339
4340         switch (level) {
4341         case PT32_ROOT_LEVEL:
4342                 /* no rsvd bits for 2 level 4K page table entries */
4343                 rsvd_check->rsvd_bits_mask[0][1] = 0;
4344                 rsvd_check->rsvd_bits_mask[0][0] = 0;
4345                 rsvd_check->rsvd_bits_mask[1][0] =
4346                         rsvd_check->rsvd_bits_mask[0][0];
4347
4348                 if (!pse) {
4349                         rsvd_check->rsvd_bits_mask[1][1] = 0;
4350                         break;
4351                 }
4352
4353                 if (is_cpuid_PSE36())
4354                         /* 36bits PSE 4MB page */
4355                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4356                 else
4357                         /* 32 bits PSE 4MB page */
4358                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4359                 break;
4360         case PT32E_ROOT_LEVEL:
4361                 rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
4362                                                    high_bits_rsvd |
4363                                                    rsvd_bits(5, 8) |
4364                                                    rsvd_bits(1, 2);     /* PDPTE */
4365                 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;      /* PDE */
4366                 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;      /* PTE */
4367                 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4368                                                    rsvd_bits(13, 20);   /* large page */
4369                 rsvd_check->rsvd_bits_mask[1][0] =
4370                         rsvd_check->rsvd_bits_mask[0][0];
4371                 break;
4372         case PT64_ROOT_5LEVEL:
4373                 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
4374                                                    nonleaf_bit8_rsvd |
4375                                                    rsvd_bits(7, 7);
4376                 rsvd_check->rsvd_bits_mask[1][4] =
4377                         rsvd_check->rsvd_bits_mask[0][4];
4378                 fallthrough;
4379         case PT64_ROOT_4LEVEL:
4380                 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
4381                                                    nonleaf_bit8_rsvd |
4382                                                    rsvd_bits(7, 7);
4383                 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
4384                                                    gbpages_bit_rsvd;
4385                 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
4386                 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4387                 rsvd_check->rsvd_bits_mask[1][3] =
4388                         rsvd_check->rsvd_bits_mask[0][3];
4389                 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
4390                                                    gbpages_bit_rsvd |
4391                                                    rsvd_bits(13, 29);
4392                 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4393                                                    rsvd_bits(13, 20); /* large page */
4394                 rsvd_check->rsvd_bits_mask[1][0] =
4395                         rsvd_check->rsvd_bits_mask[0][0];
4396                 break;
4397         }
4398 }
4399
4400 static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
4401 {
4402         /*
4403          * If TDP is enabled, let the guest use GBPAGES if they're supported in
4404          * hardware.  The hardware page walker doesn't let KVM disable GBPAGES,
4405          * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
4406          * walk for performance and complexity reasons.  Not to mention KVM
4407          * _can't_ solve the problem because GVA->GPA walks aren't visible to
4408          * KVM once a TDP translation is installed.  Mimic hardware behavior so
4409          * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
4410          */
4411         return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
4412                              guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
4413 }
4414
4415 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4416                                   struct kvm_mmu *context)
4417 {
4418         __reset_rsvds_bits_mask(&context->guest_rsvd_check,
4419                                 vcpu->arch.reserved_gpa_bits,
4420                                 context->cpu_role.base.level, is_efer_nx(context),
4421                                 guest_can_use_gbpages(vcpu),
4422                                 is_cr4_pse(context),
4423                                 guest_cpuid_is_amd_or_hygon(vcpu));
4424 }
4425
4426 static void
4427 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4428                             u64 pa_bits_rsvd, bool execonly, int huge_page_level)
4429 {
4430         u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4431         u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
4432         u64 bad_mt_xwr;
4433
4434         if (huge_page_level < PG_LEVEL_1G)
4435                 large_1g_rsvd = rsvd_bits(7, 7);
4436         if (huge_page_level < PG_LEVEL_2M)
4437                 large_2m_rsvd = rsvd_bits(7, 7);
4438
4439         rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
4440         rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
4441         rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
4442         rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
4443         rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4444
4445         /* large page */
4446         rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4447         rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4448         rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
4449         rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
4450         rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4451
4452         bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
4453         bad_mt_xwr |= 0xFFull << (3 * 8);       /* bits 3..5 must not be 3 */
4454         bad_mt_xwr |= 0xFFull << (7 * 8);       /* bits 3..5 must not be 7 */
4455         bad_mt_xwr |= REPEAT_BYTE(1ull << 2);   /* bits 0..2 must not be 010 */
4456         bad_mt_xwr |= REPEAT_BYTE(1ull << 6);   /* bits 0..2 must not be 110 */
4457         if (!execonly) {
4458                 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4459                 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4460         }
4461         rsvd_check->bad_mt_xwr = bad_mt_xwr;
4462 }
4463
4464 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4465                 struct kvm_mmu *context, bool execonly, int huge_page_level)
4466 {
4467         __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4468                                     vcpu->arch.reserved_gpa_bits, execonly,
4469                                     huge_page_level);
4470 }
4471
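/* Host physical-address bits at or above shadow_phys_bits are reserved in SPTEs. */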
4472 static inline u64 reserved_hpa_bits(void)
4473 {
4474         return rsvd_bits(shadow_phys_bits, 63);
4475 }
4476
4477 /*
4478  * The page table on the host is the shadow page table for the page
4479  * table in the guest or in an AMD nested guest; its MMU features
4480  * completely follow the guest's features.
4481  */
4482 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4483                                         struct kvm_mmu *context)
4484 {
4485         /* @amd adds a check on bit 8 of SPTEs, which KVM shouldn't use anyway. */
4486         bool is_amd = true;
4487         /* KVM doesn't use 2-level page tables for the shadow MMU. */
4488         bool is_pse = false;
4489         struct rsvd_bits_validate *shadow_zero_check;
4490         int i;
4491
4492         WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
4493
4494         shadow_zero_check = &context->shadow_zero_check;
4495         __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4496                                 context->root_role.level,
4497                                 context->root_role.efer_nx,
4498                                 guest_can_use_gbpages(vcpu), is_pse, is_amd);
4499
4500         if (!shadow_me_mask)
4501                 return;
4502
4503         for (i = context->root_role.level; --i >= 0;) {
4504                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4505                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4506         }
4507
4509
4510 static inline bool boot_cpu_is_amd(void)
4511 {
4512         WARN_ON_ONCE(!tdp_enabled);
4513         return shadow_x_mask == 0;
4514 }
4515
4516 /*
4517  * The direct page table on the host uses as many MMU features as
4518  * possible; however, KVM currently does not do execution-protection.
4519  */
4520 static void
4521 reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
4522 {
4523         struct rsvd_bits_validate *shadow_zero_check;
4524         int i;
4525
4526         shadow_zero_check = &context->shadow_zero_check;
4527
4528         if (boot_cpu_is_amd())
4529                 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4530                                         context->root_role.level, false,
4531                                         boot_cpu_has(X86_FEATURE_GBPAGES),
4532                                         false, true);
4533         else
4534                 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4535                                             reserved_hpa_bits(), false,
4536                                             max_huge_page_level);
4537
4538         if (!shadow_me_mask)
4539                 return;
4540
4541         for (i = context->root_role.level; --i >= 0;) {
4542                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4543                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4544         }
4545 }
4546
4547 /*
4548  * As the comments in reset_shadow_zero_bits_mask(), except this is the
4549  * shadow page table for an Intel nested guest (EPT).
4550  */
4551 static void
4552 reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
4553 {
4554         __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4555                                     reserved_hpa_bits(), execonly,
4556                                     max_huge_page_level);
4557 }
4558
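/*
 * Expand a 3-bit ACC_*_MASK value into a byte with one bit per possible
 * user/write/exec access combination: bit N of the result is set iff the
 * 3-bit combination N includes the given access type (combination 0 has no
 * access bits, so bit 0 is always clear).
 */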
4559 #define BYTE_MASK(access) \
4560         ((1 & (access) ? 2 : 0) | \
4561          (2 & (access) ? 4 : 0) | \
4562          (3 & (access) ? 8 : 0) | \
4563          (4 & (access) ? 16 : 0) | \
4564          (5 & (access) ? 32 : 0) | \
4565          (6 & (access) ? 64 : 0) | \
4566          (7 & (access) ? 128 : 0))
4567
4568
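/*
 * Precompute, for every page-fault error code (PFEC), which combinations of
 * PTE user/write/exec access bits would fault.  mmu->permissions[] is indexed
 * by the error code shifted right by one (the P bit is dropped) and each
 * entry has one bit per access combination, so permission_fault() only needs
 * a table lookup and a bit test on the hot path.
 */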
4569 static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
4570 {
4571         unsigned byte;
4572
4573         const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4574         const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4575         const u8 u = BYTE_MASK(ACC_USER_MASK);
4576
4577         bool cr4_smep = is_cr4_smep(mmu);
4578         bool cr4_smap = is_cr4_smap(mmu);
4579         bool cr0_wp = is_cr0_wp(mmu);
4580         bool efer_nx = is_efer_nx(mmu);
4581
4582         for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4583                 unsigned pfec = byte << 1;
4584
4585                 /*
4586                  * Each "*f" variable has a 1 bit for each UWX value
4587                  * that causes a fault with the given PFEC.
4588                  */
4589
4590                 /* Faults from writes to non-writable pages */
4591                 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4592                 /* Faults from user mode accesses to supervisor pages */
4593                 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4594                 /* Faults from fetches of non-executable pages */
4595                 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4596                 /* Faults from kernel mode fetches of user pages */
4597                 u8 smepf = 0;
4598                 /* Faults from kernel mode accesses of user pages */
4599                 u8 smapf = 0;
4600
4601                 if (!ept) {
4602                         /* Faults from kernel mode accesses to user pages */
4603                         u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4604
4605                         /* Not really needed: !nx will cause pte.nx to fault */
4606                         if (!efer_nx)
4607                                 ff = 0;
4608
4609                         /* Allow supervisor writes if !cr0.wp */
4610                         if (!cr0_wp)
4611                                 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4612
4613                         /* Disallow supervisor fetches of user code if cr4.smep */
4614                         if (cr4_smep)
4615                                 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4616
4617                         /*
4618                          * SMAP:kernel-mode data accesses from user-mode
4619                          * mappings should fault. A fault is considered
4620                          * as a SMAP violation if all of the following
4621                          * conditions are true:
4622                          *   - X86_CR4_SMAP is set in CR4
4623                          *   - A user page is accessed
4624                          *   - The access is not a fetch
4625                          *   - The access is supervisor mode
4626                          *   - If implicit supervisor access or X86_EFLAGS_AC is clear
4627                          *
4628                          * Here, we cover the first four conditions.
4629                          * The fifth is computed dynamically in permission_fault();
4630                          * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4631                          * *not* subject to SMAP restrictions.
4632                          */
4633                         if (cr4_smap)
4634                                 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4635                 }
4636
4637                 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4638         }
4639 }
4640
4641 /*
4642  * PKU is an additional mechanism by which paging controls access to
4643  * user-mode addresses based on the value in the PKRU register.  Protection
4644  * key violations are reported through a bit in the page fault error code.
4645  * Unlike other bits of the error code, the PK bit is not known at the
4646  * call site of e.g. gva_to_gpa; it must be computed directly in
4647  * permission_fault based on two bits of PKRU, on some machine state (CR4,
4648  * CR0, EFER, CPL), and on other bits of the error code and the page tables.
4649  *
4650  * In particular the following conditions come from the error code, the
4651  * page tables and the machine state:
4652  * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4653  * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4654  * - PK is always zero if U=0 in the page tables
4655  * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4656  *
4657  * The PKRU bitmask caches the result of these four conditions.  The error
4658  * code (minus the P bit) and the page table's U bit form an index into the
4659  * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
4660  * with the two bits of the PKRU register corresponding to the protection key.
4661  * For the first three conditions above the bits will be 00, thus masking
4662  * away both AD and WD.  For all reads or if the last condition holds, only
4663  * WD will be masked away.
4664  */
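/*
 * Example: for a user-mode write to a user page (PFEC.U=1, PFEC.W=1,
 * pte.U=1), update_pkru_bitmask() sets both bits for that index, so the
 * access faults if either PKRU.AD or PKRU.WD is set for the page's key.
 * For a supervisor write with CR0.WP=0, only the AD bit is set, i.e.
 * PKRU.WD is ignored, matching the last condition above.
 */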
4665 static void update_pkru_bitmask(struct kvm_mmu *mmu)
4666 {
4667         unsigned bit;
4668         bool wp;
4669
4670         mmu->pkru_mask = 0;
4671
4672         if (!is_cr4_pke(mmu))
4673                 return;
4674
4675         wp = is_cr0_wp(mmu);
4676
4677         for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4678                 unsigned pfec, pkey_bits;
4679                 bool check_pkey, check_write, ff, uf, wf, pte_user;
4680
4681                 pfec = bit << 1;
4682                 ff = pfec & PFERR_FETCH_MASK;
4683                 uf = pfec & PFERR_USER_MASK;
4684                 wf = pfec & PFERR_WRITE_MASK;
4685
4686                 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4687                 pte_user = pfec & PFERR_RSVD_MASK;
4688
4689                 /*
4690                  * Only need to check the access which is not an
4691                  * instruction fetch and is to a user page.
4692                  */
4693                 check_pkey = (!ff && pte_user);
4694                 /*
4695                  * write access is controlled by PKRU if it is a
4696                  * user access or CR0.WP = 1.
4697                  */
4698                 check_write = check_pkey && wf && (uf || wp);
4699
4700                 /* PKRU.AD stops both read and write access. */
4701                 pkey_bits = !!check_pkey;
4702                 /* PKRU.WD stops write access. */
4703                 pkey_bits |= (!!check_write) << 1;
4704
4705                 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4706         }
4707 }
4708
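/*
 * Recompute all metadata derived from the guest's paging configuration:
 * reserved bits, the permission bitmap and the PKRU bitmask.  Nothing to do
 * if the guest has paging disabled.
 */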
4709 static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
4710                                         struct kvm_mmu *mmu)
4711 {
4712         if (!is_cr0_pg(mmu))
4713                 return;
4714
4715         reset_rsvds_bits_mask(vcpu, mmu);
4716         update_permission_bitmask(mmu, false);
4717         update_pkru_bitmask(mmu);
4718 }
4719
4720 static void paging64_init_context(struct kvm_mmu *context)
4721 {
4722         context->page_fault = paging64_page_fault;
4723         context->gva_to_gpa = paging64_gva_to_gpa;
4724         context->sync_page = paging64_sync_page;
4725         context->invlpg = paging64_invlpg;
4726         context->direct_map = false;
4727 }
4728
4729 static void paging32_init_context(struct kvm_mmu *context)
4730 {
4731         context->page_fault = paging32_page_fault;
4732         context->gva_to_gpa = paging32_gva_to_gpa;
4733         context->sync_page = paging32_sync_page;
4734         context->invlpg = paging32_invlpg;
4735         context->direct_map = false;
4736 }
4737
4738 static union kvm_cpu_role
4739 kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
4740 {
4741         union kvm_cpu_role role = {0};
4742
4743         role.base.access = ACC_ALL;
4744         role.base.smm = is_smm(vcpu);
4745         role.base.guest_mode = is_guest_mode(vcpu);
4746         role.ext.valid = 1;
4747
4748         if (!____is_cr0_pg(regs)) {
4749                 role.base.direct = 1;
4750                 return role;
4751         }
4752
4753         role.base.efer_nx = ____is_efer_nx(regs);
4754         role.base.cr0_wp = ____is_cr0_wp(regs);
4755         role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
4756         role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
4757         role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
4758
4759         if (____is_efer_lma(regs))
4760                 role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
4761                                                         : PT64_ROOT_4LEVEL;
4762         else if (____is_cr4_pae(regs))
4763                 role.base.level = PT32E_ROOT_LEVEL;
4764         else
4765                 role.base.level = PT32_ROOT_LEVEL;
4766
4767         role.ext.cr4_smep = ____is_cr4_smep(regs);
4768         role.ext.cr4_smap = ____is_cr4_smap(regs);
4769         role.ext.cr4_pse = ____is_cr4_pse(regs);
4770
4771         /* PKEY and LA57 are active iff long mode is active. */
4772         role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
4773         role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
4774         role.ext.efer_lma = ____is_efer_lma(regs);
4775         return role;
4776 }
4777
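/*
 * Return the page-table level used for TDP roots: the vendor-forced level if
 * one was set, otherwise the host's max TDP level, downgraded to 4-level
 * paging when 5 levels would be useless (guest MAXPHYADDR <= 48).
 */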
4778 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
4779 {
4780         /* tdp_root_level is the architecture-forced level, use it if nonzero */
4781         if (tdp_root_level)
4782                 return tdp_root_level;
4783
4784         /* Use 5-level TDP if and only if it's useful/necessary. */
4785         if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
4786                 return 4;
4787
4788         return max_tdp_level;
4789 }
4790
4791 static union kvm_mmu_page_role
4792 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
4793                                 union kvm_cpu_role cpu_role)
4794 {
4795         union kvm_mmu_page_role role = {0};
4796
4797         role.access = ACC_ALL;
4798         role.cr0_wp = true;
4799         role.efer_nx = true;
4800         role.smm = cpu_role.base.smm;
4801         role.guest_mode = cpu_role.base.guest_mode;
4802         role.ad_disabled = (shadow_accessed_mask == 0);
4803         role.level = kvm_mmu_get_tdp_level(vcpu);
4804         role.direct = true;
4805         role.has_4_byte_gpte = false;
4806
4807         return role;
4808 }
4809
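/*
 * (Re)configure the root MMU for TDP.  Skip the work if neither the CPU role
 * nor the root role changed; otherwise install the TDP fault/translation
 * callbacks and recompute the guest and shadow reserved-bit metadata.
 */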
4810 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
4811                              union kvm_cpu_role cpu_role)
4812 {
4813         struct kvm_mmu *context = &vcpu->arch.root_mmu;
4814         union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
4815
4816         if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
4817             root_role.word == context->root_role.word)
4818                 return;
4819
4820         context->cpu_role.as_u64 = cpu_role.as_u64;
4821         context->root_role.word = root_role.word;
4822         context->page_fault = kvm_tdp_page_fault;
4823         context->sync_page = nonpaging_sync_page;
4824         context->invlpg = NULL;
4825         context->direct_map = true;
4826         context->get_guest_pgd = get_cr3;
4827         context->get_pdptr = kvm_pdptr_read;
4828         context->inject_page_fault = kvm_inject_page_fault;
4829
4830         if (!is_cr0_pg(context))
4831                 context->gva_to_gpa = nonpaging_gva_to_gpa;
4832         else if (is_cr4_pae(context))
4833                 context->gva_to_gpa = paging64_gva_to_gpa;
4834         else
4835                 context->gva_to_gpa = paging32_gva_to_gpa;
4836
4837         reset_guest_paging_metadata(vcpu, context);
4838         reset_tdp_shadow_zero_bits_mask(context);
4839 }
4840
4841 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
4842                                     union kvm_cpu_role cpu_role,
4843                                     union kvm_mmu_page_role root_role)
4844 {
4845         if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
4846             root_role.word == context->root_role.word)
4847                 return;
4848
4849         context->cpu_role.as_u64 = cpu_role.as_u64;
4850         context->root_role.word = root_role.word;
4851
4852         if (!is_cr0_pg(context))
4853                 nonpaging_init_context(context);
4854         else if (is_cr4_pae(context))
4855                 paging64_init_context(context);
4856         else
4857                 paging32_init_context(context);
4858
4859         reset_guest_paging_metadata(vcpu, context);
4860         reset_shadow_zero_bits_mask(vcpu, context);
4861 }
4862
4863 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
4864                                 union kvm_cpu_role cpu_role)
4865 {
4866         struct kvm_mmu *context = &vcpu->arch.root_mmu;
4867         union kvm_mmu_page_role root_role;
4868
4869         root_role = cpu_role.base;
4870
4871         /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
4872         root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
4873
4874         /*
4875          * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
4876          * KVM uses NX when TDP is disabled to handle a variety of scenarios,
4877          * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
4878          * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
4879          * The iTLB multi-hit workaround can be toggled at any time, so assume
4880          * NX can be used by any non-nested shadow MMU to avoid having to reset
4881          * MMU contexts.
4882          */
4883         root_role.efer_nx = true;
4884
4885         shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
4886 }
4887
4888 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
4889                              unsigned long cr4, u64 efer, gpa_t nested_cr3)
4890 {
4891         struct kvm_mmu *context = &vcpu->arch.guest_mmu;
4892         struct kvm_mmu_role_regs regs = {
4893                 .cr0 = cr0,
4894                 .cr4 = cr4 & ~X86_CR4_PKE,
4895                 .efer = efer,
4896         };
4897         union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
4898         union kvm_mmu_page_role root_role;
4899
4900         /* NPT requires CR0.PG=1. */
4901         WARN_ON_ONCE(cpu_role.base.direct);
4902
4903         root_role = cpu_role.base;
4904         root_role.level = kvm_mmu_get_tdp_level(vcpu);
4905
4906         shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
4907         kvm_mmu_new_pgd(vcpu, nested_cr3);
4908 }
4909 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
4910
4911 static union kvm_cpu_role
4912 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
4913                                    bool execonly, u8 level)
4914 {
4915         union kvm_cpu_role role = {0};
4916
4917         /*
4918          * KVM does not support SMM transfer monitors, and consequently does not
4919          * support the "entry to SMM" control either.  role.base.smm is always 0.
4920          */
4921         WARN_ON_ONCE(is_smm(vcpu));
4922         role.base.level = level;
4923         role.base.has_4_byte_gpte = false;
4924         role.base.direct = false;
4925         role.base.ad_disabled = !accessed_dirty;
4926         role.base.guest_mode = true;
4927         role.base.access = ACC_ALL;
4928
4929         role.ext.word = 0;
4930         role.ext.execonly = execonly;
4931         role.ext.valid = 1;
4932
4933         return role;
4934 }
4935
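/*
 * Configure vcpu->arch.guest_mmu to shadow L1's EPT page tables.  The context
 * is reinitialized only when the computed role changes, but the root is
 * always switched to the new EPTP via kvm_mmu_new_pgd().
 */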
4936 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
4937                              int huge_page_level, bool accessed_dirty,
4938                              gpa_t new_eptp)
4939 {
4940         struct kvm_mmu *context = &vcpu->arch.guest_mmu;
4941         u8 level = vmx_eptp_page_walk_level(new_eptp);
4942         union kvm_cpu_role new_mode =
4943                 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
4944                                                    execonly, level);
4945
4946         if (new_mode.as_u64 != context->cpu_role.as_u64) {
4947                 /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
4948                 context->cpu_role.as_u64 = new_mode.as_u64;
4949                 context->root_role.word = new_mode.base.word;
4950
4951                 context->page_fault = ept_page_fault;
4952                 context->gva_to_gpa = ept_gva_to_gpa;
4953                 context->sync_page = ept_sync_page;
4954                 context->invlpg = ept_invlpg;
4955                 context->direct_map = false;
4956                 update_permission_bitmask(context, true);
4957                 context->pkru_mask = 0;
4958                 reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
4959                 reset_ept_shadow_zero_bits_mask(context, execonly);
4960         }
4961
4962         kvm_mmu_new_pgd(vcpu, new_eptp);
4963 }
4964 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
4965
4966 static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
4967                              union kvm_cpu_role cpu_role)
4968 {
4969         struct kvm_mmu *context = &vcpu->arch.root_mmu;
4970
4971         kvm_init_shadow_mmu(vcpu, cpu_role);
4972
4973         context->get_guest_pgd     = get_cr3;
4974         context->get_pdptr         = kvm_pdptr_read;
4975         context->inject_page_fault = kvm_inject_page_fault;
4976 }
4977
4978 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
4979                                 union kvm_cpu_role new_mode)
4980 {
4981         struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
4982
4983         if (new_mode.as_u64 == g_context->cpu_role.as_u64)
4984                 return;
4985
4986         g_context->cpu_role.as_u64   = new_mode.as_u64;
4987         g_context->get_guest_pgd     = get_cr3;
4988         g_context->get_pdptr         = kvm_pdptr_read;
4989         g_context->inject_page_fault = kvm_inject_page_fault;
4990
4991         /*
4992          * L2 page tables are never shadowed, so there is no need to sync
4993          * SPTEs.
4994          */
4995         g_context->invlpg            = NULL;
4996
4997         /*
4998          * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
4999          * L1's nested page tables (e.g. EPT12). The nested translation
5000          * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5001          * L2's page tables as the first level of translation and L1's
5002          * nested page tables as the second level of translation. Basically
5003          * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5004          */
5005         if (!is_paging(vcpu))
5006                 g_context->gva_to_gpa = nonpaging_gva_to_gpa;
5007         else if (is_long_mode(vcpu))
5008                 g_context->gva_to_gpa = paging64_gva_to_gpa;
5009         else if (is_pae(vcpu))
5010                 g_context->gva_to_gpa = paging64_gva_to_gpa;
5011         else
5012                 g_context->gva_to_gpa = paging32_gva_to_gpa;
5013
5014         reset_guest_paging_metadata(vcpu, g_context);
5015 }
5016
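/*
 * Initialize whichever MMU the vCPU currently needs: the nested MMU for
 * walking L2's guest page tables, the TDP MMU when EPT/NPT is enabled, or
 * the software shadow MMU otherwise.
 */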
5017 void kvm_init_mmu(struct kvm_vcpu *vcpu)
5018 {
5019         struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
5020         union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
5021
5022         if (mmu_is_nested(vcpu))
5023                 init_kvm_nested_mmu(vcpu, cpu_role);
5024         else if (tdp_enabled)
5025                 init_kvm_tdp_mmu(vcpu, cpu_role);
5026         else
5027                 init_kvm_softmmu(vcpu, cpu_role);
5028 }
5029 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5030
5031 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
5032 {
5033         /*
5034          * Invalidate all MMU roles to force them to reinitialize as CPUID
5035          * information is factored into reserved bit calculations.
5036          *
5037          * Correctly handling multiple vCPU models with respect to paging and
5038          * physical address properties in a single VM would require tracking
5039          * all relevant CPUID information in kvm_mmu_page_role. That is very
5040          * undesirable as it would increase the memory requirements for
5041          * gfn_track (see struct kvm_mmu_page_role comments).  For now that
5042          * problem is swept under the rug; KVM's CPUID API is horrific and
5043          * it's all but impossible to solve it without introducing a new API.
5044          */
5045         vcpu->arch.root_mmu.root_role.word = 0;
5046         vcpu->arch.guest_mmu.root_role.word = 0;
5047         vcpu->arch.nested_mmu.root_role.word = 0;
5048         vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
5049         vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
5050         vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
5051         kvm_mmu_reset_context(vcpu);
5052
5053         /*
5054          * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
5055          * kvm_arch_vcpu_ioctl().
5056          */
5057         KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
5058 }
5059
5060 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5061 {
5062         kvm_mmu_unload(vcpu);
5063         kvm_init_mmu(vcpu);
5064 }
5065 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5066
5067 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5068 {
5069         int r;
5070
5071         r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
5072         if (r)
5073                 goto out;
5074         r = mmu_alloc_special_roots(vcpu);
5075         if (r)
5076                 goto out;
5077         if (vcpu->arch.mmu->direct_map)
5078                 r = mmu_alloc_direct_roots(vcpu);
5079         else
5080                 r = mmu_alloc_shadow_roots(vcpu);
5081         if (r)
5082                 goto out;
5083
5084         kvm_mmu_sync_roots(vcpu);
5085
5086         kvm_mmu_load_pgd(vcpu);
5087
5088         /*
5089          * Flush any TLB entries for the new root, the provenance of the root
5090          * is unknown.  Even if KVM ensures there are no stale TLB entries
5091          * for a freed root, in theory another hypervisor could have left
5092          * stale entries.  Flushing on alloc also allows KVM to skip the TLB
5093          * flush when freeing a root (see kvm_tdp_mmu_put_root()).
5094          */
5095         static_call(kvm_x86_flush_tlb_current)(vcpu);
5096 out:
5097         return r;
5098 }
5099
5100 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5101 {
5102         struct kvm *kvm = vcpu->kvm;
5103
5104         kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5105         WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
5106         kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5107         WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
5108         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
5109 }
5110
5111 static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
5112 {
5113         struct kvm_mmu_page *sp;
5114
5115         if (!VALID_PAGE(root_hpa))
5116                 return false;
5117
5118         /*
5119          * When freeing obsolete roots, treat roots as obsolete if they don't
5120          * have an associated shadow page.  This does mean KVM will get false
5121          * positives and free roots that don't strictly need to be freed, but
5122          * such false positives are relatively rare:
5123          *
5124          *  (a) only PAE paging and nested NPT have roots without shadow pages
5125          *  (b) remote reloads due to a memslot update obsoletes _all_ roots
5126          *  (c) KVM doesn't track previous roots for PAE paging, and the guest
5127          *      is unlikely to zap an in-use PGD.
5128          */
5129         sp = to_shadow_page(root_hpa);
5130         return !sp || is_obsolete_sp(kvm, sp);
5131 }
5132
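/* Free the current root and any cached previous roots that have become obsolete. */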
5133 static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
5134 {
5135         unsigned long roots_to_free = 0;
5136         int i;
5137
5138         if (is_obsolete_root(kvm, mmu->root.hpa))
5139                 roots_to_free |= KVM_MMU_ROOT_CURRENT;
5140
5141         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5142                 if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
5143                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5144         }
5145
5146         if (roots_to_free)
5147                 kvm_mmu_free_roots(kvm, mmu, roots_to_free);
5148 }
5149
5150 void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
5151 {
5152         __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
5153         __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
5154 }
5155
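/*
 * Return true if replacing SPTE @old with @new requires flushing remote TLBs,
 * i.e. if a present translation was removed, repointed to a different frame,
 * or had permissions tightened (the NX bit is inverted so that setting NX
 * also counts as removing a permission).
 */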
5156 static bool need_remote_flush(u64 old, u64 new)
5157 {
5158         if (!is_shadow_present_pte(old))
5159                 return false;
5160         if (!is_shadow_present_pte(new))
5161                 return true;
5162         if ((old ^ new) & PT64_BASE_ADDR_MASK)
5163                 return true;
5164         old ^= shadow_nx_mask;
5165         new ^= shadow_nx_mask;
5166         return (old & ~new & PT64_PERM_MASK) != 0;
5167 }
5168
5169 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5170                                     int *bytes)
5171 {
5172         u64 gentry = 0;
5173         int r;
5174
5175         /*
5176          * Assume that the pte write is on a page table of the same type
5177          * as the current vcpu paging mode, since we update the sptes only
5178          * when they have the same mode.
5179          */
5180         if (is_pae(vcpu) && *bytes == 4) {
5181                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5182                 *gpa &= ~(gpa_t)7;
5183                 *bytes = 8;
5184         }
5185
5186         if (*bytes == 4 || *bytes == 8) {
5187                 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5188                 if (r)
5189                         gentry = 0;
5190         }
5191
5192         return gentry;
5193 }
5194
5195 /*
5196  * If we're seeing too many writes to a page, it may no longer be a page table,
5197  * or we may be forking, in which case it is better to unmap the page.
5198  */
5199 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5200 {
5201         /*
5202          * Skip write-flooding detection for an sp whose level is 1, because
5203          * it can become unsync, and then the guest page is not write-protected.
5204          */
5205         if (sp->role.level == PG_LEVEL_4K)
5206                 return false;
5207
5208         atomic_inc(&sp->write_flooding_count);
5209         return atomic_read(&sp->write_flooding_count) >= 3;
5210 }
5211
5212 /*
5213  * Misaligned accesses are too much trouble to fix up; also, they usually
5214  * indicate a page is not used as a page table.
5215  */
5216 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5217                                     int bytes)
5218 {
5219         unsigned offset, pte_size, misaligned;
5220
5221         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5222                  gpa, bytes, sp->role.word);
5223
5224         offset = offset_in_page(gpa);
5225         pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
5226
5227         /*
5228          * Sometimes, the OS only writes the last byte to update status
5229          * bits, for example, in Linux, the andb instruction is used in clear_bit().
5230          */
5231         if (!(offset & (pte_size - 1)) && bytes == 1)
5232                 return false;
5233
5234         misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5235         misaligned |= bytes < 4;
5236
5237         return misaligned;
5238 }
5239
5240 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5241 {
5242         unsigned page_offset, quadrant;
5243         u64 *spte;
5244         int level;
5245
5246         page_offset = offset_in_page(gpa);
5247         level = sp->role.level;
5248         *nspte = 1;
5249         if (sp->role.has_4_byte_gpte) {
5250                 page_offset <<= 1;      /* 32->64 */
5251                 /*
5252                  * A 32-bit pde maps 4MB while the shadow pdes map
5253                  * only 2MB.  So we need to double the offset again
5254                  * and zap two pdes instead of one.
5255                  */
5256                 if (level == PT32_ROOT_LEVEL) {
5257                         page_offset &= ~7; /* kill rounding error */
5258                         page_offset <<= 1;
5259                         *nspte = 2;
5260                 }
5261                 quadrant = page_offset >> PAGE_SHIFT;
5262                 page_offset &= ~PAGE_MASK;
5263                 if (quadrant != sp->role.quadrant)
5264                         return NULL;
5265         }
5266
5267         spte = &sp->spt[page_offset / sizeof(*spte)];
5268         return spte;
5269 }
5270
5271 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5272                               const u8 *new, int bytes,
5273                               struct kvm_page_track_notifier_node *node)
5274 {
5275         gfn_t gfn = gpa >> PAGE_SHIFT;
5276         struct kvm_mmu_page *sp;
5277         LIST_HEAD(invalid_list);
5278         u64 entry, gentry, *spte;
5279         int npte;
5280         bool flush = false;
5281
5282         /*
5283          * If we don't have indirect shadow pages, it means no page is
5284          * write-protected, so we can simply exit.
5285          */
5286         if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5287                 return;
5288
5289         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5290
5291         /*
5292          * No need to care whether the memory allocation is successful
5293          * or not since pte prefetch is skipped if it does not have
5294          * enough objects in the cache.
5295          */
5296         mmu_topup_memory_caches(vcpu, true);
5297
5298         write_lock(&vcpu->kvm->mmu_lock);
5299
5300         gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5301
5302         ++vcpu->kvm->stat.mmu_pte_write;
5303
5304         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5305                 if (detect_write_misaligned(sp, gpa, bytes) ||
5306                       detect_write_flooding(sp)) {
5307                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5308                         ++vcpu->kvm->stat.mmu_flooded;
5309                         continue;
5310                 }
5311
5312                 spte = get_written_sptes(sp, gpa, &npte);
5313                 if (!spte)
5314                         continue;
5315
5316                 while (npte--) {
5317                         entry = *spte;
5318                         mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
5319                         if (gentry && sp->role.level != PG_LEVEL_4K)
5320                                 ++vcpu->kvm->stat.mmu_pde_zapped;
5321                         if (need_remote_flush(entry, *spte))
5322                                 flush = true;
5323                         ++spte;
5324                 }
5325         }
5326         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
5327         write_unlock(&vcpu->kvm->mmu_lock);
5328 }
5329
5330 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5331                        void *insn, int insn_len)
5332 {
5333         int r, emulation_type = EMULTYPE_PF;
5334         bool direct = vcpu->arch.mmu->direct_map;
5335
5336         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
5337                 return RET_PF_RETRY;
5338
5339         r = RET_PF_INVALID;
5340         if (unlikely(error_code & PFERR_RSVD_MASK)) {
5341                 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5342                 if (r == RET_PF_EMULATE)
5343                         goto emulate;
5344         }
5345
5346         if (r == RET_PF_INVALID) {
5347                 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5348                                           lower_32_bits(error_code), false);
5349                 if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
5350                         return -EIO;
5351         }
5352
5353         if (r < 0)
5354                 return r;
5355         if (r != RET_PF_EMULATE)
5356                 return 1;
5357
5358         /*
5359          * Before emulating the instruction, check if the error code
5360          * was due to a RO violation while translating the guest page.
5361          * This can occur when using nested virtualization with nested
5362          * paging in both guests. If true, we simply unprotect the page
5363          * and resume the guest.
5364          */
5365         if (vcpu->arch.mmu->direct_map &&
5366             (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5367                 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5368                 return 1;
5369         }
5370
5371         /*
5372          * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5373          * optimistically try to just unprotect the page and let the processor
5374          * re-execute the instruction that caused the page fault.  Do not allow
5375          * retrying MMIO emulation, as it's not only pointless but could also
5376          * cause us to enter an infinite loop because the processor will keep
5377          * faulting on the non-existent MMIO address.  Retrying an instruction
5378          * from a nested guest is also pointless and dangerous as we are only
5379          * explicitly shadowing L1's page tables, i.e. unprotecting something
5380          * for L1 isn't going to magically fix whatever issue caused L2 to fail.
5381          */
5382         if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5383                 emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
5384 emulate:
5385         return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5386                                        insn_len);
5387 }
5388 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5389
5390 void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5391                             gva_t gva, hpa_t root_hpa)
5392 {
5393         int i;
5394
5395         /* It's actually a GPA for vcpu->arch.guest_mmu.  */
5396         if (mmu != &vcpu->arch.guest_mmu) {
5397                 /* INVLPG on a non-canonical address is a NOP according to the SDM.  */
5398                 if (is_noncanonical_address(gva, vcpu))
5399                         return;
5400
5401                 static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5402         }
5403
5404         if (!mmu->invlpg)
5405                 return;
5406
5407         if (root_hpa == INVALID_PAGE) {
5408                 mmu->invlpg(vcpu, gva, mmu->root.hpa);
5409
5410                 /*
5411                  * INVLPG is required to invalidate any global mappings for the VA,
5412                  * irrespective of PCID.  Since it would take roughly the same amount
5413                  * of work to determine whether any of the prev_root mappings of the VA
5414                  * is marked global as it would to just sync it blindly, we might as
5415                  * well always sync it.
5416                  *
5417                  * Mappings not reachable via the current cr3 or the prev_roots will be
5418                  * synced when switching to that cr3, so nothing needs to be done here
5419                  * for them.
5420                  */
5421                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5422                         if (VALID_PAGE(mmu->prev_roots[i].hpa))
5423                                 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5424         } else {
5425                 mmu->invlpg(vcpu, gva, root_hpa);
5426         }
5427 }
5428
5429 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5430 {
5431         kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
5432         ++vcpu->stat.invlpg;
5433 }
5434 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5435
5436
5437 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5438 {
5439         struct kvm_mmu *mmu = vcpu->arch.mmu;
5440         bool tlb_flush = false;
5441         uint i;
5442
5443         if (pcid == kvm_get_active_pcid(vcpu)) {
5444                 mmu->invlpg(vcpu, gva, mmu->root.hpa);
5445                 tlb_flush = true;
5446         }
5447
5448         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5449                 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5450                     pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
5451                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5452                         tlb_flush = true;
5453                 }
5454         }
5455
5456         if (tlb_flush)
5457                 static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5458
5459         ++vcpu->stat.invlpg;
5460
5461         /*
5462          * Mappings not reachable via the current cr3 or the prev_roots will be
5463          * synced when switching to that cr3, so nothing needs to be done here
5464          * for them.
5465          */
5466 }
5467
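/*
 * Record the vendor module's TDP configuration: whether TDP is enabled, the
 * forced and maximum TDP root levels, and the largest huge page size KVM's
 * MMU may create.
 */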
5468 void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
5469                        int tdp_max_root_level, int tdp_huge_page_level)
5470 {
5471         tdp_enabled = enable_tdp;
5472         tdp_root_level = tdp_forced_root_level;
5473         max_tdp_level = tdp_max_root_level;
5474
5475         /*
5476          * max_huge_page_level reflects KVM's MMU capabilities irrespective
5477          * of kernel support, e.g. KVM may be capable of using 1GB pages when
5478          * the kernel is not.  But, KVM never creates a page size greater than
5479          * what is used by the kernel for any given HVA, i.e. the kernel's
5480          * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
5481          */
5482         if (tdp_enabled)
5483                 max_huge_page_level = tdp_huge_page_level;
5484         else if (boot_cpu_has(X86_FEATURE_GBPAGES))
5485                 max_huge_page_level = PG_LEVEL_1G;
5486         else
5487                 max_huge_page_level = PG_LEVEL_2M;
5488 }
5489 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
5490
5491 /* The return value indicates if tlb flush on all vcpus is needed. */
5492 typedef bool (*slot_level_handler) (struct kvm *kvm,
5493                                     struct kvm_rmap_head *rmap_head,
5494                                     const struct kvm_memory_slot *slot);
5495
5496 /* The caller should hold mmu-lock before calling this function. */
5497 static __always_inline bool
5498 slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5499                         slot_level_handler fn, int start_level, int end_level,
5500                         gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
5501                         bool flush)
5502 {
5503         struct slot_rmap_walk_iterator iterator;
5504
5505         for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5506                         end_gfn, &iterator) {
5507                 if (iterator.rmap)
5508                         flush |= fn(kvm, iterator.rmap, memslot);
5509
5510                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
5511                         if (flush && flush_on_yield) {
5512                                 kvm_flush_remote_tlbs_with_address(kvm,
5513                                                 start_gfn,
5514                                                 iterator.gfn - start_gfn + 1);
5515                                 flush = false;
5516                         }
5517                         cond_resched_rwlock_write(&kvm->mmu_lock);
5518                 }
5519         }
5520
5521         return flush;
5522 }
5523
5524 static __always_inline bool
5525 slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5526                   slot_level_handler fn, int start_level, int end_level,
5527                   bool flush_on_yield)
5528 {
5529         return slot_handle_level_range(kvm, memslot, fn, start_level,
5530                         end_level, memslot->base_gfn,
5531                         memslot->base_gfn + memslot->npages - 1,
5532                         flush_on_yield, false);
5533 }
5534
5535 static __always_inline bool
5536 slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5537                      slot_level_handler fn, bool flush_on_yield)
5538 {
5539         return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
5540                                  PG_LEVEL_4K, flush_on_yield);
5541 }
5542
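/*
 * Free the per-MMU pae_root, pml4_root and pml5_root pages, re-encrypting
 * the PAE root first if shadow paging had decrypted it.
 */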
5543 static void free_mmu_pages(struct kvm_mmu *mmu)
5544 {
5545         if (!tdp_enabled && mmu->pae_root)
5546                 set_memory_encrypted((unsigned long)mmu->pae_root, 1);
5547         free_page((unsigned long)mmu->pae_root);
5548         free_page((unsigned long)mmu->pml4_root);
5549         free_page((unsigned long)mmu->pml5_root);
5550 }
5551
5552 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5553 {
5554         struct page *page;
5555         int i;
5556
5557         mmu->root.hpa = INVALID_PAGE;
5558         mmu->root.pgd = 0;
5559         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5560                 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5561
5562         /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
5563         if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
5564                 return 0;
5565
5566         /*
5567          * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5568          * while the PDP table is a per-vCPU construct that's allocated at MMU
5569          * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
5570          * x86_64.  Therefore we need to allocate the PDP table in the first
5571          * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
5572          * generally doesn't use PAE paging and can skip allocating the PDP
5573          * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
5574          * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
5575          * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
5576          */
5577         if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5578                 return 0;
5579
5580         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5581         if (!page)
5582                 return -ENOMEM;
5583
5584         mmu->pae_root = page_address(page);
5585
5586         /*
5587          * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
5588          * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
5589          * that KVM's writes and the CPU's reads get along.  Note, this is
5590          * only necessary when using shadow paging, as 64-bit NPT can get at
5591          * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
5592          * by 32-bit kernels (when KVM itself uses 32-bit NPT).
5593          */
5594         if (!tdp_enabled)
5595                 set_memory_decrypted((unsigned long)mmu->pae_root, 1);
5596         else
5597                 WARN_ON_ONCE(shadow_me_mask);
5598
5599         for (i = 0; i < 4; ++i)
5600                 mmu->pae_root[i] = INVALID_PAE_ROOT;
5601
5602         return 0;
5603 }
5604
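/*
 * Allocate the per-vCPU MMU state: the pte_list_desc, page-header and
 * shadow-page memory caches, plus the root and guest MMU contexts.
 */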
5605 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5606 {
5607         int ret;
5608
5609         vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
5610         vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
5611
5612         vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
5613         vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
5614
5615         vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
5616
5617         vcpu->arch.mmu = &vcpu->arch.root_mmu;
5618         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5619
5620         ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
5621         if (ret)
5622                 return ret;
5623
5624         ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
5625         if (ret)
5626                 goto fail_allocate_root;
5627
5628         return ret;
5629  fail_allocate_root:
5630         free_mmu_pages(&vcpu->arch.guest_mmu);
5631         return ret;
5632 }
5633
5634 #define BATCH_ZAP_PAGES 10
5635 static void kvm_zap_obsolete_pages(struct kvm *kvm)
5636 {
5637         struct kvm_mmu_page *sp, *node;
5638         int nr_zapped, batch = 0;
5639
5640 restart:
5641         list_for_each_entry_safe_reverse(sp, node,
5642               &kvm->arch.active_mmu_pages, link) {
5643                 /*
5644                  * No obsolete valid page exists before a newly created page
5645                  * since active_mmu_pages is a FIFO list.
5646                  */
5647                 if (!is_obsolete_sp(kvm, sp))
5648                         break;
5649
5650                 /*
5651                  * Invalid pages should never land back on the list of active
5652                  * pages.  Skip the bogus page, otherwise we'll get stuck in an
5653                  * infinite loop if the page gets put back on the list (again).
5654                  */
5655                 if (WARN_ON(sp->role.invalid))
5656                         continue;
5657
5658                 /*
5659                  * No need to flush the TLB since we're only zapping shadow
5660                  * pages with an obsolete generation number and all vCPUS have
5661                  * loaded a new root, i.e. the shadow pages being zapped cannot
5662                  * be in active use by the guest.
5663                  */
5664                 if (batch >= BATCH_ZAP_PAGES &&
5665                     cond_resched_rwlock_write(&kvm->mmu_lock)) {
5666                         batch = 0;
5667                         goto restart;
5668                 }
5669
5670                 if (__kvm_mmu_prepare_zap_page(kvm, sp,
5671                                 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
5672                         batch += nr_zapped;
5673                         goto restart;
5674                 }
5675         }
5676
5677         /*
5678          * Kick all vCPUs (via remote TLB flush) before freeing the page tables
5679          * to ensure KVM is not in the middle of a lockless shadow page table
5680          * walk, which may reference the pages.  The remote TLB flush itself is
5681          * not required and is simply a convenient way to kick vCPUs as needed.
5682          * KVM performs a local TLB flush when allocating a new root (see
5683          * kvm_mmu_load()), and the reload in the caller ensures no vCPUs are
5684          * running with an obsolete MMU.
5685          */
5686         kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5687 }
5688
5689 /*
5690  * Fast invalidate all shadow pages and use lock-break technique
5691  * to zap obsolete pages.
5692  *
5693  * This is required when a memslot is being deleted or the VM is being
5694  * destroyed; in these cases, we must ensure that the KVM MMU does not
5695  * use any resource of the slot being deleted (or of any slot) after
5696  * this function returns.
5697  */
5698 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5699 {
5700         lockdep_assert_held(&kvm->slots_lock);
5701
5702         write_lock(&kvm->mmu_lock);
5703         trace_kvm_mmu_zap_all_fast(kvm);
5704
5705         /*
5706          * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
5707          * held for the entire duration of zapping obsolete pages, it's
5708          * impossible for there to be multiple invalid generations associated
5709          * with *valid* shadow pages at any given time, i.e. there is exactly
5710          * one valid generation and (at most) one invalid generation.
5711          */
5712         kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5713
5714         /*
5715          * In order to ensure all vCPUs drop their soon-to-be invalid roots,
5716          * invalidating TDP MMU roots must be done while holding mmu_lock for
5717          * write and in the same critical section as making the reload request,
5718          * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
5719          */
5720         if (is_tdp_mmu_enabled(kvm))
5721                 kvm_tdp_mmu_invalidate_all_roots(kvm);
5722
5723         /*
5724          * Notify all vCPUs to reload their shadow page tables and flush their
5725          * TLBs.  All vCPUs will then switch to the new shadow page table with
5726          * the new mmu_valid_gen.
5727          *
5728          * Note: we need to do this under the protection of mmu_lock,
5729          * otherwise a vCPU could purge a shadow page but miss the TLB flush.
5730          */
5731         kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
5732
5733         kvm_zap_obsolete_pages(kvm);
5734
5735         write_unlock(&kvm->mmu_lock);
5736
5737         /*
5738          * Zap the invalidated TDP MMU roots; all SPTEs must be dropped before
5739          * returning to the caller, e.g. if the zap is in response to a memslot
5740          * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
5741          * associated with the deleted memslot once the update completes.
5742          * Deferring the zap until the final reference to the root is put would
5743          * lead to use-after-free.
5744          */
5745         if (is_tdp_mmu_enabled(kvm))
5746                 kvm_tdp_mmu_zap_invalidated_roots(kvm);
5747 }
5748
5749 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5750 {
5751         return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5752 }
5753
5754 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5755                         struct kvm_memory_slot *slot,
5756                         struct kvm_page_track_notifier_node *node)
5757 {
5758         kvm_mmu_zap_all_fast(kvm);
5759 }
5760
5761 int kvm_mmu_init_vm(struct kvm *kvm)
5762 {
5763         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5764         int r;
5765
5766         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5767         INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
5768         INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
5769         spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
5770
5771         r = kvm_mmu_init_tdp_mmu(kvm);
5772         if (r < 0)
5773                 return r;
5774
5775         node->track_write = kvm_mmu_pte_write;
5776         node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5777         kvm_page_track_register_notifier(kvm, node);
5778         return 0;
5779 }
5780
5781 void kvm_mmu_uninit_vm(struct kvm *kvm)
5782 {
5783         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5784
5785         kvm_page_track_unregister_notifier(kvm, node);
5786
5787         kvm_mmu_uninit_tdp_mmu(kvm);
5788 }
5789
5790 static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5791 {
5792         const struct kvm_memory_slot *memslot;
5793         struct kvm_memslots *slots;
5794         struct kvm_memslot_iter iter;
5795         bool flush = false;
5796         gfn_t start, end;
5797         int i;
5798
5799         if (!kvm_memslots_have_rmaps(kvm))
5800                 return flush;
5801
5802         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5803                 slots = __kvm_memslots(kvm, i);
5804
5805                 kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
5806                         memslot = iter.slot;
5807                         start = max(gfn_start, memslot->base_gfn);
5808                         end = min(gfn_end, memslot->base_gfn + memslot->npages);
5809                         if (WARN_ON_ONCE(start >= end))
5810                                 continue;
5811
5812                         flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5814                                                         PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
5815                                                         start, end - 1, true, flush);
5816                 }
5817         }
5818
5819         return flush;
5820 }
5821
5822 /*
5823  * Invalidate (zap) SPTEs that cover GFNs from gfn_start up to gfn_end
5824  * (exclusive).
5825  */
5826 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5827 {
5828         bool flush;
5829         int i;
5830
5831         if (WARN_ON_ONCE(gfn_end <= gfn_start))
5832                 return;
5833
5834         write_lock(&kvm->mmu_lock);
5835
5836         kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
5837
5838         flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end);
5839
5840         if (is_tdp_mmu_enabled(kvm)) {
5841                 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
5842                         flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
5843                                                       gfn_end, true, flush);
5844         }
5845
5846         if (flush)
5847                 kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
5848                                                    gfn_end - gfn_start);
5849
5850         kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
5851
5852         write_unlock(&kvm->mmu_lock);
5853 }
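
/*
 * Hedged usage sketch (comment only): a caller that needs to drop all
 * mappings for a guest-physical range, e.g. because the effective memory
 * type for that range has changed, would do something along the lines of:
 *
 *	gfn_t start = gpa_start >> PAGE_SHIFT;
 *	gfn_t end   = gpa_end >> PAGE_SHIFT;	// exclusive
 *
 *	kvm_zap_gfn_range(kvm, start, end);
 *
 * The zapped SPTEs are simply rebuilt on demand by subsequent page faults.
 */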
5854
5855 static bool slot_rmap_write_protect(struct kvm *kvm,
5856                                     struct kvm_rmap_head *rmap_head,
5857                                     const struct kvm_memory_slot *slot)
5858 {
5859         return rmap_write_protect(rmap_head, false);
5860 }
5861
5862 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5863                                       const struct kvm_memory_slot *memslot,
5864                                       int start_level)
5865 {
5866         bool flush = false;
5867
5868         if (kvm_memslots_have_rmaps(kvm)) {
5869                 write_lock(&kvm->mmu_lock);
5870                 flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
5871                                           start_level, KVM_MAX_HUGEPAGE_LEVEL,
5872                                           false);
5873                 write_unlock(&kvm->mmu_lock);
5874         }
5875
5876         if (is_tdp_mmu_enabled(kvm)) {
5877                 read_lock(&kvm->mmu_lock);
5878                 flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
5879                 read_unlock(&kvm->mmu_lock);
5880         }
5881
5882         /*
5883          * Flush TLBs if any SPTEs had to be write-protected to ensure that
5884          * guest writes are reflected in the dirty bitmap before the memslot
5885          * update completes, i.e. before enabling dirty logging is visible to
5886          * userspace.
5887          *
5888          * Perform the TLB flush outside the mmu_lock to reduce the amount of
5889          * time the lock is held. However, this does mean that another CPU can
5890          * now grab mmu_lock and encounter a write-protected SPTE while CPUs
5891          * still have a writable mapping for the associated GFN in their TLB.
5892          *
5893          * This is safe but requires KVM to be careful when making decisions
5894          * based on the write-protection status of an SPTE. Specifically, KVM
5895          * also write-protects SPTEs to monitor changes to guest page tables
5896          * during shadow paging, and must guarantee no CPUs can write to those
5897          * pages before the lock is dropped.  As mentioned in the previous
5898          * paragraph, a write-protected SPTE is no guarantee that a CPU cannot
5899          * perform writes.  So to determine if a TLB flush is truly required, KVM
5900          * will clear a separate software-only bit (MMU-writable) and skip the
5901          * flush if-and-only-if this bit was already clear.
5902          *
5903          * See is_writable_pte() for more details.
5904          */
5905         if (flush)
5906                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
5907 }
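
/*
 * Simplified illustration of the flush decision described above (comment
 * only, not the exact helpers): write-protection clears the hardware
 * writable bit, and the software-only MMU-writable bit decides the flush:
 *
 *	if (old_spte & shadow_mmu_writable_mask)
 *		flush = true;	// a CPU may still cache a writable entry
 *
 * If the MMU-writable bit was already clear, the SPTE was already
 * write-protected, no CPU can hold a writable translation for it, and the
 * TLB flush can be safely skipped.  See is_writable_pte() in spte.h.
 */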
5908
5909 /* Must be called with the mmu_lock held in write-mode. */
5910 void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
5911                                    const struct kvm_memory_slot *memslot,
5912                                    u64 start, u64 end,
5913                                    int target_level)
5914 {
5915         if (is_tdp_mmu_enabled(kvm))
5916                 kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
5917                                                  target_level, false);
5918
5919         /*
5920          * A TLB flush is unnecessary at this point for the same reasons as in
5921          * kvm_mmu_slot_try_split_huge_pages().
5922          */
5923 }
5924
5925 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
5926                                         const struct kvm_memory_slot *memslot,
5927                                         int target_level)
5928 {
5929         u64 start = memslot->base_gfn;
5930         u64 end = start + memslot->npages;
5931
5932         if (is_tdp_mmu_enabled(kvm)) {
5933                 read_lock(&kvm->mmu_lock);
5934                 kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
5935                 read_unlock(&kvm->mmu_lock);
5936         }
5937
5938         /*
5939          * No TLB flush is necessary here. KVM will flush TLBs after
5940          * write-protecting and/or clearing dirty on the newly split SPTEs to
5941          * ensure that guest writes are reflected in the dirty log before the
5942          * ioctl to enable dirty logging on this memslot completes. Since the
5943          * split SPTEs retain the write and dirty bits of the huge SPTE, it is
5944          * safe for KVM to decide if a TLB flush is necessary based on the split
5945          * SPTEs.
5946          */
5947 }
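
/*
 * Hedged usage sketch (comment only): when dirty logging is enabled on a
 * memslot and eager page splitting is in use (assuming the eager_page_split
 * module param controls this, as in contemporaneous kernels), the arch code
 * splits huge pages down to 4K before write-protecting them, roughly:
 *
 *	if (READ_ONCE(eager_page_split))
 *		kvm_mmu_slot_try_split_huge_pages(kvm, slot, PG_LEVEL_4K);
 *
 *	kvm_mmu_slot_remove_write_access(kvm, slot, PG_LEVEL_4K);
 *
 * Splitting first avoids taking a vmexit on every huge mapping the first
 * time the guest writes to it after dirty logging is enabled.
 */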
5948
5949 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
5950                                          struct kvm_rmap_head *rmap_head,
5951                                          const struct kvm_memory_slot *slot)
5952 {
5953         u64 *sptep;
5954         struct rmap_iterator iter;
5955         int need_tlb_flush = 0;
5956         kvm_pfn_t pfn;
5957         struct kvm_mmu_page *sp;
5958
5959 restart:
5960         for_each_rmap_spte(rmap_head, &iter, sptep) {
5961                 sp = sptep_to_sp(sptep);
5962                 pfn = spte_to_pfn(*sptep);
5963
5964                 /*
5965                  * Huge page mappings cannot be created for indirect shadow pages,
5966                  * which are found on the last-level rmap (level = 1) when not
5967                  * using TDP; such shadow pages are synced with the guest's page
5968                  * table, and the guest page table uses 4K mappings if the
5969                  * indirect SP has level = 1.
5970                  */
5971                 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
5972                     sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
5973                                                                pfn, PG_LEVEL_NUM)) {
5974                         pte_list_remove(kvm, rmap_head, sptep);
5975
5976                         if (kvm_available_flush_tlb_with_range())
5977                                 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
5978                                         KVM_PAGES_PER_HPAGE(sp->role.level));
5979                         else
5980                                 need_tlb_flush = 1;
5981
5982                         goto restart;
5983                 }
5984         }
5985
5986         return need_tlb_flush;
5987 }
5988
5989 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
5990                                    const struct kvm_memory_slot *slot)
5991 {
5992         if (kvm_memslots_have_rmaps(kvm)) {
5993                 write_lock(&kvm->mmu_lock);
5994                 /*
5995                  * Zap only 4k SPTEs since the legacy MMU only supports dirty
5996                  * logging at a 4k granularity and never creates collapsible
5997                  * 2m SPTEs during dirty logging.
5998                  */
5999                 if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
6000                         kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
6001                 write_unlock(&kvm->mmu_lock);
6002         }
6003
6004         if (is_tdp_mmu_enabled(kvm)) {
6005                 read_lock(&kvm->mmu_lock);
6006                 kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
6007                 read_unlock(&kvm->mmu_lock);
6008         }
6009 }
6010
6011 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
6012                                         const struct kvm_memory_slot *memslot)
6013 {
6014         /*
6015          * All current use cases for flushing the TLBs for a specific memslot
6016          * are related to dirty logging, and many do the TLB flush out of mmu_lock.
6017          * The interactions between the various operations on a memslot must be
6018          * serialized by slots_lock to ensure the TLB flush from one operation
6019          * is observed by any other operation on the same memslot.
6020          */
6021         lockdep_assert_held(&kvm->slots_lock);
6022         kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6023                                            memslot->npages);
6024 }
6025
6026 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6027                                    const struct kvm_memory_slot *memslot)
6028 {
6029         bool flush = false;
6030
6031         if (kvm_memslots_have_rmaps(kvm)) {
6032                 write_lock(&kvm->mmu_lock);
6033                 /*
6034                  * Clear dirty bits only on 4k SPTEs since the legacy MMU only
6035                  * supports dirty logging at a 4k granularity.
6036                  */
6037                 flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
6038                 write_unlock(&kvm->mmu_lock);
6039         }
6040
6041         if (is_tdp_mmu_enabled(kvm)) {
6042                 read_lock(&kvm->mmu_lock);
6043                 flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
6044                 read_unlock(&kvm->mmu_lock);
6045         }
6046
6047         /*
6048          * It's also safe to flush TLBs outside of mmu_lock here, as currently
6049          * this function is only used for dirty logging, in which case flushing
6050          * the TLB outside of mmu_lock also guarantees no dirty pages will be
6051          * lost from the dirty_bitmap.
6052          */
6053         if (flush)
6054                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
6055 }
6056
6057 void kvm_mmu_zap_all(struct kvm *kvm)
6058 {
6059         struct kvm_mmu_page *sp, *node;
6060         LIST_HEAD(invalid_list);
6061         int ign;
6062
6063         write_lock(&kvm->mmu_lock);
6064 restart:
6065         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6066                 if (WARN_ON(sp->role.invalid))
6067                         continue;
6068                 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6069                         goto restart;
6070                 if (cond_resched_rwlock_write(&kvm->mmu_lock))
6071                         goto restart;
6072         }
6073
6074         kvm_mmu_commit_zap_page(kvm, &invalid_list);
6075
6076         if (is_tdp_mmu_enabled(kvm))
6077                 kvm_tdp_mmu_zap_all(kvm);
6078
6079         write_unlock(&kvm->mmu_lock);
6080 }
6081
6082 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6083 {
6084         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6085
6086         gen &= MMIO_SPTE_GEN_MASK;
6087
6088         /*
6089          * Generation numbers are incremented in multiples of the number of
6090          * address spaces in order to provide unique generations across all
6091          * address spaces.  Strip what is effectively the address space
6092          * modifier prior to checking for a wrap of the MMIO generation so
6093          * that a wrap in any address space is detected.
6094          */
6095         gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6096
6097         /*
6098          * The very rare case: if the MMIO generation number has wrapped,
6099          * zap all shadow pages.
6100          */
6101         if (unlikely(gen == 0)) {
6102                 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
6103                 kvm_mmu_zap_all_fast(kvm);
6104         }
6105 }
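
/*
 * Worked example (comment only), assuming KVM_ADDRESS_SPACE_NUM == 2 as on
 * x86 with SMM: the low bit of the generation acts as the address space
 * modifier, so "gen &= ~1" folds both address spaces together.  A wrap of
 * the MMIO generation in either address space then shows up here as
 * gen == 0 and forces a fast zap of all shadow pages, ensuring stale MMIO
 * SPTEs from the previous generation can never be consumed again.
 */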
6106
6107 static unsigned long
6108 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
6109 {
6110         struct kvm *kvm;
6111         int nr_to_scan = sc->nr_to_scan;
6112         unsigned long freed = 0;
6113
6114         mutex_lock(&kvm_lock);
6115
6116         list_for_each_entry(kvm, &vm_list, vm_list) {
6117                 int idx;
6118                 LIST_HEAD(invalid_list);
6119
6120                 /*
6121                  * Never scan more than sc->nr_to_scan VM instances.
6122                  * In practice this condition is never hit, since at most one VM
6123                  * is shrunk per invocation and it is very unlikely to see
6124                  * !n_used_mmu_pages so many times.
6125                  */
6126                 if (!nr_to_scan--)
6127                         break;
6128                 /*
6129                  * n_used_mmu_pages is accessed without holding kvm->mmu_lock
6130                  * here.  We may skip a VM instance erroneously, but we do not
6131                  * want to shrink a VM that has only just started to populate
6132                  * its MMU anyway.
6133                  */
6134                 if (!kvm->arch.n_used_mmu_pages &&
6135                     !kvm_has_zapped_obsolete_pages(kvm))
6136                         continue;
6137
6138                 idx = srcu_read_lock(&kvm->srcu);
6139                 write_lock(&kvm->mmu_lock);
6140
6141                 if (kvm_has_zapped_obsolete_pages(kvm)) {
6142                         kvm_mmu_commit_zap_page(kvm,
6143                               &kvm->arch.zapped_obsolete_pages);
6144                         goto unlock;
6145                 }
6146
6147                 freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
6148
6149 unlock:
6150                 write_unlock(&kvm->mmu_lock);
6151                 srcu_read_unlock(&kvm->srcu, idx);
6152
6153                 /*
6154                  * unfair on small ones
6155                  * per-vm shrinkers cry out
6156                  * sadness comes quickly
6157                  */
6158                 list_move_tail(&kvm->vm_list, &vm_list);
6159                 break;
6160         }
6161
6162         mutex_unlock(&kvm_lock);
6163         return freed;
6164 }
6165
6166 static unsigned long
6167 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6168 {
6169         return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6170 }
6171
6172 static struct shrinker mmu_shrinker = {
6173         .count_objects = mmu_shrink_count,
6174         .scan_objects = mmu_shrink_scan,
6175         .seeks = DEFAULT_SEEKS * 10,
6176 };
6177
6178 static void mmu_destroy_caches(void)
6179 {
6180         kmem_cache_destroy(pte_list_desc_cache);
6181         kmem_cache_destroy(mmu_page_header_cache);
6182 }
6183
6184 static bool get_nx_auto_mode(void)
6185 {
6186         /* Return true when the CPU has the bug and mitigations are on. */
6187         return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6188 }
6189
6190 static void __set_nx_huge_pages(bool val)
6191 {
6192         nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6193 }
6194
6195 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6196 {
6197         bool old_val = nx_huge_pages;
6198         bool new_val;
6199
6200         /* In "auto" mode, deploy the workaround only if the CPU has the bug. */
6201         if (sysfs_streq(val, "off"))
6202                 new_val = 0;
6203         else if (sysfs_streq(val, "force"))
6204                 new_val = 1;
6205         else if (sysfs_streq(val, "auto"))
6206                 new_val = get_nx_auto_mode();
6207         else if (strtobool(val, &new_val) < 0)
6208                 return -EINVAL;
6209
6210         __set_nx_huge_pages(new_val);
6211
6212         if (new_val != old_val) {
6213                 struct kvm *kvm;
6214
6215                 mutex_lock(&kvm_lock);
6216
6217                 list_for_each_entry(kvm, &vm_list, vm_list) {
6218                         mutex_lock(&kvm->slots_lock);
6219                         kvm_mmu_zap_all_fast(kvm);
6220                         mutex_unlock(&kvm->slots_lock);
6221
6222                         wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6223                 }
6224                 mutex_unlock(&kvm_lock);
6225         }
6226
6227         return 0;
6228 }
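
/*
 * Hedged usage example (comment only): the parameter accepts the usual
 * boolean spellings plus "off"/"force"/"auto", e.g. from userspace:
 *
 *	echo force > /sys/module/kvm/parameters/nx_huge_pages
 *
 * forces the mitigation on; the fast zap above then rebuilds every VM's
 * shadow/TDP pages so that existing executable huge pages honor the new
 * setting, and the recovery threads are woken to pick up the change.
 */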
6229
6230 /*
6231  * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
6232  * its default value of -1 is technically undefined behavior for a boolean.
6233  */
6234 void kvm_mmu_x86_module_init(void)
6235 {
6236         if (nx_huge_pages == -1)
6237                 __set_nx_huge_pages(get_nx_auto_mode());
6238 }
6239
6240 /*
6241  * The bulk of the MMU initialization is deferred until the vendor module is
6242  * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
6243  * to be reset when a potentially different vendor module is loaded.
6244  */
6245 int kvm_mmu_vendor_module_init(void)
6246 {
6247         int ret = -ENOMEM;
6248
6249         /*
6250          * MMU roles use union aliasing, which is, generally speaking,
6251          * undefined behavior.  However, we supposedly know how compilers behave
6252          * and the current status quo is unlikely to change.  The guards below
6253          * are there to let us know if that assumption ever becomes false.
6254          */
6255         BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6256         BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6257         BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
6258
6259         kvm_mmu_reset_all_pte_masks();
6260
6261         pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6262                                             sizeof(struct pte_list_desc),
6263                                             0, SLAB_ACCOUNT, NULL);
6264         if (!pte_list_desc_cache)
6265                 goto out;
6266
6267         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6268                                                   sizeof(struct kvm_mmu_page),
6269                                                   0, SLAB_ACCOUNT, NULL);
6270         if (!mmu_page_header_cache)
6271                 goto out;
6272
6273         if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6274                 goto out;
6275
6276         ret = register_shrinker(&mmu_shrinker);
6277         if (ret)
6278                 goto out;
6279
6280         return 0;
6281
6282 out:
6283         mmu_destroy_caches();
6284         return ret;
6285 }
6286
6287 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6288 {
6289         kvm_mmu_unload(vcpu);
6290         free_mmu_pages(&vcpu->arch.root_mmu);
6291         free_mmu_pages(&vcpu->arch.guest_mmu);
6292         mmu_free_memory_caches(vcpu);
6293 }
6294
6295 void kvm_mmu_vendor_module_exit(void)
6296 {
6297         mmu_destroy_caches();
6298         percpu_counter_destroy(&kvm_total_used_mmu_pages);
6299         unregister_shrinker(&mmu_shrinker);
6300 }
6301
6302 /*
6303  * Calculate the effective recovery period, accounting for '0' meaning "let KVM
6304  * select a halving time of 1 hour".  Returns true if recovery is enabled.
6305  */
6306 static bool calc_nx_huge_pages_recovery_period(uint *period)
6307 {
6308         /*
6309          * Use READ_ONCE to get the params; this may be called outside of the
6310          * param setters, e.g. by the kthread to compute its next timeout.
6311          */
6312         bool enabled = READ_ONCE(nx_huge_pages);
6313         uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6314
6315         if (!enabled || !ratio)
6316                 return false;
6317
6318         *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
6319         if (!*period) {
6320                 /* Make sure the period is not less than one second.  */
6321                 ratio = min(ratio, 3600u);
6322                 *period = 60 * 60 * 1000 / ratio;
6323         }
6324         return true;
6325 }
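
/*
 * Worked example (comment only): with the default nx_huge_pages_recovery_ratio
 * of 60 and nx_huge_pages_recovery_period_ms of 0, the derived period is
 * 60 * 60 * 1000 / 60 = 60000 ms, i.e. a recovery pass runs every minute.
 * Clamping the ratio to 3600 bounds the derived period below at 1000 ms.
 */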
6326
6327 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
6328 {
6329         bool was_recovery_enabled, is_recovery_enabled;
6330         uint old_period, new_period;
6331         int err;
6332
6333         was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
6334
6335         err = param_set_uint(val, kp);
6336         if (err)
6337                 return err;
6338
6339         is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
6340
6341         if (is_recovery_enabled &&
6342             (!was_recovery_enabled || old_period > new_period)) {
6343                 struct kvm *kvm;
6344
6345                 mutex_lock(&kvm_lock);
6346
6347                 list_for_each_entry(kvm, &vm_list, vm_list)
6348                         wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6349
6350                 mutex_unlock(&kvm_lock);
6351         }
6352
6353         return err;
6354 }
6355
6356 static void kvm_recover_nx_lpages(struct kvm *kvm)
6357 {
6358         unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
6359         int rcu_idx;
6360         struct kvm_mmu_page *sp;
6361         unsigned int ratio;
6362         LIST_HEAD(invalid_list);
6363         bool flush = false;
6364         ulong to_zap;
6365
6366         rcu_idx = srcu_read_lock(&kvm->srcu);
6367         write_lock(&kvm->mmu_lock);
6368
6369         /*
6370          * Zapping TDP MMU shadow pages, including the remote TLB flush, must
6371          * be done under RCU protection, because the pages are freed via RCU
6372          * callback.
6373          */
6374         rcu_read_lock();
6375
6376         ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6377         to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
6378         for ( ; to_zap; --to_zap) {
6379                 if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
6380                         break;
6381
6382                 /*
6383                  * We use a separate list instead of just using active_mmu_pages
6384                  * because the number of lpage_disallowed pages is expected to
6385                  * be relatively small compared to the total.
6386                  */
6387                 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6388                                       struct kvm_mmu_page,
6389                                       lpage_disallowed_link);
6390                 WARN_ON_ONCE(!sp->lpage_disallowed);
6391                 if (is_tdp_mmu_page(sp)) {
6392                         flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
6393                 } else {
6394                         kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6395                         WARN_ON_ONCE(sp->lpage_disallowed);
6396                 }
6397
6398                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
6399                         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6400                         rcu_read_unlock();
6401
6402                         cond_resched_rwlock_write(&kvm->mmu_lock);
6403                         flush = false;
6404
6405                         rcu_read_lock();
6406                 }
6407         }
6408         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6409
6410         rcu_read_unlock();
6411
6412         write_unlock(&kvm->mmu_lock);
6413         srcu_read_unlock(&kvm->srcu, rcu_idx);
6414 }
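
/*
 * Worked example (comment only): with nx_huge_pages_recovery_ratio == 60 and
 * kvm->stat.nx_lpage_splits == 1000, a single recovery pass zaps
 * DIV_ROUND_UP(1000, 60) == 17 of the NX-split shadow pages, yielding
 * mmu_lock between zaps whenever there is contention or need_resched().
 */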
6415
6416 static long get_nx_lpage_recovery_timeout(u64 start_time)
6417 {
6418         bool enabled;
6419         uint period;
6420
6421         enabled = calc_nx_huge_pages_recovery_period(&period);
6422
6423         return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
6424                        : MAX_SCHEDULE_TIMEOUT;
6425 }
6426
6427 static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6428 {
6429         u64 start_time;
6430         long remaining_time;
6431
6432         while (true) {
6433                 start_time = get_jiffies_64();
6434                 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6435
6436                 set_current_state(TASK_INTERRUPTIBLE);
6437                 while (!kthread_should_stop() && remaining_time > 0) {
6438                         schedule_timeout(remaining_time);
6439                         remaining_time = get_nx_lpage_recovery_timeout(start_time);
6440                         set_current_state(TASK_INTERRUPTIBLE);
6441                 }
6442
6443                 set_current_state(TASK_RUNNING);
6444
6445                 if (kthread_should_stop())
6446                         return 0;
6447
6448                 kvm_recover_nx_lpages(kvm);
6449         }
6450 }
6451
6452 int kvm_mmu_post_init_vm(struct kvm *kvm)
6453 {
6454         int err;
6455
6456         err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6457                                           "kvm-nx-lpage-recovery",
6458                                           &kvm->arch.nx_lpage_recovery_thread);
6459         if (!err)
6460                 kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6461
6462         return err;
6463 }
6464
6465 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6466 {
6467         if (kvm->arch.nx_lpage_recovery_thread)
6468                 kthread_stop(kvm->arch.nx_lpage_recovery_thread);
6469 }