[linux-2.6-microblaze.git] arch/x86/kvm/mmu/mmu.c (at commit "Merge branch 'kvm-amd-pmu-fixes' into HEAD")
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * This module enables machines with Intel VT-x extensions to run virtual
6  * machines without emulation or binary translation.
7  *
8  * MMU support
9  *
10  * Copyright (C) 2006 Qumranet, Inc.
11  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12  *
13  * Authors:
14  *   Yaniv Kamay  <yaniv@qumranet.com>
15  *   Avi Kivity   <avi@qumranet.com>
16  */
17
18 #include "irq.h"
19 #include "ioapic.h"
20 #include "mmu.h"
21 #include "mmu_internal.h"
22 #include "tdp_mmu.h"
23 #include "x86.h"
24 #include "kvm_cache_regs.h"
25 #include "kvm_emulate.h"
26 #include "cpuid.h"
27 #include "spte.h"
28
29 #include <linux/kvm_host.h>
30 #include <linux/types.h>
31 #include <linux/string.h>
32 #include <linux/mm.h>
33 #include <linux/highmem.h>
34 #include <linux/moduleparam.h>
35 #include <linux/export.h>
36 #include <linux/swap.h>
37 #include <linux/hugetlb.h>
38 #include <linux/compiler.h>
39 #include <linux/srcu.h>
40 #include <linux/slab.h>
41 #include <linux/sched/signal.h>
42 #include <linux/uaccess.h>
43 #include <linux/hash.h>
44 #include <linux/kern_levels.h>
45 #include <linux/kthread.h>
46
47 #include <asm/page.h>
48 #include <asm/memtype.h>
49 #include <asm/cmpxchg.h>
50 #include <asm/io.h>
51 #include <asm/set_memory.h>
52 #include <asm/vmx.h>
53 #include <asm/kvm_page_track.h>
54 #include "trace.h"
55
56 #include "paging.h"
57
58 extern bool itlb_multihit_kvm_mitigation;
59
60 int __read_mostly nx_huge_pages = -1;
61 static uint __read_mostly nx_huge_pages_recovery_period_ms;
62 #ifdef CONFIG_PREEMPT_RT
63 /* Recovery can cause latency spikes; disable it for PREEMPT_RT.  */
64 static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
65 #else
66 static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
67 #endif
68
69 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
70 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
71
72 static const struct kernel_param_ops nx_huge_pages_ops = {
73         .set = set_nx_huge_pages,
74         .get = param_get_bool,
75 };
76
77 static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
78         .set = set_nx_huge_pages_recovery_param,
79         .get = param_get_uint,
80 };
81
82 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
83 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
84 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
85                 &nx_huge_pages_recovery_ratio, 0644);
86 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
87 module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
88                 &nx_huge_pages_recovery_period_ms, 0644);
89 __MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
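/*
 * These parameters are typically exposed under /sys/module/kvm/parameters/
 * (e.g. nx_huge_pages, nx_huge_pages_recovery_ratio) and can also be set on
 * the kernel command line, e.g. "kvm.nx_huge_pages=off".
 */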
90
91 static bool __read_mostly force_flush_and_sync_on_reuse;
92 module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
93
94 /*
95  * Setting this variable to true enables Two-Dimensional Paging (TDP),
96  * where the hardware walks two page tables:
97  * 1. the guest-virtual to guest-physical translation, and
98  * 2. while doing 1., the guest-physical to host-physical translation.
99  * If the hardware supports this, shadow paging is not needed.
100  */
101 bool tdp_enabled = false;
102
103 static int max_huge_page_level __read_mostly;
104 static int tdp_root_level __read_mostly;
105 static int max_tdp_level __read_mostly;
106
107 #ifdef MMU_DEBUG
108 bool dbg = 0;
109 module_param(dbg, bool, 0644);
110 #endif
111
112 #define PTE_PREFETCH_NUM                8
113
114 #define PT32_LEVEL_BITS 10
115
116 #define PT32_LEVEL_SHIFT(level) \
117                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
118
119 #define PT32_LVL_OFFSET_MASK(level) \
120         (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
121                                                 * PT32_LEVEL_BITS))) - 1))
122
123 #define PT32_INDEX(address, level)\
124         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
125
126
127 #define PT32_BASE_ADDR_MASK PAGE_MASK
128 #define PT32_DIR_BASE_ADDR_MASK \
129         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
130 #define PT32_LVL_ADDR_MASK(level) \
131         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
132                                             * PT32_LEVEL_BITS))) - 1))
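/*
 * Worked example for the 32-bit paging macros above: with PAGE_SHIFT == 12
 * and PT32_LEVEL_BITS == 10, PT32_LEVEL_SHIFT(1) == 12 and
 * PT32_LEVEL_SHIFT(2) == 22, so PT32_INDEX() extracts the 10 index bits of
 * the guest PTE (level 1) or PDE (level 2) from a guest virtual address.
 */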
133
134 #include <trace/events/kvm.h>
135
136 /* make pte_list_desc fit well in cache lines */
137 #define PTE_LIST_EXT 14
138
139 /*
140  * Slight optimization of the cacheline layout: putting `more' and `spte_count'
141  * at the start means that accessing the descriptor touches only a single
142  * cacheline both when it is full (entries == PTE_LIST_EXT) and when entries <= 6.
143  */
144 struct pte_list_desc {
145         struct pte_list_desc *more;
146         /*
147          * Number of entries stored in this pte_list_desc.  It doesn't need to
148          * be a u64, but one is used for easier alignment.  A value of PTE_LIST_EXT means full.
149          */
150         u64 spte_count;
151         u64 *sptes[PTE_LIST_EXT];
152 };
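/*
 * On a 64-bit build this layout is 8 (more) + 8 (spte_count) + 14 * 8
 * (sptes) = 128 bytes, i.e. exactly two 64-byte cachelines, with the hot
 * 'more' and 'spte_count' fields in the first one.
 */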
153
154 struct kvm_shadow_walk_iterator {
155         u64 addr;
156         hpa_t shadow_addr;
157         u64 *sptep;
158         int level;
159         unsigned index;
160 };
161
162 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
163         for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
164                                          (_root), (_addr));                \
165              shadow_walk_okay(&(_walker));                                 \
166              shadow_walk_next(&(_walker)))
167
168 #define for_each_shadow_entry(_vcpu, _addr, _walker)            \
169         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
170              shadow_walk_okay(&(_walker));                      \
171              shadow_walk_next(&(_walker)))
172
173 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)     \
174         for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
175              shadow_walk_okay(&(_walker)) &&                            \
176                 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
177              __shadow_walk_next(&(_walker), spte))
178
179 static struct kmem_cache *pte_list_desc_cache;
180 struct kmem_cache *mmu_page_header_cache;
181 static struct percpu_counter kvm_total_used_mmu_pages;
182
183 static void mmu_spte_set(u64 *sptep, u64 spte);
184
185 struct kvm_mmu_role_regs {
186         const unsigned long cr0;
187         const unsigned long cr4;
188         const u64 efer;
189 };
190
191 #define CREATE_TRACE_POINTS
192 #include "mmutrace.h"
193
194 /*
195  * Yes, lots of underscores.  They're a hint that you probably shouldn't be
196  * reading from the role_regs.  Once the mmu_role is constructed, it becomes
197  * the single source of truth for the MMU's state.
198  */
199 #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)                   \
200 static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
201 {                                                                       \
202         return !!(regs->reg & flag);                                    \
203 }
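/*
 * For example, BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG) below
 * generates a helper ____is_cr0_pg(regs) that returns true iff X86_CR0_PG
 * is set in regs->cr0.
 */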
204 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
205 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
206 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
207 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
208 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
209 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
210 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
211 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
212 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
213 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
214
215 /*
216  * The MMU itself (with a valid role) is the single source of truth for the
217  * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
218  * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
219  * and the vCPU may be incorrect/irrelevant.
220  */
221 #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)         \
222 static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)        \
223 {                                                               \
224         return !!(mmu->mmu_role. base_or_ext . reg##_##name);   \
225 }
226 BUILD_MMU_ROLE_ACCESSOR(ext,  cr0, pg);
227 BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
228 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
229 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pae);
230 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
231 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
232 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
233 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
234 BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
235
236 static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
237 {
238         struct kvm_mmu_role_regs regs = {
239                 .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
240                 .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
241                 .efer = vcpu->arch.efer,
242         };
243
244         return regs;
245 }
246
247 static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs)
248 {
249         if (!____is_cr0_pg(regs))
250                 return 0;
251         else if (____is_efer_lma(regs))
252                 return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL :
253                                                PT64_ROOT_4LEVEL;
254         else if (____is_cr4_pae(regs))
255                 return PT32E_ROOT_LEVEL;
256         else
257                 return PT32_ROOT_LEVEL;
258 }
259
260 static inline bool kvm_available_flush_tlb_with_range(void)
261 {
262         return kvm_x86_ops.tlb_remote_flush_with_range;
263 }
264
265 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
266                 struct kvm_tlb_range *range)
267 {
268         int ret = -ENOTSUPP;
269
270         if (range && kvm_x86_ops.tlb_remote_flush_with_range)
271                 ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
272
273         if (ret)
274                 kvm_flush_remote_tlbs(kvm);
275 }
276
277 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
278                 u64 start_gfn, u64 pages)
279 {
280         struct kvm_tlb_range range;
281
282         range.start_gfn = start_gfn;
283         range.pages = pages;
284
285         kvm_flush_remote_tlbs_with_range(kvm, &range);
286 }
287
288 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
289                            unsigned int access)
290 {
291         u64 spte = make_mmio_spte(vcpu, gfn, access);
292
293         trace_mark_mmio_spte(sptep, gfn, spte);
294         mmu_spte_set(sptep, spte);
295 }
296
297 static gfn_t get_mmio_spte_gfn(u64 spte)
298 {
299         u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
300
301         gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
302                & shadow_nonpresent_or_rsvd_mask;
303
304         return gpa >> PAGE_SHIFT;
305 }
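/*
 * The gfn of an MMIO spte is stored split: the bits covered by
 * shadow_nonpresent_or_rsvd_lower_gfn_mask are read in place, while the
 * bits that fall within shadow_nonpresent_or_rsvd_mask are stored
 * SHADOW_NONPRESENT_OR_RSVD_MASK_LEN positions higher in the spte (those
 * low positions are repurposed, e.g. for the L1TF mitigation).  This
 * helper reverses the encoding done by make_mmio_spte().
 */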
306
307 static unsigned get_mmio_spte_access(u64 spte)
308 {
309         return spte & shadow_mmio_access_mask;
310 }
311
312 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
313 {
314         u64 kvm_gen, spte_gen, gen;
315
316         gen = kvm_vcpu_memslots(vcpu)->generation;
317         if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
318                 return false;
319
320         kvm_gen = gen & MMIO_SPTE_GEN_MASK;
321         spte_gen = get_mmio_spte_generation(spte);
322
323         trace_check_mmio_spte(spte, kvm_gen, spte_gen);
324         return likely(kvm_gen == spte_gen);
325 }
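/*
 * MMIO sptes embed the memslot generation that was current when they were
 * created; if the generation has since changed, or a memslots update is in
 * progress, the cached spte is treated as stale and must not be used.
 */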
326
327 static int is_cpuid_PSE36(void)
328 {
329         return 1;
330 }
331
332 static gfn_t pse36_gfn_delta(u32 gpte)
333 {
334         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
335
336         return (gpte & PT32_DIR_PSE36_MASK) << shift;
337 }
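/*
 * With PSE-36, the guest PDE stores physical-address bits above bit 31 in
 * the field selected by PT32_DIR_PSE36_MASK, starting at bit
 * PT32_DIR_PSE36_SHIFT; the shift above moves those bits into their proper
 * position in the returned gfn delta.
 */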
338
339 #ifdef CONFIG_X86_64
340 static void __set_spte(u64 *sptep, u64 spte)
341 {
342         WRITE_ONCE(*sptep, spte);
343 }
344
345 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
346 {
347         WRITE_ONCE(*sptep, spte);
348 }
349
350 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
351 {
352         return xchg(sptep, spte);
353 }
354
355 static u64 __get_spte_lockless(u64 *sptep)
356 {
357         return READ_ONCE(*sptep);
358 }
359 #else
360 union split_spte {
361         struct {
362                 u32 spte_low;
363                 u32 spte_high;
364         };
365         u64 spte;
366 };
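/*
 * A 32-bit host cannot read or write a 64-bit spte atomically, so the spte
 * is accessed as two 32-bit halves; the helpers below order the low half
 * (which holds the present bit) and the high half carefully so the CPU
 * never observes a half-written present spte.
 */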
367
368 static void count_spte_clear(u64 *sptep, u64 spte)
369 {
370         struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
371
372         if (is_shadow_present_pte(spte))
373                 return;
374
375         /* Ensure the spte is completely set before we increase the count */
376         smp_wmb();
377         sp->clear_spte_count++;
378 }
379
380 static void __set_spte(u64 *sptep, u64 spte)
381 {
382         union split_spte *ssptep, sspte;
383
384         ssptep = (union split_spte *)sptep;
385         sspte = (union split_spte)spte;
386
387         ssptep->spte_high = sspte.spte_high;
388
389         /*
390          * If we map the spte from nonpresent to present, we must store
391          * the high bits first and only then set the present bit, so the CPU
392          * cannot fetch the spte while we are still setting it.
393          */
394         smp_wmb();
395
396         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
397 }
398
399 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
400 {
401         union split_spte *ssptep, sspte;
402
403         ssptep = (union split_spte *)sptep;
404         sspte = (union split_spte)spte;
405
406         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
407
408         /*
409          * If we map the spte from present to nonpresent, we must clear the
410          * present bit first so that a vCPU cannot fetch the stale high bits.
411          */
412         smp_wmb();
413
414         ssptep->spte_high = sspte.spte_high;
415         count_spte_clear(sptep, spte);
416 }
417
418 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
419 {
420         union split_spte *ssptep, sspte, orig;
421
422         ssptep = (union split_spte *)sptep;
423         sspte = (union split_spte)spte;
424
425         /* xchg acts as a barrier before the setting of the high bits */
426         orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
427         orig.spte_high = ssptep->spte_high;
428         ssptep->spte_high = sspte.spte_high;
429         count_spte_clear(sptep, spte);
430
431         return orig.spte;
432 }
433
434 /*
435  * The idea behind this lightweight way of reading the spte on x86_32
436  * comes from gup_get_pte (mm/gup.c).
437  *
438  * An spte TLB flush may be pending, because kvm_set_pte_rmapp
439  * coalesces flushes and we are running outside of the MMU lock.  Therefore
440  * we need to protect against in-progress updates of the spte.
441  *
442  * Reading the spte while an update is in progress may get the old value
443  * for the high part of the spte.  The race is fine for a present->non-present
444  * change (because the high part of the spte is ignored for non-present spte),
445  * but for a present->present change we must reread the spte.
446  *
447  * All such changes are done in two steps (present->non-present and
448  * non-present->present), hence it is enough to count the number of
449  * present->non-present updates: if it changed while reading the spte,
450  * we might have hit the race.  This is done using clear_spte_count.
451  */
452 static u64 __get_spte_lockless(u64 *sptep)
453 {
454         struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
455         union split_spte spte, *orig = (union split_spte *)sptep;
456         int count;
457
458 retry:
459         count = sp->clear_spte_count;
460         smp_rmb();
461
462         spte.spte_low = orig->spte_low;
463         smp_rmb();
464
465         spte.spte_high = orig->spte_high;
466         smp_rmb();
467
468         if (unlikely(spte.spte_low != orig->spte_low ||
469               count != sp->clear_spte_count))
470                 goto retry;
471
472         return spte.spte;
473 }
474 #endif
475
476 /* Rules for using mmu_spte_set:
477  * Set the sptep from nonpresent to present.
478  * Note: the sptep being assigned *must* be either not present
479  * or in a state where the hardware will not attempt to update
480  * the spte.
481  */
482 static void mmu_spte_set(u64 *sptep, u64 new_spte)
483 {
484         WARN_ON(is_shadow_present_pte(*sptep));
485         __set_spte(sptep, new_spte);
486 }
487
488 /*
489  * Update the SPTE (excluding the PFN), but do not track changes in its
490  * accessed/dirty status.
491  */
492 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
493 {
494         u64 old_spte = *sptep;
495
496         WARN_ON(!is_shadow_present_pte(new_spte));
497         check_spte_writable_invariants(new_spte);
498
499         if (!is_shadow_present_pte(old_spte)) {
500                 mmu_spte_set(sptep, new_spte);
501                 return old_spte;
502         }
503
504         if (!spte_has_volatile_bits(old_spte))
505                 __update_clear_spte_fast(sptep, new_spte);
506         else
507                 old_spte = __update_clear_spte_slow(sptep, new_spte);
508
509         WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
510
511         return old_spte;
512 }
513
514 /* Rules for using mmu_spte_update:
515  * Update the state bits; the mapped pfn must not change.
516  *
517  * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
518  * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
519  * spte, even though the writable spte might be cached on a CPU's TLB.
520  *
521  * Returns true if the TLB needs to be flushed
522  */
523 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
524 {
525         bool flush = false;
526         u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
527
528         if (!is_shadow_present_pte(old_spte))
529                 return false;
530
531         /*
532          * Updating the spte outside of mmu_lock is safe, since we always
533          * update it atomically; see the comments in
534          * spte_has_volatile_bits().
535          */
536         if (is_mmu_writable_spte(old_spte) &&
537               !is_writable_pte(new_spte))
538                 flush = true;
539
540         /*
541          * Flush TLB when accessed/dirty states are changed in the page tables,
542          * to guarantee consistency between TLB and page tables.
543          */
544
545         if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
546                 flush = true;
547                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
548         }
549
550         if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
551                 flush = true;
552                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
553         }
554
555         return flush;
556 }
557
558 /*
559  * Rules for using mmu_spte_clear_track_bits:
560  * It sets the sptep from present to nonpresent and tracks the
561  * state bits; it is used to clear a last-level sptep.
562  * Returns the old PTE.
563  */
564 static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
565 {
566         kvm_pfn_t pfn;
567         u64 old_spte = *sptep;
568         int level = sptep_to_sp(sptep)->role.level;
569
570         if (!is_shadow_present_pte(old_spte) ||
571             !spte_has_volatile_bits(old_spte))
572                 __update_clear_spte_fast(sptep, 0ull);
573         else
574                 old_spte = __update_clear_spte_slow(sptep, 0ull);
575
576         if (!is_shadow_present_pte(old_spte))
577                 return old_spte;
578
579         kvm_update_page_stats(kvm, level, -1);
580
581         pfn = spte_to_pfn(old_spte);
582
583         /*
584          * KVM does not hold a refcount on the pages used by the
585          * KVM MMU; before a page is reclaimed, it must first be
586          * unmapped from the MMU.
587          */
588         WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
589
590         if (is_accessed_spte(old_spte))
591                 kvm_set_pfn_accessed(pfn);
592
593         if (is_dirty_spte(old_spte))
594                 kvm_set_pfn_dirty(pfn);
595
596         return old_spte;
597 }
598
599 /*
600  * Rules for using mmu_spte_clear_no_track:
601  * Directly clear the spte without tracking the state bits of the sptep;
602  * it is used when setting an upper-level spte.
603  */
604 static void mmu_spte_clear_no_track(u64 *sptep)
605 {
606         __update_clear_spte_fast(sptep, 0ull);
607 }
608
609 static u64 mmu_spte_get_lockless(u64 *sptep)
610 {
611         return __get_spte_lockless(sptep);
612 }
613
614 /* Returns the Accessed status of the PTE and resets it at the same time. */
615 static bool mmu_spte_age(u64 *sptep)
616 {
617         u64 spte = mmu_spte_get_lockless(sptep);
618
619         if (!is_accessed_spte(spte))
620                 return false;
621
622         if (spte_ad_enabled(spte)) {
623                 clear_bit((ffs(shadow_accessed_mask) - 1),
624                           (unsigned long *)sptep);
625         } else {
626                 /*
627                  * Capture the dirty status of the page, so that it doesn't get
628                  * lost when the SPTE is marked for access tracking.
629                  */
630                 if (is_writable_pte(spte))
631                         kvm_set_pfn_dirty(spte_to_pfn(spte));
632
633                 spte = mark_spte_for_access_track(spte);
634                 mmu_spte_update_no_track(sptep, spte);
635         }
636
637         return true;
638 }
639
640 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
641 {
642         if (is_tdp_mmu(vcpu->arch.mmu)) {
643                 kvm_tdp_mmu_walk_lockless_begin();
644         } else {
645                 /*
646                  * Prevent page table teardown by making any free-er wait during
647                  * kvm_flush_remote_tlbs() IPI to all active vcpus.
648                  */
649                 local_irq_disable();
650
651                 /*
652                  * Make sure a following spte read is not reordered ahead of the write
653                  * to vcpu->mode.
654                  */
655                 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
656         }
657 }
658
659 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
660 {
661         if (is_tdp_mmu(vcpu->arch.mmu)) {
662                 kvm_tdp_mmu_walk_lockless_end();
663         } else {
664                 /*
665                  * Make sure the write to vcpu->mode is not reordered in front of
666                  * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
667                  * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
668                  */
669                 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
670                 local_irq_enable();
671         }
672 }
673
674 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
675 {
676         int r;
677
678         /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
679         r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
680                                        1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
681         if (r)
682                 return r;
683         r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
684                                        PT64_ROOT_MAX_LEVEL);
685         if (r)
686                 return r;
687         if (maybe_indirect) {
688                 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
689                                                PT64_ROOT_MAX_LEVEL);
690                 if (r)
691                         return r;
692         }
693         return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
694                                           PT64_ROOT_MAX_LEVEL);
695 }
696
697 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
698 {
699         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
700         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
701         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
702         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
703 }
704
705 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
706 {
707         return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
708 }
709
710 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
711 {
712         kmem_cache_free(pte_list_desc_cache, pte_list_desc);
713 }
714
715 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
716 {
717         if (!sp->role.direct)
718                 return sp->gfns[index];
719
720         return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
721 }
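/*
 * For a direct shadow page the gfn is not stored per spte but computed, as
 * above: with PT64_LEVEL_BITS == 9 (512 entries per table), each spte of a
 * level-2 direct page covers 512 gfns, so index i maps to sp->gfn + i * 512.
 */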
722
723 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
724 {
725         if (!sp->role.direct) {
726                 sp->gfns[index] = gfn;
727                 return;
728         }
729
730         if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
731                 pr_err_ratelimited("gfn mismatch under direct page %llx "
732                                    "(expected %llx, got %llx)\n",
733                                    sp->gfn,
734                                    kvm_mmu_page_get_gfn(sp, index), gfn);
735 }
736
737 /*
738  * Return the pointer to the large page information for a given gfn,
739  * handling slots that are not large page aligned.
740  */
741 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
742                 const struct kvm_memory_slot *slot, int level)
743 {
744         unsigned long idx;
745
746         idx = gfn_to_index(gfn, slot->base_gfn, level);
747         return &slot->arch.lpage_info[level - 2][idx];
748 }
749
750 static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
751                                             gfn_t gfn, int count)
752 {
753         struct kvm_lpage_info *linfo;
754         int i;
755
756         for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
757                 linfo = lpage_info_slot(gfn, slot, i);
758                 linfo->disallow_lpage += count;
759                 WARN_ON(linfo->disallow_lpage < 0);
760         }
761 }
762
763 void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
764 {
765         update_gfn_disallow_lpage_count(slot, gfn, 1);
766 }
767
768 void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
769 {
770         update_gfn_disallow_lpage_count(slot, gfn, -1);
771 }
772
773 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
774 {
775         struct kvm_memslots *slots;
776         struct kvm_memory_slot *slot;
777         gfn_t gfn;
778
779         kvm->arch.indirect_shadow_pages++;
780         gfn = sp->gfn;
781         slots = kvm_memslots_for_spte_role(kvm, sp->role);
782         slot = __gfn_to_memslot(slots, gfn);
783
784         /* non-leaf shadow pages are kept readonly. */
785         if (sp->role.level > PG_LEVEL_4K)
786                 return kvm_slot_page_track_add_page(kvm, slot, gfn,
787                                                     KVM_PAGE_TRACK_WRITE);
788
789         kvm_mmu_gfn_disallow_lpage(slot, gfn);
790 }
791
792 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
793 {
794         if (sp->lpage_disallowed)
795                 return;
796
797         ++kvm->stat.nx_lpage_splits;
798         list_add_tail(&sp->lpage_disallowed_link,
799                       &kvm->arch.lpage_disallowed_mmu_pages);
800         sp->lpage_disallowed = true;
801 }
802
803 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
804 {
805         struct kvm_memslots *slots;
806         struct kvm_memory_slot *slot;
807         gfn_t gfn;
808
809         kvm->arch.indirect_shadow_pages--;
810         gfn = sp->gfn;
811         slots = kvm_memslots_for_spte_role(kvm, sp->role);
812         slot = __gfn_to_memslot(slots, gfn);
813         if (sp->role.level > PG_LEVEL_4K)
814                 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
815                                                        KVM_PAGE_TRACK_WRITE);
816
817         kvm_mmu_gfn_allow_lpage(slot, gfn);
818 }
819
820 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
821 {
822         --kvm->stat.nx_lpage_splits;
823         sp->lpage_disallowed = false;
824         list_del(&sp->lpage_disallowed_link);
825 }
826
827 static struct kvm_memory_slot *
828 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
829                             bool no_dirty_log)
830 {
831         struct kvm_memory_slot *slot;
832
833         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
834         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
835                 return NULL;
836         if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
837                 return NULL;
838
839         return slot;
840 }
841
842 /*
843  * About rmap_head encoding:
844  *
845  * If the bit zero of rmap_head->val is clear, then it points to the only spte
846  * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
847  * pte_list_desc containing more mappings.
848  */
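/*
 * Since sptes live in page tables they are at least 8-byte aligned, so bit
 * zero of a raw spte pointer is always clear; that is what lets it double
 * as the "single entry vs. pte_list_desc" discriminator described above.
 */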
849
850 /*
851  * Returns the number of pointers in the rmap chain, not counting the new one.
852  */
853 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
854                         struct kvm_rmap_head *rmap_head)
855 {
856         struct pte_list_desc *desc;
857         int count = 0;
858
859         if (!rmap_head->val) {
860                 rmap_printk("%p %llx 0->1\n", spte, *spte);
861                 rmap_head->val = (unsigned long)spte;
862         } else if (!(rmap_head->val & 1)) {
863                 rmap_printk("%p %llx 1->many\n", spte, *spte);
864                 desc = mmu_alloc_pte_list_desc(vcpu);
865                 desc->sptes[0] = (u64 *)rmap_head->val;
866                 desc->sptes[1] = spte;
867                 desc->spte_count = 2;
868                 rmap_head->val = (unsigned long)desc | 1;
869                 ++count;
870         } else {
871                 rmap_printk("%p %llx many->many\n", spte, *spte);
872                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
873                 while (desc->spte_count == PTE_LIST_EXT) {
874                         count += PTE_LIST_EXT;
875                         if (!desc->more) {
876                                 desc->more = mmu_alloc_pte_list_desc(vcpu);
877                                 desc = desc->more;
878                                 desc->spte_count = 0;
879                                 break;
880                         }
881                         desc = desc->more;
882                 }
883                 count += desc->spte_count;
884                 desc->sptes[desc->spte_count++] = spte;
885         }
886         return count;
887 }
888
889 static void
890 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
891                            struct pte_list_desc *desc, int i,
892                            struct pte_list_desc *prev_desc)
893 {
894         int j = desc->spte_count - 1;
895
896         desc->sptes[i] = desc->sptes[j];
897         desc->sptes[j] = NULL;
898         desc->spte_count--;
899         if (desc->spte_count)
900                 return;
901         if (!prev_desc && !desc->more)
902                 rmap_head->val = 0;
903         else
904                 if (prev_desc)
905                         prev_desc->more = desc->more;
906                 else
907                         rmap_head->val = (unsigned long)desc->more | 1;
908         mmu_free_pte_list_desc(desc);
909 }
910
911 static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
912 {
913         struct pte_list_desc *desc;
914         struct pte_list_desc *prev_desc;
915         int i;
916
917         if (!rmap_head->val) {
918                 pr_err("%s: %p 0->BUG\n", __func__, spte);
919                 BUG();
920         } else if (!(rmap_head->val & 1)) {
921                 rmap_printk("%p 1->0\n", spte);
922                 if ((u64 *)rmap_head->val != spte) {
923                         pr_err("%s:  %p 1->BUG\n", __func__, spte);
924                         BUG();
925                 }
926                 rmap_head->val = 0;
927         } else {
928                 rmap_printk("%p many->many\n", spte);
929                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
930                 prev_desc = NULL;
931                 while (desc) {
932                         for (i = 0; i < desc->spte_count; ++i) {
933                                 if (desc->sptes[i] == spte) {
934                                         pte_list_desc_remove_entry(rmap_head,
935                                                         desc, i, prev_desc);
936                                         return;
937                                 }
938                         }
939                         prev_desc = desc;
940                         desc = desc->more;
941                 }
942                 pr_err("%s: %p many->many\n", __func__, spte);
943                 BUG();
944         }
945 }
946
947 static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
948                             u64 *sptep)
949 {
950         mmu_spte_clear_track_bits(kvm, sptep);
951         __pte_list_remove(sptep, rmap_head);
952 }
953
954 /* Return true if rmap existed, false otherwise */
955 static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
956 {
957         struct pte_list_desc *desc, *next;
958         int i;
959
960         if (!rmap_head->val)
961                 return false;
962
963         if (!(rmap_head->val & 1)) {
964                 mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
965                 goto out;
966         }
967
968         desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
969
970         for (; desc; desc = next) {
971                 for (i = 0; i < desc->spte_count; i++)
972                         mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
973                 next = desc->more;
974                 mmu_free_pte_list_desc(desc);
975         }
976 out:
977         /* rmap_head is meaningless now, remember to reset it */
978         rmap_head->val = 0;
979         return true;
980 }
981
982 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
983 {
984         struct pte_list_desc *desc;
985         unsigned int count = 0;
986
987         if (!rmap_head->val)
988                 return 0;
989         else if (!(rmap_head->val & 1))
990                 return 1;
991
992         desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
993
994         while (desc) {
995                 count += desc->spte_count;
996                 desc = desc->more;
997         }
998
999         return count;
1000 }
1001
1002 static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
1003                                          const struct kvm_memory_slot *slot)
1004 {
1005         unsigned long idx;
1006
1007         idx = gfn_to_index(gfn, slot->base_gfn, level);
1008         return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
1009 }
1010
1011 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1012 {
1013         struct kvm_mmu_memory_cache *mc;
1014
1015         mc = &vcpu->arch.mmu_pte_list_desc_cache;
1016         return kvm_mmu_memory_cache_nr_free_objects(mc);
1017 }
1018
1019 static void rmap_remove(struct kvm *kvm, u64 *spte)
1020 {
1021         struct kvm_memslots *slots;
1022         struct kvm_memory_slot *slot;
1023         struct kvm_mmu_page *sp;
1024         gfn_t gfn;
1025         struct kvm_rmap_head *rmap_head;
1026
1027         sp = sptep_to_sp(spte);
1028         gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1029
1030         /*
1031          * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
1032          * so we have to determine which memslots to use based on context
1033          * information in sp->role.
1034          */
1035         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1036
1037         slot = __gfn_to_memslot(slots, gfn);
1038         rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1039
1040         __pte_list_remove(spte, rmap_head);
1041 }
1042
1043 /*
1044  * Used by the following functions to iterate through the sptes linked by a
1045  * rmap.  All fields are private and should not be used outside these helpers.
1046  */
1047 struct rmap_iterator {
1048         /* private fields */
1049         struct pte_list_desc *desc;     /* holds the sptep if not NULL */
1050         int pos;                        /* index of the sptep */
1051 };
1052
1053 /*
1054  * Iteration must be started by this function.  This should also be used after
1055  * removing/dropping sptes from the rmap link because in such cases the
1056  * information in the iterator may not be valid.
1057  *
1058  * Returns sptep if found, NULL otherwise.
1059  */
1060 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1061                            struct rmap_iterator *iter)
1062 {
1063         u64 *sptep;
1064
1065         if (!rmap_head->val)
1066                 return NULL;
1067
1068         if (!(rmap_head->val & 1)) {
1069                 iter->desc = NULL;
1070                 sptep = (u64 *)rmap_head->val;
1071                 goto out;
1072         }
1073
1074         iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1075         iter->pos = 0;
1076         sptep = iter->desc->sptes[iter->pos];
1077 out:
1078         BUG_ON(!is_shadow_present_pte(*sptep));
1079         return sptep;
1080 }
1081
1082 /*
1083  * Must be used with a valid iterator: e.g. after rmap_get_first().
1084  *
1085  * Returns sptep if found, NULL otherwise.
1086  */
1087 static u64 *rmap_get_next(struct rmap_iterator *iter)
1088 {
1089         u64 *sptep;
1090
1091         if (iter->desc) {
1092                 if (iter->pos < PTE_LIST_EXT - 1) {
1093                         ++iter->pos;
1094                         sptep = iter->desc->sptes[iter->pos];
1095                         if (sptep)
1096                                 goto out;
1097                 }
1098
1099                 iter->desc = iter->desc->more;
1100
1101                 if (iter->desc) {
1102                         iter->pos = 0;
1103                         /* desc->sptes[0] cannot be NULL */
1104                         sptep = iter->desc->sptes[iter->pos];
1105                         goto out;
1106                 }
1107         }
1108
1109         return NULL;
1110 out:
1111         BUG_ON(!is_shadow_present_pte(*sptep));
1112         return sptep;
1113 }
1114
1115 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)                 \
1116         for (_spte_ = rmap_get_first(_rmap_head_, _iter_);              \
1117              _spte_; _spte_ = rmap_get_next(_iter_))
1118
1119 static void drop_spte(struct kvm *kvm, u64 *sptep)
1120 {
1121         u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
1122
1123         if (is_shadow_present_pte(old_spte))
1124                 rmap_remove(kvm, sptep);
1125 }
1126
1127
1128 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1129 {
1130         if (is_large_pte(*sptep)) {
1131                 WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
1132                 drop_spte(kvm, sptep);
1133                 return true;
1134         }
1135
1136         return false;
1137 }
1138
1139 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1140 {
1141         if (__drop_large_spte(vcpu->kvm, sptep)) {
1142                 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
1143
1144                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1145                         KVM_PAGES_PER_HPAGE(sp->role.level));
1146         }
1147 }
1148
1149 /*
1150  * Write-protect the specified @sptep; @pt_protect indicates whether the
1151  * spte is being write-protected in order to protect a shadow page table.
1152  *
1153  * Note: write protection differs between dirty logging and spte
1154  * protection:
1155  * - for dirty logging, the spte can be made writable at any time if
1156  *   its dirty bitmap is properly set.
1157  * - for spte protection, the spte can be made writable only after the
1158  *   shadow page is unsynced.
1159  *
1160  * Return true if the TLB needs to be flushed.
1161  */
1162 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1163 {
1164         u64 spte = *sptep;
1165
1166         if (!is_writable_pte(spte) &&
1167             !(pt_protect && is_mmu_writable_spte(spte)))
1168                 return false;
1169
1170         rmap_printk("spte %p %llx\n", sptep, *sptep);
1171
1172         if (pt_protect)
1173                 spte &= ~shadow_mmu_writable_mask;
1174         spte = spte & ~PT_WRITABLE_MASK;
1175
1176         return mmu_spte_update(sptep, spte);
1177 }
1178
1179 static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
1180                                bool pt_protect)
1181 {
1182         u64 *sptep;
1183         struct rmap_iterator iter;
1184         bool flush = false;
1185
1186         for_each_rmap_spte(rmap_head, &iter, sptep)
1187                 flush |= spte_write_protect(sptep, pt_protect);
1188
1189         return flush;
1190 }
1191
1192 static bool spte_clear_dirty(u64 *sptep)
1193 {
1194         u64 spte = *sptep;
1195
1196         rmap_printk("spte %p %llx\n", sptep, *sptep);
1197
1198         MMU_WARN_ON(!spte_ad_enabled(spte));
1199         spte &= ~shadow_dirty_mask;
1200         return mmu_spte_update(sptep, spte);
1201 }
1202
1203 static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1204 {
1205         bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1206                                                (unsigned long *)sptep);
1207         if (was_writable && !spte_ad_enabled(*sptep))
1208                 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1209
1210         return was_writable;
1211 }
1212
1213 /*
1214  * Gets the GFN ready for another round of dirty logging by clearing the
1215  *      - D bit on ad-enabled SPTEs, and
1216  *      - W bit on ad-disabled SPTEs.
1217  * Returns true iff any D or W bits were cleared.
1218  */
1219 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1220                                const struct kvm_memory_slot *slot)
1221 {
1222         u64 *sptep;
1223         struct rmap_iterator iter;
1224         bool flush = false;
1225
1226         for_each_rmap_spte(rmap_head, &iter, sptep)
1227                 if (spte_ad_need_write_protect(*sptep))
1228                         flush |= spte_wrprot_for_clear_dirty(sptep);
1229                 else
1230                         flush |= spte_clear_dirty(sptep);
1231
1232         return flush;
1233 }
1234
1235 /**
1236  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1237  * @kvm: kvm instance
1238  * @slot: slot to protect
1239  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1240  * @mask: indicates which pages we should protect
1241  *
1242  * Used when we do not need to care about huge page mappings.
1243  */
1244 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1245                                      struct kvm_memory_slot *slot,
1246                                      gfn_t gfn_offset, unsigned long mask)
1247 {
1248         struct kvm_rmap_head *rmap_head;
1249
1250         if (is_tdp_mmu_enabled(kvm))
1251                 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1252                                 slot->base_gfn + gfn_offset, mask, true);
1253
1254         if (!kvm_memslots_have_rmaps(kvm))
1255                 return;
1256
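        /*
         * The mask covers at most BITS_PER_LONG consecutive 4K pages starting
         * at slot->base_gfn + gfn_offset: handle the lowest set bit on each
         * iteration and clear it until the mask is empty.
         */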
1257         while (mask) {
1258                 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1259                                         PG_LEVEL_4K, slot);
1260                 rmap_write_protect(rmap_head, false);
1261
1262                 /* clear the first set bit */
1263                 mask &= mask - 1;
1264         }
1265 }
1266
1267 /**
1268  * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1269  * protect the page if the D-bit isn't supported.
1270  * @kvm: kvm instance
1271  * @slot: slot whose D-bits to clear
1272  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1273  * @mask: indicates which pages' D-bits we should clear
1274  *
1275  * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1276  */
1277 static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1278                                          struct kvm_memory_slot *slot,
1279                                          gfn_t gfn_offset, unsigned long mask)
1280 {
1281         struct kvm_rmap_head *rmap_head;
1282
1283         if (is_tdp_mmu_enabled(kvm))
1284                 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1285                                 slot->base_gfn + gfn_offset, mask, false);
1286
1287         if (!kvm_memslots_have_rmaps(kvm))
1288                 return;
1289
1290         while (mask) {
1291                 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1292                                         PG_LEVEL_4K, slot);
1293                 __rmap_clear_dirty(kvm, rmap_head, slot);
1294
1295                 /* clear the first set bit */
1296                 mask &= mask - 1;
1297         }
1298 }
1299
1300 /**
1301  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1302  * PT level pages.
1303  *
1304  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1305  * enable dirty logging for them.
1306  *
1307  * We need to care about huge page mappings: e.g. during dirty logging we may
1308  * have such mappings.
1309  */
1310 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1311                                 struct kvm_memory_slot *slot,
1312                                 gfn_t gfn_offset, unsigned long mask)
1313 {
1314         /*
1315          * Huge pages are NOT write protected when we start dirty logging in
1316          * initially-all-set mode; must write protect them here so that they
1317          * are split to 4K on the first write.
1318          *
1319          * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
1320  * of the memslot has no such restriction, so the range can cross two large
1321          * pages.
1322          */
1323         if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
1324                 gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
1325                 gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
1326
1327                 if (READ_ONCE(eager_page_split))
1328                         kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
1329
1330                 kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
1331
1332                 /* Cross two large pages? */
1333                 if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
1334                     ALIGN(end << PAGE_SHIFT, PMD_SIZE))
1335                         kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
1336                                                        PG_LEVEL_2M);
1337         }
1338
1339         /* Now handle 4K PTEs.  */
1340         if (kvm_x86_ops.cpu_dirty_log_size)
1341                 kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
1342         else
1343                 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1344 }
1345
1346 int kvm_cpu_dirty_log_size(void)
1347 {
1348         return kvm_x86_ops.cpu_dirty_log_size;
1349 }
1350
1351 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1352                                     struct kvm_memory_slot *slot, u64 gfn,
1353                                     int min_level)
1354 {
1355         struct kvm_rmap_head *rmap_head;
1356         int i;
1357         bool write_protected = false;
1358
1359         if (kvm_memslots_have_rmaps(kvm)) {
1360                 for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
1361                         rmap_head = gfn_to_rmap(gfn, i, slot);
1362                         write_protected |= rmap_write_protect(rmap_head, true);
1363                 }
1364         }
1365
1366         if (is_tdp_mmu_enabled(kvm))
1367                 write_protected |=
1368                         kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
1369
1370         return write_protected;
1371 }
1372
1373 static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
1374 {
1375         struct kvm_memory_slot *slot;
1376
1377         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1378         return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
1379 }
1380
1381 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1382                           const struct kvm_memory_slot *slot)
1383 {
1384         return pte_list_destroy(kvm, rmap_head);
1385 }
1386
1387 static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1388                             struct kvm_memory_slot *slot, gfn_t gfn, int level,
1389                             pte_t unused)
1390 {
1391         return kvm_zap_rmapp(kvm, rmap_head, slot);
1392 }
1393
1394 static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1395                               struct kvm_memory_slot *slot, gfn_t gfn, int level,
1396                               pte_t pte)
1397 {
1398         u64 *sptep;
1399         struct rmap_iterator iter;
1400         bool need_flush = false;
1401         u64 new_spte;
1402         kvm_pfn_t new_pfn;
1403
1404         WARN_ON(pte_huge(pte));
1405         new_pfn = pte_pfn(pte);
1406
1407 restart:
1408         for_each_rmap_spte(rmap_head, &iter, sptep) {
1409                 rmap_printk("spte %p %llx gfn %llx (%d)\n",
1410                             sptep, *sptep, gfn, level);
1411
1412                 need_flush = true;
1413
1414                 if (pte_write(pte)) {
1415                         pte_list_remove(kvm, rmap_head, sptep);
1416                         goto restart;
1417                 } else {
1418                         new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1419                                         *sptep, new_pfn);
1420
1421                         mmu_spte_clear_track_bits(kvm, sptep);
1422                         mmu_spte_set(sptep, new_spte);
1423                 }
1424         }
1425
1426         if (need_flush && kvm_available_flush_tlb_with_range()) {
1427                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1428                 return false;
1429         }
1430
1431         return need_flush;
1432 }
1433
1434 struct slot_rmap_walk_iterator {
1435         /* input fields. */
1436         const struct kvm_memory_slot *slot;
1437         gfn_t start_gfn;
1438         gfn_t end_gfn;
1439         int start_level;
1440         int end_level;
1441
1442         /* output fields. */
1443         gfn_t gfn;
1444         struct kvm_rmap_head *rmap;
1445         int level;
1446
1447         /* private field. */
1448         struct kvm_rmap_head *end_rmap;
1449 };
1450
1451 static void
1452 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1453 {
1454         iterator->level = level;
1455         iterator->gfn = iterator->start_gfn;
1456         iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
1457         iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
1458 }
1459
1460 static void
1461 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1462                     const struct kvm_memory_slot *slot, int start_level,
1463                     int end_level, gfn_t start_gfn, gfn_t end_gfn)
1464 {
1465         iterator->slot = slot;
1466         iterator->start_level = start_level;
1467         iterator->end_level = end_level;
1468         iterator->start_gfn = start_gfn;
1469         iterator->end_gfn = end_gfn;
1470
1471         rmap_walk_init_level(iterator, iterator->start_level);
1472 }
1473
1474 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1475 {
1476         return !!iterator->rmap;
1477 }
1478
1479 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1480 {
1481         if (++iterator->rmap <= iterator->end_rmap) {
1482                 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1483                 return;
1484         }
1485
1486         if (++iterator->level > iterator->end_level) {
1487                 iterator->rmap = NULL;
1488                 return;
1489         }
1490
1491         rmap_walk_init_level(iterator, iterator->level);
1492 }
1493
1494 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,    \
1495            _start_gfn, _end_gfn, _iter_)                                \
1496         for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,         \
1497                                  _end_level_, _start_gfn, _end_gfn);    \
1498              slot_rmap_walk_okay(_iter_);                               \
1499              slot_rmap_walk_next(_iter_))
1500
1501 typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1502                                struct kvm_memory_slot *slot, gfn_t gfn,
1503                                int level, pte_t pte);
1504
1505 static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
1506                                                  struct kvm_gfn_range *range,
1507                                                  rmap_handler_t handler)
1508 {
1509         struct slot_rmap_walk_iterator iterator;
1510         bool ret = false;
1511
1512         for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
1513                                  range->start, range->end - 1, &iterator)
1514                 ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
1515                                iterator.level, range->pte);
1516
1517         return ret;
1518 }
1519
1520 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1521 {
1522         bool flush = false;
1523
1524         if (kvm_memslots_have_rmaps(kvm))
1525                 flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
1526
1527         if (is_tdp_mmu_enabled(kvm))
1528                 flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
1529
1530         return flush;
1531 }
1532
1533 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1534 {
1535         bool flush = false;
1536
1537         if (kvm_memslots_have_rmaps(kvm))
1538                 flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
1539
1540         if (is_tdp_mmu_enabled(kvm))
1541                 flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
1542
1543         return flush;
1544 }
1545
1546 static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1547                           struct kvm_memory_slot *slot, gfn_t gfn, int level,
1548                           pte_t unused)
1549 {
1550         u64 *sptep;
1551         struct rmap_iterator iter;
1552         int young = 0;
1553
1554         for_each_rmap_spte(rmap_head, &iter, sptep)
1555                 young |= mmu_spte_age(sptep);
1556
1557         return young;
1558 }
1559
1560 static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1561                                struct kvm_memory_slot *slot, gfn_t gfn,
1562                                int level, pte_t unused)
1563 {
1564         u64 *sptep;
1565         struct rmap_iterator iter;
1566
1567         for_each_rmap_spte(rmap_head, &iter, sptep)
1568                 if (is_accessed_spte(*sptep))
1569                         return true;
1570         return false;
1571 }
1572
1573 #define RMAP_RECYCLE_THRESHOLD 1000
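/*
 * If a single gfn accumulates more than this many rmap entries, rmap_add()
 * below zaps the existing sptes for that gfn (and flushes remote TLBs)
 * rather than letting the pte_list chain grow without bound.
 */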
1574
1575 static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
1576                      u64 *spte, gfn_t gfn)
1577 {
1578         struct kvm_mmu_page *sp;
1579         struct kvm_rmap_head *rmap_head;
1580         int rmap_count;
1581
1582         sp = sptep_to_sp(spte);
1583         kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1584         rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1585         rmap_count = pte_list_add(vcpu, spte, rmap_head);
1586
1587         if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
1588                 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
1589                 kvm_flush_remote_tlbs_with_address(
1590                                 vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
1591         }
1592 }
1593
1594 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1595 {
1596         bool young = false;
1597
1598         if (kvm_memslots_have_rmaps(kvm))
1599                 young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
1600
1601         if (is_tdp_mmu_enabled(kvm))
1602                 young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
1603
1604         return young;
1605 }
1606
1607 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1608 {
1609         bool young = false;
1610
1611         if (kvm_memslots_have_rmaps(kvm))
1612                 young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
1613
1614         if (is_tdp_mmu_enabled(kvm))
1615                 young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
1616
1617         return young;
1618 }
1619
1620 #ifdef MMU_DEBUG
1621 static int is_empty_shadow_page(u64 *spt)
1622 {
1623         u64 *pos;
1624         u64 *end;
1625
1626         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
1627                 if (is_shadow_present_pte(*pos)) {
1628                         printk(KERN_ERR "%s: %p %llx\n", __func__,
1629                                pos, *pos);
1630                         return 0;
1631                 }
1632         return 1;
1633 }
1634 #endif
1635
1636 /*
1637  * This value is the sum of all of the kvm instances'
1638  * kvm->arch.n_used_mmu_pages values.  We need a global,
1639  * aggregate version in order to make the slab shrinker
1640  * faster.
1641  */
1642 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
1643 {
1644         kvm->arch.n_used_mmu_pages += nr;
1645         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1646 }
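
/*
 * Illustrative sketch (not part of the original source): the global percpu
 * counter updated above is what allows a slab shrinker's ->count_objects()
 * callback to produce a cheap, lock-free estimate, e.g.:
 *
 *	static unsigned long example_shrink_count(struct shrinker *shrink,
 *						  struct shrink_control *sc)
 *	{
 *		return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
 *	}
 *
 * example_shrink_count() is a hypothetical name used only for this sketch;
 * reading the percpu counter avoids iterating over every VM just to produce
 * an estimate.
 */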
1647
1648 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1649 {
1650         MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
1651         hlist_del(&sp->hash_link);
1652         list_del(&sp->link);
1653         free_page((unsigned long)sp->spt);
1654         if (!sp->role.direct)
1655                 free_page((unsigned long)sp->gfns);
1656         kmem_cache_free(mmu_page_header_cache, sp);
1657 }
1658
1659 static unsigned kvm_page_table_hashfn(gfn_t gfn)
1660 {
1661         return hash_64(gfn, KVM_MMU_HASH_SHIFT);
1662 }
1663
1664 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1665                                     struct kvm_mmu_page *sp, u64 *parent_pte)
1666 {
1667         if (!parent_pte)
1668                 return;
1669
1670         pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
1671 }
1672
1673 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1674                                        u64 *parent_pte)
1675 {
1676         __pte_list_remove(parent_pte, &sp->parent_ptes);
1677 }
1678
1679 static void drop_parent_pte(struct kvm_mmu_page *sp,
1680                             u64 *parent_pte)
1681 {
1682         mmu_page_remove_parent_pte(sp, parent_pte);
1683         mmu_spte_clear_no_track(parent_pte);
1684 }
1685
1686 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
1687 {
1688         struct kvm_mmu_page *sp;
1689
1690         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
1691         sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
1692         if (!direct)
1693                 sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
1694         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1695
1696         /*
1697          * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
1698          * depends on valid pages being added to the head of the list.  See
1699          * comments in kvm_zap_obsolete_pages().
1700          */
1701         sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
1702         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1703         kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1704         return sp;
1705 }
1706
1707 static void mark_unsync(u64 *spte);
1708 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1709 {
1710         u64 *sptep;
1711         struct rmap_iterator iter;
1712
1713         for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
1714                 mark_unsync(sptep);
1715         }
1716 }
1717
1718 static void mark_unsync(u64 *spte)
1719 {
1720         struct kvm_mmu_page *sp;
1721         unsigned int index;
1722
1723         sp = sptep_to_sp(spte);
1724         index = spte - sp->spt;
1725         if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1726                 return;
1727         if (sp->unsync_children++)
1728                 return;
1729         kvm_mmu_mark_parents_unsync(sp);
1730 }
1731
1732 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1733                                struct kvm_mmu_page *sp)
1734 {
1735         return -1;
1736 }
1737
1738 #define KVM_PAGE_ARRAY_NR 16
1739
1740 struct kvm_mmu_pages {
1741         struct mmu_page_and_offset {
1742                 struct kvm_mmu_page *sp;
1743                 unsigned int idx;
1744         } page[KVM_PAGE_ARRAY_NR];
1745         unsigned int nr;
1746 };
1747
1748 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1749                          int idx)
1750 {
1751         int i;
1752
1753         if (sp->unsync)
1754                 for (i = 0; i < pvec->nr; i++)
1755                         if (pvec->page[i].sp == sp)
1756                                 return 0;
1757
1758         pvec->page[pvec->nr].sp = sp;
1759         pvec->page[pvec->nr].idx = idx;
1760         pvec->nr++;
1761         return (pvec->nr == KVM_PAGE_ARRAY_NR);
1762 }
1763
1764 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
1765 {
1766         --sp->unsync_children;
1767         WARN_ON((int)sp->unsync_children < 0);
1768         __clear_bit(idx, sp->unsync_child_bitmap);
1769 }
1770
1771 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1772                            struct kvm_mmu_pages *pvec)
1773 {
1774         int i, ret, nr_unsync_leaf = 0;
1775
1776         for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
1777                 struct kvm_mmu_page *child;
1778                 u64 ent = sp->spt[i];
1779
1780                 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
1781                         clear_unsync_child_bit(sp, i);
1782                         continue;
1783                 }
1784
1785                 child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
1786
1787                 if (child->unsync_children) {
1788                         if (mmu_pages_add(pvec, child, i))
1789                                 return -ENOSPC;
1790
1791                         ret = __mmu_unsync_walk(child, pvec);
1792                         if (!ret) {
1793                                 clear_unsync_child_bit(sp, i);
1794                                 continue;
1795                         } else if (ret > 0) {
1796                                 nr_unsync_leaf += ret;
1797                         } else
1798                                 return ret;
1799                 } else if (child->unsync) {
1800                         nr_unsync_leaf++;
1801                         if (mmu_pages_add(pvec, child, i))
1802                                 return -ENOSPC;
1803                 } else
1804                         clear_unsync_child_bit(sp, i);
1805         }
1806
1807         return nr_unsync_leaf;
1808 }
1809
1810 #define INVALID_INDEX (-1)
1811
1812 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1813                            struct kvm_mmu_pages *pvec)
1814 {
1815         pvec->nr = 0;
1816         if (!sp->unsync_children)
1817                 return 0;
1818
1819         mmu_pages_add(pvec, sp, INVALID_INDEX);
1820         return __mmu_unsync_walk(sp, pvec);
1821 }
1822
1823 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1824 {
1825         WARN_ON(!sp->unsync);
1826         trace_kvm_mmu_sync_page(sp);
1827         sp->unsync = 0;
1828         --kvm->stat.mmu_unsync;
1829 }
1830
1831 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1832                                      struct list_head *invalid_list);
1833 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1834                                     struct list_head *invalid_list);
1835
1836 #define for_each_valid_sp(_kvm, _sp, _list)                             \
1837         hlist_for_each_entry(_sp, _list, hash_link)                     \
1838                 if (is_obsolete_sp((_kvm), (_sp))) {                    \
1839                 } else
1840
1841 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                 \
1842         for_each_valid_sp(_kvm, _sp,                                    \
1843           &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])     \
1844                 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
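
/*
 * Illustrative sketch (not part of the original source): the second macro
 * above is typically used to visit every valid, indirect shadow page that
 * maps a given gfn, e.g.:
 *
 *	struct kvm_mmu_page *sp;
 *
 *	for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
 *		... sp->gfn == gfn, sp is not obsolete and not direct ...
 *	}
 *
 * kvm_mmu_unprotect_page() and mmu_try_to_unsync_pages() below follow
 * exactly this pattern.
 */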
1845
1846 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1847                          struct list_head *invalid_list)
1848 {
1849         int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
1850
1851         if (ret < 0) {
1852                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1853                 return false;
1854         }
1855
1856         return !!ret;
1857 }
1858
1859 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
1860                                         struct list_head *invalid_list,
1861                                         bool remote_flush)
1862 {
1863         if (!remote_flush && list_empty(invalid_list))
1864                 return false;
1865
1866         if (!list_empty(invalid_list))
1867                 kvm_mmu_commit_zap_page(kvm, invalid_list);
1868         else
1869                 kvm_flush_remote_tlbs(kvm);
1870         return true;
1871 }
1872
1873 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
1874 {
1875         if (sp->role.invalid)
1876                 return true;
1877
1878         /* TDP MMU pages do not use the MMU generation. */
1879         return !sp->tdp_mmu_page &&
1880                unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
1881 }
1882
1883 struct mmu_page_path {
1884         struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
1885         unsigned int idx[PT64_ROOT_MAX_LEVEL];
1886 };
1887
1888 #define for_each_sp(pvec, sp, parents, i)                       \
1889                 for (i = mmu_pages_first(&pvec, &parents);      \
1890                         i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
1891                         i = mmu_pages_next(&pvec, &parents, i))
1892
1893 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1894                           struct mmu_page_path *parents,
1895                           int i)
1896 {
1897         int n;
1898
1899         for (n = i+1; n < pvec->nr; n++) {
1900                 struct kvm_mmu_page *sp = pvec->page[n].sp;
1901                 unsigned idx = pvec->page[n].idx;
1902                 int level = sp->role.level;
1903
1904                 parents->idx[level-1] = idx;
1905                 if (level == PG_LEVEL_4K)
1906                         break;
1907
1908                 parents->parent[level-2] = sp;
1909         }
1910
1911         return n;
1912 }
1913
1914 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
1915                            struct mmu_page_path *parents)
1916 {
1917         struct kvm_mmu_page *sp;
1918         int level;
1919
1920         if (pvec->nr == 0)
1921                 return 0;
1922
1923         WARN_ON(pvec->page[0].idx != INVALID_INDEX);
1924
1925         sp = pvec->page[0].sp;
1926         level = sp->role.level;
1927         WARN_ON(level == PG_LEVEL_4K);
1928
1929         parents->parent[level-2] = sp;
1930
1931         /* Also set up a sentinel.  Further entries in pvec are all
1932          * children of sp, so this element is never overwritten.
1933          */
1934         parents->parent[level-1] = NULL;
1935         return mmu_pages_next(pvec, parents, 0);
1936 }
1937
1938 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1939 {
1940         struct kvm_mmu_page *sp;
1941         unsigned int level = 0;
1942
1943         do {
1944                 unsigned int idx = parents->idx[level];
1945                 sp = parents->parent[level];
1946                 if (!sp)
1947                         return;
1948
1949                 WARN_ON(idx == INVALID_INDEX);
1950                 clear_unsync_child_bit(sp, idx);
1951                 level++;
1952         } while (!sp->unsync_children);
1953 }
1954
1955 static int mmu_sync_children(struct kvm_vcpu *vcpu,
1956                              struct kvm_mmu_page *parent, bool can_yield)
1957 {
1958         int i;
1959         struct kvm_mmu_page *sp;
1960         struct mmu_page_path parents;
1961         struct kvm_mmu_pages pages;
1962         LIST_HEAD(invalid_list);
1963         bool flush = false;
1964
1965         while (mmu_unsync_walk(parent, &pages)) {
1966                 bool protected = false;
1967
1968                 for_each_sp(pages, sp, parents, i)
1969                         protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
1970
1971                 if (protected) {
1972                         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
1973                         flush = false;
1974                 }
1975
1976                 for_each_sp(pages, sp, parents, i) {
1977                         kvm_unlink_unsync_page(vcpu->kvm, sp);
1978                         flush |= kvm_sync_page(vcpu, sp, &invalid_list);
1979                         mmu_pages_clear_parents(&parents);
1980                 }
1981                 if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
1982                         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
1983                         if (!can_yield) {
1984                                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1985                                 return -EINTR;
1986                         }
1987
1988                         cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
1989                         flush = false;
1990                 }
1991         }
1992
1993         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
1994         return 0;
1995 }
1996
1997 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
1998 {
1999         atomic_set(&sp->write_flooding_count, 0);
2000 }
2001
2002 static void clear_sp_write_flooding_count(u64 *spte)
2003 {
2004         __clear_sp_write_flooding_count(sptep_to_sp(spte));
2005 }
2006
2007 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2008                                              gfn_t gfn,
2009                                              gva_t gaddr,
2010                                              unsigned level,
2011                                              int direct,
2012                                              unsigned int access)
2013 {
2014         bool direct_mmu = vcpu->arch.mmu->direct_map;
2015         union kvm_mmu_page_role role;
2016         struct hlist_head *sp_list;
2017         unsigned quadrant;
2018         struct kvm_mmu_page *sp;
2019         int collisions = 0;
2020         LIST_HEAD(invalid_list);
2021
2022         role = vcpu->arch.mmu->mmu_role.base;
2023         role.level = level;
2024         role.direct = direct;
2025         role.access = access;
2026         if (role.has_4_byte_gpte) {
2027                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2028                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2029                 role.quadrant = quadrant;
2030         }
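        /*
         * Example of the quadrant math above: with 4-byte GPTEs a guest
         * page table holds 1024 entries while a shadow page holds 512, so
         * a single guest table is shadowed by multiple pages distinguished
         * by role.quadrant.  For level == 1 the quadrant is bit 21 of
         * gaddr (one of 2 quadrants); for level == 2 it is bits 31:30
         * (one of 4 quadrants).
         */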
2031
2032         sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
2033         for_each_valid_sp(vcpu->kvm, sp, sp_list) {
2034                 if (sp->gfn != gfn) {
2035                         collisions++;
2036                         continue;
2037                 }
2038
2039                 if (sp->role.word != role.word) {
2040                         /*
2041                          * If the guest is creating an upper-level page, zap
2042                          * unsync pages for the same gfn.  While it's possible
2043                          * the guest is using recursive page tables, in all
2044                          * likelihood the guest has stopped using the unsync
2045                          * page and is installing a completely unrelated page.
2046                          * Unsync pages must not be left as is, because the new
2047                          * upper-level page will be write-protected.
2048                          */
2049                         if (level > PG_LEVEL_4K && sp->unsync)
2050                                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2051                                                          &invalid_list);
2052                         continue;
2053                 }
2054
2055                 if (direct_mmu)
2056                         goto trace_get_page;
2057
2058                 if (sp->unsync) {
2059                         /*
2060                          * The page is good, but is stale.  kvm_sync_page does
2061                          * get the latest guest state, but (unlike mmu_unsync_children)
2062                          * it doesn't write-protect the page or mark it synchronized!
2063                          * This way the validity of the mapping is ensured, but the
2064                          * overhead of write protection is not incurred until the
2065                          * guest invalidates the TLB mapping.  This allows multiple
2066                          * SPs for a single gfn to be unsync.
2067                          *
2068                          * If the sync fails, the page is zapped.  If so, break
2069                          * in order to rebuild it.
2070                          */
2071                         if (!kvm_sync_page(vcpu, sp, &invalid_list))
2072                                 break;
2073
2074                         WARN_ON(!list_empty(&invalid_list));
2075                         kvm_flush_remote_tlbs(vcpu->kvm);
2076                 }
2077
2078                 __clear_sp_write_flooding_count(sp);
2079
2080 trace_get_page:
2081                 trace_kvm_mmu_get_page(sp, false);
2082                 goto out;
2083         }
2084
2085         ++vcpu->kvm->stat.mmu_cache_miss;
2086
2087         sp = kvm_mmu_alloc_page(vcpu, direct);
2088
2089         sp->gfn = gfn;
2090         sp->role = role;
2091         hlist_add_head(&sp->hash_link, sp_list);
2092         if (!direct) {
2093                 account_shadowed(vcpu->kvm, sp);
2094                 if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
2095                         kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2096         }
2097         trace_kvm_mmu_get_page(sp, true);
2098 out:
2099         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2100
2101         if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2102                 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2103         return sp;
2104 }
2105
2106 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2107                                         struct kvm_vcpu *vcpu, hpa_t root,
2108                                         u64 addr)
2109 {
2110         iterator->addr = addr;
2111         iterator->shadow_addr = root;
2112         iterator->level = vcpu->arch.mmu->shadow_root_level;
2113
2114         if (iterator->level >= PT64_ROOT_4LEVEL &&
2115             vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2116             !vcpu->arch.mmu->direct_map)
2117                 iterator->level = PT32E_ROOT_LEVEL;
2118
2119         if (iterator->level == PT32E_ROOT_LEVEL) {
2120                 /*
2121                  * prev_root is currently only used for 64-bit hosts. So only
2122                  * the active root_hpa is valid here.
2123                  */
2124                 BUG_ON(root != vcpu->arch.mmu->root.hpa);
2125
2126                 iterator->shadow_addr
2127                         = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2128                 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2129                 --iterator->level;
2130                 if (!iterator->shadow_addr)
2131                         iterator->level = 0;
2132         }
2133 }
2134
2135 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2136                              struct kvm_vcpu *vcpu, u64 addr)
2137 {
2138         shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
2139                                     addr);
2140 }
2141
2142 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2143 {
2144         if (iterator->level < PG_LEVEL_4K)
2145                 return false;
2146
2147         iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2148         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2149         return true;
2150 }
2151
2152 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2153                                u64 spte)
2154 {
2155         if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
2156                 iterator->level = 0;
2157                 return;
2158         }
2159
2160         iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2161         --iterator->level;
2162 }
2163
2164 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2165 {
2166         __shadow_walk_next(iterator, *iterator->sptep);
2167 }
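
/*
 * Illustrative sketch (not part of the original source): the iterator above
 * is normally driven via the file's for_each_shadow_entry() macro, which
 * expands to roughly:
 *
 *	struct kvm_shadow_walk_iterator it;
 *
 *	for (shadow_walk_init(&it, vcpu, addr);
 *	     shadow_walk_okay(&it);
 *	     shadow_walk_next(&it)) {
 *		... *it.sptep is the SPTE mapping addr at it.level ...
 *	}
 *
 * __direct_map() below walks the shadow page tables this way, descending
 * one level per iteration until it reaches the fault's goal level.
 */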
2168
2169 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2170                              struct kvm_mmu_page *sp)
2171 {
2172         u64 spte;
2173
2174         BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2175
2176         spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
2177
2178         mmu_spte_set(sptep, spte);
2179
2180         mmu_page_add_parent_pte(vcpu, sp, sptep);
2181
2182         if (sp->unsync_children || sp->unsync)
2183                 mark_unsync(sptep);
2184 }
2185
2186 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2187                                    unsigned direct_access)
2188 {
2189         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2190                 struct kvm_mmu_page *child;
2191
2192                 /*
2193                  * For the direct sp, if the guest pte's dirty bit
2194                  * changed from clean to dirty, it will corrupt the
2195                  * sp's access by allowing writes through a read-only sp,
2196                  * so we should update the spte at this point to get
2197                  * a new sp with the correct access.
2198                  */
2199                 child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
2200                 if (child->role.access == direct_access)
2201                         return;
2202
2203                 drop_parent_pte(child, sptep);
2204                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2205         }
2206 }
2207
2208 /* Returns the number of zapped non-leaf child shadow pages. */
2209 static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2210                             u64 *spte, struct list_head *invalid_list)
2211 {
2212         u64 pte;
2213         struct kvm_mmu_page *child;
2214
2215         pte = *spte;
2216         if (is_shadow_present_pte(pte)) {
2217                 if (is_last_spte(pte, sp->role.level)) {
2218                         drop_spte(kvm, spte);
2219                 } else {
2220                         child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
2221                         drop_parent_pte(child, spte);
2222
2223                         /*
2224                          * Recursively zap nested TDP SPs; parentless SPs are
2225                          * unlikely to be used again in the near future.  This
2226                          * avoids retaining a large number of stale nested SPs.
2227                          */
2228                         if (tdp_enabled && invalid_list &&
2229                             child->role.guest_mode && !child->parent_ptes.val)
2230                                 return kvm_mmu_prepare_zap_page(kvm, child,
2231                                                                 invalid_list);
2232                 }
2233         } else if (is_mmio_spte(pte)) {
2234                 mmu_spte_clear_no_track(spte);
2235         }
2236         return 0;
2237 }
2238
2239 static int kvm_mmu_page_unlink_children(struct kvm *kvm,
2240                                         struct kvm_mmu_page *sp,
2241                                         struct list_head *invalid_list)
2242 {
2243         int zapped = 0;
2244         unsigned i;
2245
2246         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2247                 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
2248
2249         return zapped;
2250 }
2251
2252 static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
2253 {
2254         u64 *sptep;
2255         struct rmap_iterator iter;
2256
2257         while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2258                 drop_parent_pte(sp, sptep);
2259 }
2260
2261 static int mmu_zap_unsync_children(struct kvm *kvm,
2262                                    struct kvm_mmu_page *parent,
2263                                    struct list_head *invalid_list)
2264 {
2265         int i, zapped = 0;
2266         struct mmu_page_path parents;
2267         struct kvm_mmu_pages pages;
2268
2269         if (parent->role.level == PG_LEVEL_4K)
2270                 return 0;
2271
2272         while (mmu_unsync_walk(parent, &pages)) {
2273                 struct kvm_mmu_page *sp;
2274
2275                 for_each_sp(pages, sp, parents, i) {
2276                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2277                         mmu_pages_clear_parents(&parents);
2278                         zapped++;
2279                 }
2280         }
2281
2282         return zapped;
2283 }
2284
2285 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2286                                        struct kvm_mmu_page *sp,
2287                                        struct list_head *invalid_list,
2288                                        int *nr_zapped)
2289 {
2290         bool list_unstable, zapped_root = false;
2291
2292         trace_kvm_mmu_prepare_zap_page(sp);
2293         ++kvm->stat.mmu_shadow_zapped;
2294         *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2295         *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
2296         kvm_mmu_unlink_parents(sp);
2297
2298         /* Zapping children means active_mmu_pages has become unstable. */
2299         list_unstable = *nr_zapped;
2300
2301         if (!sp->role.invalid && !sp->role.direct)
2302                 unaccount_shadowed(kvm, sp);
2303
2304         if (sp->unsync)
2305                 kvm_unlink_unsync_page(kvm, sp);
2306         if (!sp->root_count) {
2307                 /* Count self */
2308                 (*nr_zapped)++;
2309
2310                 /*
2311                  * Already invalid pages (previously active roots) are not on
2312                  * the active page list.  See list_del() in the "else" case of
2313                  * !sp->root_count.
2314                  */
2315                 if (sp->role.invalid)
2316                         list_add(&sp->link, invalid_list);
2317                 else
2318                         list_move(&sp->link, invalid_list);
2319                 kvm_mod_used_mmu_pages(kvm, -1);
2320         } else {
2321                 /*
2322                  * Remove the active root from the active page list, the root
2323                  * will be explicitly freed when the root_count hits zero.
2324                  */
2325                 list_del(&sp->link);
2326
2327                 /*
2328                  * Obsolete pages cannot be used on any vCPUs, see the comment
2329                  * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
2330                  * treats invalid shadow pages as being obsolete.
2331                  */
2332                 zapped_root = !is_obsolete_sp(kvm, sp);
2333         }
2334
2335         if (sp->lpage_disallowed)
2336                 unaccount_huge_nx_page(kvm, sp);
2337
2338         sp->role.invalid = 1;
2339
2340         /*
2341          * Make the request to free obsolete roots after marking the root
2342          * invalid, otherwise other vCPUs may not see it as invalid.
2343          */
2344         if (zapped_root)
2345                 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
2346         return list_unstable;
2347 }
2348
2349 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2350                                      struct list_head *invalid_list)
2351 {
2352         int nr_zapped;
2353
2354         __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2355         return nr_zapped;
2356 }
2357
2358 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2359                                     struct list_head *invalid_list)
2360 {
2361         struct kvm_mmu_page *sp, *nsp;
2362
2363         if (list_empty(invalid_list))
2364                 return;
2365
2366         /*
2367          * We need to make sure everyone sees our modifications to
2368          * the page tables and sees changes to vcpu->mode here. The barrier
2369          * in kvm_flush_remote_tlbs() achieves this. This pairs
2370          * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2371          *
2372          * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2373          * guest mode and/or lockless shadow page table walks.
2374          */
2375         kvm_flush_remote_tlbs(kvm);
2376
2377         list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2378                 WARN_ON(!sp->role.invalid || sp->root_count);
2379                 kvm_mmu_free_page(sp);
2380         }
2381 }
2382
2383 static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
2384                                                   unsigned long nr_to_zap)
2385 {
2386         unsigned long total_zapped = 0;
2387         struct kvm_mmu_page *sp, *tmp;
2388         LIST_HEAD(invalid_list);
2389         bool unstable;
2390         int nr_zapped;
2391
2392         if (list_empty(&kvm->arch.active_mmu_pages))
2393                 return 0;
2394
2395 restart:
2396         list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
2397                 /*
2398                  * Don't zap active root pages, the page itself can't be freed
2399                  * and zapping it will just force vCPUs to realloc and reload.
2400                  */
2401                 if (sp->root_count)
2402                         continue;
2403
2404                 unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
2405                                                       &nr_zapped);
2406                 total_zapped += nr_zapped;
2407                 if (total_zapped >= nr_to_zap)
2408                         break;
2409
2410                 if (unstable)
2411                         goto restart;
2412         }
2413
2414         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2415
2416         kvm->stat.mmu_recycled += total_zapped;
2417         return total_zapped;
2418 }
2419
2420 static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
2421 {
2422         if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
2423                 return kvm->arch.n_max_mmu_pages -
2424                         kvm->arch.n_used_mmu_pages;
2425
2426         return 0;
2427 }
2428
2429 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2430 {
2431         unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
2432
2433         if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
2434                 return 0;
2435
2436         kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
2437
2438         /*
2439          * Note, this check is intentionally soft, it only guarantees that one
2440          * page is available, while the caller may end up allocating as many as
2441          * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
2442          * exceeding the (arbitrary by default) limit will not harm the host,
2443          * being too aggressive may unnecessarily kill the guest, and getting an
2444          * exact count is far more trouble than it's worth, especially in the
2445          * page fault paths.
2446          */
2447         if (!kvm_mmu_available_pages(vcpu->kvm))
2448                 return -ENOSPC;
2449         return 0;
2450 }
2451
2452 /*
2453  * Changing the number of mmu pages allocated to the vm.
2454  * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
2455  */
2456 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2457 {
2458         write_lock(&kvm->mmu_lock);
2459
2460         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2461                 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
2462                                                   goal_nr_mmu_pages);
2463
2464                 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2465         }
2466
2467         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2468
2469         write_unlock(&kvm->mmu_lock);
2470 }
2471
2472 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2473 {
2474         struct kvm_mmu_page *sp;
2475         LIST_HEAD(invalid_list);
2476         int r;
2477
2478         pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2479         r = 0;
2480         write_lock(&kvm->mmu_lock);
2481         for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2482                 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2483                          sp->role.word);
2484                 r = 1;
2485                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2486         }
2487         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2488         write_unlock(&kvm->mmu_lock);
2489
2490         return r;
2491 }
2492
2493 static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2494 {
2495         gpa_t gpa;
2496         int r;
2497
2498         if (vcpu->arch.mmu->direct_map)
2499                 return 0;
2500
2501         gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2502
2503         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2504
2505         return r;
2506 }
2507
2508 static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2509 {
2510         trace_kvm_mmu_unsync_page(sp);
2511         ++kvm->stat.mmu_unsync;
2512         sp->unsync = 1;
2513
2514         kvm_mmu_mark_parents_unsync(sp);
2515 }
2516
2517 /*
2518  * Attempt to unsync any shadow pages that can be reached by the specified gfn,
2519  * i.e. KVM is creating a writable mapping for said gfn.  Returns 0 if all pages
2520  * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
2521  * be write-protected.
2522  */
2523 int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
2524                             gfn_t gfn, bool can_unsync, bool prefetch)
2525 {
2526         struct kvm_mmu_page *sp;
2527         bool locked = false;
2528
2529         /*
2530          * Force write-protection if the page is being tracked.  Note, the page
2531          * track machinery is used to write-protect upper-level shadow pages,
2532          * i.e. this guards the role.level == 4K assertion below!
2533          */
2534         if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
2535                 return -EPERM;
2536
2537         /*
2538          * The page is not write-tracked; mark existing shadow pages unsync
2539          * unless KVM is synchronizing an unsync SP (can_unsync = false).  In
2540          * that case, KVM must complete emulation of the guest TLB flush before
2541          * allowing shadow pages to become unsync (writable by the guest).
2542          */
2543         for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2544                 if (!can_unsync)
2545                         return -EPERM;
2546
2547                 if (sp->unsync)
2548                         continue;
2549
2550                 if (prefetch)
2551                         return -EEXIST;
2552
2553                 /*
2554                  * TDP MMU page faults require an additional spinlock as they
2555                  * run with mmu_lock held for read, not write, and the unsync
2556                  * logic is not thread safe.  Take the spinlock regardless of
2557                  * the MMU type to avoid extra conditionals/parameters, there's
2558                  * no meaningful penalty if mmu_lock is held for write.
2559                  */
2560                 if (!locked) {
2561                         locked = true;
2562                         spin_lock(&kvm->arch.mmu_unsync_pages_lock);
2563
2564                         /*
2565                          * Recheck after taking the spinlock, a different vCPU
2566                          * may have since marked the page unsync.  A false
2567                          * positive on the unprotected check above is not
2568                          * possible as clearing sp->unsync _must_ hold mmu_lock
2569                          * for write, i.e. unsync cannot transition from 0->1
2570                          * while this CPU holds mmu_lock for read (or write).
2571                          */
2572                         if (READ_ONCE(sp->unsync))
2573                                 continue;
2574                 }
2575
2576                 WARN_ON(sp->role.level != PG_LEVEL_4K);
2577                 kvm_unsync_page(kvm, sp);
2578         }
2579         if (locked)
2580                 spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
2581
2582         /*
2583          * We need to ensure that the marking of unsync pages is visible
2584          * before the SPTE is updated to allow writes because
2585          * kvm_mmu_sync_roots() checks the unsync flags without holding
2586          * the MMU lock and so can race with this. If the SPTE was updated
2587          * before the page had been marked as unsync-ed, something like the
2588          * following could happen:
2589          *
2590          * CPU 1                    CPU 2
2591          * ---------------------------------------------------------------------
2592          * 1.2 Host updates SPTE
2593          *     to be writable
2594          *                      2.1 Guest writes a GPTE for GVA X.
2595          *                          (GPTE being in the guest page table shadowed
2596          *                           by the SP from CPU 1.)
2597          *                          This reads SPTE during the page table walk.
2598          *                          Since SPTE.W is read as 1, there is no
2599          *                          fault.
2600          *
2601          *                      2.2 Guest issues TLB flush.
2602          *                          That causes a VM Exit.
2603          *
2604          *                      2.3 Walking of unsync pages sees sp->unsync is
2605          *                          false and skips the page.
2606          *
2607          *                      2.4 Guest accesses GVA X.
2608          *                          Since the mapping in the SP was not updated,
2609          *                          the old mapping for GVA X incorrectly
2610          *                          gets used.
2611          * 1.1 Host marks SP
2612          *     as unsync
2613          *     (sp->unsync = true)
2614          *
2615          * The write barrier below ensures that 1.1 happens before 1.2 and thus
2616          * the situation in 2.4 does not arise.  It pairs with the read barrier
2617          * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
2618          */
2619         smp_wmb();
2620
2621         return 0;
2622 }
2623
2624 static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
2625                         u64 *sptep, unsigned int pte_access, gfn_t gfn,
2626                         kvm_pfn_t pfn, struct kvm_page_fault *fault)
2627 {
2628         struct kvm_mmu_page *sp = sptep_to_sp(sptep);
2629         int level = sp->role.level;
2630         int was_rmapped = 0;
2631         int ret = RET_PF_FIXED;
2632         bool flush = false;
2633         bool wrprot;
2634         u64 spte;
2635
2636         /* Prefetching always gets a writable pfn.  */
2637         bool host_writable = !fault || fault->map_writable;
2638         bool prefetch = !fault || fault->prefetch;
2639         bool write_fault = fault && fault->write;
2640
2641         pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
2642                  *sptep, write_fault, gfn);
2643
2644         if (unlikely(is_noslot_pfn(pfn))) {
2645                 mark_mmio_spte(vcpu, sptep, gfn, pte_access);
2646                 return RET_PF_EMULATE;
2647         }
2648
2649         if (is_shadow_present_pte(*sptep)) {
2650                 /*
2651                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2652                  * the parent of the now unreachable PTE.
2653                  */
2654                 if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
2655                         struct kvm_mmu_page *child;
2656                         u64 pte = *sptep;
2657
2658                         child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
2659                         drop_parent_pte(child, sptep);
2660                         flush = true;
2661                 } else if (pfn != spte_to_pfn(*sptep)) {
2662                         pgprintk("hfn old %llx new %llx\n",
2663                                  spte_to_pfn(*sptep), pfn);
2664                         drop_spte(vcpu->kvm, sptep);
2665                         flush = true;
2666                 } else
2667                         was_rmapped = 1;
2668         }
2669
2670         wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
2671                            true, host_writable, &spte);
2672
2673         if (*sptep == spte) {
2674                 ret = RET_PF_SPURIOUS;
2675         } else {
2676                 flush |= mmu_spte_update(sptep, spte);
2677                 trace_kvm_mmu_set_spte(level, gfn, sptep);
2678         }
2679
2680         if (wrprot) {
2681                 if (write_fault)
2682                         ret = RET_PF_EMULATE;
2683         }
2684
2685         if (flush)
2686                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
2687                                 KVM_PAGES_PER_HPAGE(level));
2688
2689         pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2690
2691         if (!was_rmapped) {
2692                 WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
2693                 kvm_update_page_stats(vcpu->kvm, level, 1);
2694                 rmap_add(vcpu, slot, sptep, gfn);
2695         }
2696
2697         return ret;
2698 }
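
/*
 * Summary of the return values above: RET_PF_EMULATE for MMIO (no memslot)
 * or for a write fault that remains write-protected, RET_PF_SPURIOUS when
 * the desired SPTE was already in place, and RET_PF_FIXED otherwise.
 */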
2699
2700 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2701                                     struct kvm_mmu_page *sp,
2702                                     u64 *start, u64 *end)
2703 {
2704         struct page *pages[PTE_PREFETCH_NUM];
2705         struct kvm_memory_slot *slot;
2706         unsigned int access = sp->role.access;
2707         int i, ret;
2708         gfn_t gfn;
2709
2710         gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2711         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
2712         if (!slot)
2713                 return -1;
2714
2715         ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
2716         if (ret <= 0)
2717                 return -1;
2718
2719         for (i = 0; i < ret; i++, gfn++, start++) {
2720                 mmu_set_spte(vcpu, slot, start, access, gfn,
2721                              page_to_pfn(pages[i]), NULL);
2722                 put_page(pages[i]);
2723         }
2724
2725         return 0;
2726 }
2727
2728 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2729                                   struct kvm_mmu_page *sp, u64 *sptep)
2730 {
2731         u64 *spte, *start = NULL;
2732         int i;
2733
2734         WARN_ON(!sp->role.direct);
2735
2736         i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2737         spte = sp->spt + i;
2738
2739         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2740                 if (is_shadow_present_pte(*spte) || spte == sptep) {
2741                         if (!start)
2742                                 continue;
2743                         if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2744                                 return;
2745                         start = NULL;
2746                 } else if (!start)
2747                         start = spte;
2748         }
2749         if (start)
2750                 direct_pte_prefetch_many(vcpu, sp, start, spte);
2751 }
2752
2753 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2754 {
2755         struct kvm_mmu_page *sp;
2756
2757         sp = sptep_to_sp(sptep);
2758
2759         /*
2760          * Without accessed bits, there's no way to distinguish between
2761          * actually accessed translations and prefetched, so disable pte
2762          * prefetch if accessed bits aren't available.
2763          */
2764         if (sp_ad_disabled(sp))
2765                 return;
2766
2767         if (sp->role.level > PG_LEVEL_4K)
2768                 return;
2769
2770         /*
2771          * If addresses are being invalidated, skip prefetching to avoid
2772          * accidentally prefetching those addresses.
2773          */
2774         if (unlikely(vcpu->kvm->mmu_notifier_count))
2775                 return;
2776
2777         __direct_pte_prefetch(vcpu, sp, sptep);
2778 }
2779
2780 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
2781                                   const struct kvm_memory_slot *slot)
2782 {
2783         unsigned long hva;
2784         unsigned long flags;
2785         int level = PG_LEVEL_4K;
2786         pgd_t pgd;
2787         p4d_t p4d;
2788         pud_t pud;
2789         pmd_t pmd;
2790
2791         if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
2792                 return PG_LEVEL_4K;
2793
2794         /*
2795          * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
2796          * is not solely for performance, it's also necessary to avoid the
2797          * "writable" check in __gfn_to_hva_many(), which will always fail on
2798          * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
2799          * page fault steps have already verified the guest isn't writing a
2800          * read-only memslot.
2801          */
2802         hva = __gfn_to_hva_memslot(slot, gfn);
2803
2804         /*
2805          * Lookup the mapping level in the current mm.  The information
2806          * may become stale soon, but it is safe to use as long as
2807          * 1) mmu_notifier_retry was checked after taking mmu_lock, and
2808          * 2) mmu_lock is taken now.
2809          *
2810          * We still need to disable IRQs to prevent concurrent tear down
2811          * of page tables.
2812          */
2813         local_irq_save(flags);
2814
2815         pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
2816         if (pgd_none(pgd))
2817                 goto out;
2818
2819         p4d = READ_ONCE(*p4d_offset(&pgd, hva));
2820         if (p4d_none(p4d) || !p4d_present(p4d))
2821                 goto out;
2822
2823         pud = READ_ONCE(*pud_offset(&p4d, hva));
2824         if (pud_none(pud) || !pud_present(pud))
2825                 goto out;
2826
2827         if (pud_large(pud)) {
2828                 level = PG_LEVEL_1G;
2829                 goto out;
2830         }
2831
2832         pmd = READ_ONCE(*pmd_offset(&pud, hva));
2833         if (pmd_none(pmd) || !pmd_present(pmd))
2834                 goto out;
2835
2836         if (pmd_large(pmd))
2837                 level = PG_LEVEL_2M;
2838
2839 out:
2840         local_irq_restore(flags);
2841         return level;
2842 }
2843
2844 int kvm_mmu_max_mapping_level(struct kvm *kvm,
2845                               const struct kvm_memory_slot *slot, gfn_t gfn,
2846                               kvm_pfn_t pfn, int max_level)
2847 {
2848         struct kvm_lpage_info *linfo;
2849         int host_level;
2850
2851         max_level = min(max_level, max_huge_page_level);
2852         for ( ; max_level > PG_LEVEL_4K; max_level--) {
2853                 linfo = lpage_info_slot(gfn, slot, max_level);
2854                 if (!linfo->disallow_lpage)
2855                         break;
2856         }
2857
2858         if (max_level == PG_LEVEL_4K)
2859                 return PG_LEVEL_4K;
2860
2861         host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
2862         return min(host_level, max_level);
2863 }
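
/*
 * Example: if the memslot's lpage_info permits a 2M mapping at this gfn but
 * host_pfn_mapping_level() finds the pfn mapped by a 4K host PTE, the result
 * is PG_LEVEL_4K; the returned level is always the minimum of what the slot
 * and the host mapping allow (further capped by max_huge_page_level).
 */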
2864
2865 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
2866 {
2867         struct kvm_memory_slot *slot = fault->slot;
2868         kvm_pfn_t mask;
2869
2870         fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
2871
2872         if (unlikely(fault->max_level == PG_LEVEL_4K))
2873                 return;
2874
2875         if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
2876                 return;
2877
2878         if (kvm_slot_dirty_track_enabled(slot))
2879                 return;
2880
2881         /*
2882          * Enforce the iTLB multihit workaround after capturing the requested
2883          * level, which will be used to do precise, accurate accounting.
2884          */
2885         fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
2886                                                      fault->gfn, fault->pfn,
2887                                                      fault->max_level);
2888         if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
2889                 return;
2890
2891         /*
2892          * mmu_notifier_retry() was successful and mmu_lock is held, so
2893          * the pmd can't be split from under us.
2894          */
2895         fault->goal_level = fault->req_level;
2896         mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
2897         VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
2898         fault->pfn &= ~mask;
2899 }
2900
2901 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
2902 {
2903         if (cur_level > PG_LEVEL_4K &&
2904             cur_level == fault->goal_level &&
2905             is_shadow_present_pte(spte) &&
2906             !is_large_pte(spte)) {
2907                 /*
2908                  * A small SPTE exists for this pfn, but FNAME(fetch)
2909                  * and __direct_map would like to create a large PTE
2910                  * instead: just force them to go down another level,
2911                  * patching the next 9 bits of the address back
2912                  * into the pfn.
2913                  */
2914                 u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
2915                                 KVM_PAGES_PER_HPAGE(cur_level - 1);
2916                 fault->pfn |= fault->gfn & page_mask;
2917                 fault->goal_level--;
2918         }
2919 }
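
/*
 * Worked example for the adjustment above: with cur_level == PG_LEVEL_2M,
 * KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) is 512 and KVM_PAGES_PER_HPAGE(PG_LEVEL_4K)
 * is 1, so page_mask is 511 and the low 9 bits of the gfn are copied into the
 * pfn before goal_level drops to PG_LEVEL_4K, i.e. the walk retries one level
 * down at the exact 4K frame backing the faulting address.
 */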
2920
2921 static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
2922 {
2923         struct kvm_shadow_walk_iterator it;
2924         struct kvm_mmu_page *sp;
2925         int ret;
2926         gfn_t base_gfn = fault->gfn;
2927
2928         kvm_mmu_hugepage_adjust(vcpu, fault);
2929
2930         trace_kvm_mmu_spte_requested(fault);
2931         for_each_shadow_entry(vcpu, fault->addr, it) {
2932                 /*
2933                  * We cannot overwrite existing page tables with an NX
2934                  * large page, as the leaf could be executable.
2935                  */
2936                 if (fault->nx_huge_page_workaround_enabled)
2937                         disallowed_hugepage_adjust(fault, *it.sptep, it.level);
2938
2939                 base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
2940                 if (it.level == fault->goal_level)
2941                         break;
2942
2943                 drop_large_spte(vcpu, it.sptep);
2944                 if (is_shadow_present_pte(*it.sptep))
2945                         continue;
2946
2947                 sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
2948                                       it.level - 1, true, ACC_ALL);
2949
2950                 link_shadow_page(vcpu, it.sptep, sp);
2951                 if (fault->is_tdp && fault->huge_page_disallowed &&
2952                     fault->req_level >= it.level)
2953                         account_huge_nx_page(vcpu->kvm, sp);
2954         }
2955
2956         if (WARN_ON_ONCE(it.level != fault->goal_level))
2957                 return -EFAULT;
2958
2959         ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
2960                            base_gfn, fault->pfn, fault);
2961         if (ret == RET_PF_SPURIOUS)
2962                 return ret;
2963
2964         direct_pte_prefetch(vcpu, it.sptep);
2965         ++vcpu->stat.pf_fixed;
2966         return ret;
2967 }
2968
2969 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2970 {
2971         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
2972 }
2973
2974 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
2975 {
2976         /*
2977          * Do not cache the mmio info caused by writing the readonly gfn
2978          * into the spte, otherwise read accesses on the readonly gfn can
2979          * also cause mmio page faults and be treated as mmio accesses.
2980          */
2981         if (pfn == KVM_PFN_ERR_RO_FAULT)
2982                 return RET_PF_EMULATE;
2983
2984         if (pfn == KVM_PFN_ERR_HWPOISON) {
2985                 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
2986                 return RET_PF_RETRY;
2987         }
2988
2989         return -EFAULT;
2990 }
2991
2992 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
2993                                 unsigned int access, int *ret_val)
2994 {
2995         /* The pfn is invalid, report the error! */
2996         if (unlikely(is_error_pfn(fault->pfn))) {
2997                 *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
2998                 return true;
2999         }
3000
3001         if (unlikely(!fault->slot)) {
3002                 gva_t gva = fault->is_tdp ? 0 : fault->addr;
3003
3004                 vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
3005                                      access & shadow_mmio_access_mask);
3006                 /*
3007                  * If MMIO caching is disabled, emulate immediately without
3008                  * touching the shadow page tables as attempting to install an
3009                  * MMIO SPTE will just be an expensive nop.  Do not cache MMIO
3010                  * whose gfn is greater than host.MAXPHYADDR; any guest that
3011                  * generates such gfns is running nested and is being tricked
3012                  * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
3013                  * and only if L1's MAXPHYADDR is inaccurate with respect to
3014                  * the hardware's).
3015                  */
3016                 if (unlikely(!shadow_mmio_value) ||
3017                     unlikely(fault->gfn > kvm_mmu_max_gfn())) {
3018                         *ret_val = RET_PF_EMULATE;
3019                         return true;
3020                 }
3021         }
3022
3023         return false;
3024 }
3025
3026 static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
3027 {
3028         /*
3029          * Do not fix an MMIO SPTE with an invalid generation number; it
3030          * needs to be updated by the slow page fault path.
3031          */
3032         if (fault->rsvd)
3033                 return false;
3034
3035         /* See if the page fault is due to an NX violation */
3036         if (unlikely(fault->exec && fault->present))
3037                 return false;
3038
3039         /*
3040          * #PF can be fast if:
3041          * 1. The shadow page table entry is not present, which could mean that
3042          *    the fault is potentially caused by access tracking (if enabled).
3043          * 2. The shadow page table entry is present and the fault is caused
3044          *    by write-protect, in which case we only need to change the W bit
3045          *    of the SPTE, which can be done without holding mmu_lock.
3046          *
3047          * However, if access tracking is disabled we know that a non-present
3048          * page must be a genuine page fault where we have to create a new SPTE.
3049          * So, if access tracking is disabled, we return true only for write
3050          * accesses to a present page.
3051          */
3052
3053         return shadow_acc_track_mask != 0 || (fault->write && fault->present);
3054 }
3055
3056 /*
3057  * Returns true if the SPTE was fixed successfully. Otherwise,
3058  * someone else modified the SPTE from its original value.
3059  */
3060 static bool
3061 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3062                         u64 *sptep, u64 old_spte, u64 new_spte)
3063 {
3064         /*
3065          * Theoretically we could also set the dirty bit (and flush the TLB)
3066          * here in order to eliminate unnecessary PML logging. See the comments
3067          * in set_spte. But fast_page_fault is very unlikely to happen with PML
3068          * enabled, so we do not do this. This might result in the same GPA
3069          * being logged in the PML buffer again when the write really happens,
3070          * and eventually in mark_page_dirty being called twice, but that is
3071          * harmless. It also avoids the TLB flush needed after setting the
3072          * dirty bit, so non-PML cases won't be impacted.
3073          *
3074          * Compare with set_spte, where shadow_dirty_mask is set instead.
3075          */
3076         if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3077                 return false;
3078
3079         if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
3080                 mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
3081
3082         return true;
3083 }
3084
3085 static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
3086 {
3087         if (fault->exec)
3088                 return is_executable_pte(spte);
3089
3090         if (fault->write)
3091                 return is_writable_pte(spte);
3092
3093         /* Fault was on Read access */
3094         return spte & PT_PRESENT_MASK;
3095 }
3096
3097 /*
3098  * Returns the last-level SPTE pointer of the shadow page walk for the given
3099  * gpa, and sets *spte to the SPTE value. This SPTE may be non-present. If no
3100  * walk could be performed, returns NULL and *spte does not contain valid data.
3101  *
3102  * Contract:
3103  *  - Must be called between walk_shadow_page_lockless_{begin,end}.
3104  *  - The returned sptep must not be used after walk_shadow_page_lockless_end.
3105  */
3106 static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
3107 {
3108         struct kvm_shadow_walk_iterator iterator;
3109         u64 old_spte;
3110         u64 *sptep = NULL;
3111
3112         for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
3113                 sptep = iterator.sptep;
3114                 *spte = old_spte;
3115         }
3116
3117         return sptep;
3118 }
3119
3120 /*
3121  * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
3122  */
3123 static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3124 {
3125         struct kvm_mmu_page *sp;
3126         int ret = RET_PF_INVALID;
3127         u64 spte = 0ull;
3128         u64 *sptep = NULL;
3129         uint retry_count = 0;
3130
3131         if (!page_fault_can_be_fast(fault))
3132                 return ret;
3133
3134         walk_shadow_page_lockless_begin(vcpu);
3135
3136         do {
3137                 u64 new_spte;
3138
3139                 if (is_tdp_mmu(vcpu->arch.mmu))
3140                         sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3141                 else
3142                         sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3143
3144                 if (!is_shadow_present_pte(spte))
3145                         break;
3146
3147                 sp = sptep_to_sp(sptep);
3148                 if (!is_last_spte(spte, sp->role.level))
3149                         break;
3150
3151                 /*
3152                  * Check whether the memory access that caused the fault would
3153                  * still cause it if it were to be performed right now. If not,
3154                  * then this is a spurious fault caused by a lazily flushed TLB
3155                  * entry, or some other CPU has already fixed the PTE after the
3156                  * current CPU took the fault.
3157                  *
3158                  * There is no need to check the access of upper-level table
3159                  * entries since they are always ACC_ALL.
3160                  */
3161                 if (is_access_allowed(fault, spte)) {
3162                         ret = RET_PF_SPURIOUS;
3163                         break;
3164                 }
3165
3166                 new_spte = spte;
3167
3168                 if (is_access_track_spte(spte))
3169                         new_spte = restore_acc_track_spte(new_spte);
3170
3171                 /*
3172                  * Currently, to simplify the code, write-protection can
3173                  * be removed in the fast path only if the SPTE was
3174                  * write-protected for dirty-logging or access tracking.
3175                  */
3176                 if (fault->write && is_mmu_writable_spte(spte)) {
3177                         new_spte |= PT_WRITABLE_MASK;
3178
3179                         /*
3180                          * Do not fix write-permission on a large SPTE when
3181                          * dirty logging is enabled. Since only the first page
3182                          * covered by the SPTE is marked in the dirty bitmap by
3183                          * fast_pf_fix_direct_spte(), the other pages would be
3184                          * missed if the slot has dirty logging enabled.
3185                          *
3186                          * Instead, let the slow page fault path create a
3187                          * normal SPTE to fix the access.
3188                          */
3189                         if (sp->role.level > PG_LEVEL_4K &&
3190                             kvm_slot_dirty_track_enabled(fault->slot))
3191                                 break;
3192                 }
3193
3194                 /* Verify that the fault can be handled in the fast path */
3195                 if (new_spte == spte ||
3196                     !is_access_allowed(fault, new_spte))
3197                         break;
3198
3199                 /*
3200                  * Currently, fast page fault only works for direct mapping
3201                  * since the gfn is not stable for an indirect shadow page. See
3202                  * Documentation/virt/kvm/locking.rst for more details.
3203                  */
3204                 if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
3205                         ret = RET_PF_FIXED;
3206                         break;
3207                 }
3208
3209                 if (++retry_count > 4) {
3210                         printk_once(KERN_WARNING
3211                                 "kvm: Fast #PF retrying more than 4 times.\n");
3212                         break;
3213                 }
3214
3215         } while (true);
3216
3217         trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
3218         walk_shadow_page_lockless_end(vcpu);
3219
3220         return ret;
3221 }
3222
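/*
 * Free a single root: TDP MMU roots are put directly, while shadow roots drop
 * a root_count reference and, if that was the last reference on an
 * already-invalidated page, are queued for zapping on @invalid_list.
 * *root_hpa is reset to INVALID_PAGE either way.
 */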
3223 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3224                                struct list_head *invalid_list)
3225 {
3226         struct kvm_mmu_page *sp;
3227
3228         if (!VALID_PAGE(*root_hpa))
3229                 return;
3230
3231         sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
3232         if (WARN_ON(!sp))
3233                 return;
3234
3235         if (is_tdp_mmu_page(sp))
3236                 kvm_tdp_mmu_put_root(kvm, sp, false);
3237         else if (!--sp->root_count && sp->role.invalid)
3238                 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3239
3240         *root_hpa = INVALID_PAGE;
3241 }
3242
3243 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3244 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
3245                         ulong roots_to_free)
3246 {
3247         int i;
3248         LIST_HEAD(invalid_list);
3249         bool free_active_root;
3250
3251         BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3252
3253         /* Before acquiring the MMU lock, see if we need to do any real work. */
3254         free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
3255                 && VALID_PAGE(mmu->root.hpa);
3256
3257         if (!free_active_root) {
3258                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3259                         if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3260                             VALID_PAGE(mmu->prev_roots[i].hpa))
3261                                 break;
3262
3263                 if (i == KVM_MMU_NUM_PREV_ROOTS)
3264                         return;
3265         }
3266
3267         write_lock(&kvm->mmu_lock);
3268
3269         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3270                 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3271                         mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
3272                                            &invalid_list);
3273
3274         if (free_active_root) {
3275                 if (to_shadow_page(mmu->root.hpa)) {
3276                         mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
3277                 } else if (mmu->pae_root) {
3278                         for (i = 0; i < 4; ++i) {
3279                                 if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
3280                                         continue;
3281
3282                                 mmu_free_root_page(kvm, &mmu->pae_root[i],
3283                                                    &invalid_list);
3284                                 mmu->pae_root[i] = INVALID_PAE_ROOT;
3285                         }
3286                 }
3287                 mmu->root.hpa = INVALID_PAGE;
3288                 mmu->root.pgd = 0;
3289         }
3290
3291         kvm_mmu_commit_zap_page(kvm, &invalid_list);
3292         write_unlock(&kvm->mmu_lock);
3293 }
3294 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3295
3296 void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
3297 {
3298         unsigned long roots_to_free = 0;
3299         hpa_t root_hpa;
3300         int i;
3301
3302         /*
3303          * This should not be called while L2 is active; L2 can't invalidate
3304          * _only_ its own roots, e.g. INVVPID unconditionally exits.
3305          */
3306         WARN_ON_ONCE(mmu->mmu_role.base.guest_mode);
3307
3308         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
3309                 root_hpa = mmu->prev_roots[i].hpa;
3310                 if (!VALID_PAGE(root_hpa))
3311                         continue;
3312
3313                 if (!to_shadow_page(root_hpa) ||
3314                         to_shadow_page(root_hpa)->role.guest_mode)
3315                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3316         }
3317
3318         kvm_mmu_free_roots(kvm, mmu, roots_to_free);
3319 }
3320 EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
3321
3322
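/*
 * Verify that the guest's root gfn is backed by a memslot that is visible to
 * the vCPU.  If it isn't, request a triple fault and return non-zero so the
 * caller aborts root allocation.
 */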
3323 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3324 {
3325         int ret = 0;
3326
3327         if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
3328                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3329                 ret = 1;
3330         }
3331
3332         return ret;
3333 }
3334
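/*
 * Get (or create) the shadow page that will serve as a root at @level, pin it
 * by bumping its root_count, and return the physical address of its page
 * table.
 */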
3335 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
3336                             u8 level, bool direct)
3337 {
3338         struct kvm_mmu_page *sp;
3339
3340         sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
3341         ++sp->root_count;
3342
3343         return __pa(sp->spt);
3344 }
3345
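/*
 * Allocate the root(s) for a direct MMU: a single TDP MMU root when the TDP
 * MMU is in use, a single shadow root for 4/5-level paging, or four PAE roots
 * for a 3-level (PAE) shadow MMU.
 */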
3346 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3347 {
3348         struct kvm_mmu *mmu = vcpu->arch.mmu;
3349         u8 shadow_root_level = mmu->shadow_root_level;
3350         hpa_t root;
3351         unsigned i;
3352         int r;
3353
3354         write_lock(&vcpu->kvm->mmu_lock);
3355         r = make_mmu_pages_available(vcpu);
3356         if (r < 0)
3357                 goto out_unlock;
3358
3359         if (is_tdp_mmu_enabled(vcpu->kvm)) {
3360                 root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
3361                 mmu->root.hpa = root;
3362         } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
3363                 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
3364                 mmu->root.hpa = root;
3365         } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
3366                 if (WARN_ON_ONCE(!mmu->pae_root)) {
3367                         r = -EIO;
3368                         goto out_unlock;
3369                 }
3370
3371                 for (i = 0; i < 4; ++i) {
3372                         WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3373
3374                         root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
3375                                               i << 30, PT32_ROOT_LEVEL, true);
3376                         mmu->pae_root[i] = root | PT_PRESENT_MASK |
3377                                            shadow_me_mask;
3378                 }
3379                 mmu->root.hpa = __pa(mmu->pae_root);
3380         } else {
3381                 WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
3382                 r = -EIO;
3383                 goto out_unlock;
3384         }
3385
3386         /* root.pgd is ignored for direct MMUs. */
3387         mmu->root.pgd = 0;
3388 out_unlock:
3389         write_unlock(&vcpu->kvm->mmu_lock);
3390         return r;
3391 }
3392
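/*
 * Allocate the per-memslot metadata (rmaps and write-tracking counts) needed
 * by shadow paging the first time a shadow root is created for this VM, e.g.
 * when a VM that normally uses the TDP MMU starts shadowing nested page
 * tables.
 */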
3393 static int mmu_first_shadow_root_alloc(struct kvm *kvm)
3394 {
3395         struct kvm_memslots *slots;
3396         struct kvm_memory_slot *slot;
3397         int r = 0, i, bkt;
3398
3399         /*
3400          * Check if this is the first shadow root being allocated before
3401          * taking the lock.
3402          */
3403         if (kvm_shadow_root_allocated(kvm))
3404                 return 0;
3405
3406         mutex_lock(&kvm->slots_arch_lock);
3407
3408         /* Recheck, under the lock, whether this is the first shadow root. */
3409         if (kvm_shadow_root_allocated(kvm))
3410                 goto out_unlock;
3411
3412         /*
3413          * Check if anything actually needs to be allocated, e.g. all metadata
3414          * will be allocated upfront if TDP is disabled.
3415          */
3416         if (kvm_memslots_have_rmaps(kvm) &&
3417             kvm_page_track_write_tracking_enabled(kvm))
3418                 goto out_success;
3419
3420         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
3421                 slots = __kvm_memslots(kvm, i);
3422                 kvm_for_each_memslot(slot, bkt, slots) {
3423                         /*
3424                          * Both of these functions are no-ops if the target is
3425                          * already allocated, so unconditionally calling both
3426                          * is safe.  Intentionally do NOT free allocations on
3427                          * failure to avoid having to track which allocations
3428                          * were made now versus when the memslot was created.
3429                          * The metadata is guaranteed to be freed when the slot
3430                          * is freed, and will be kept/used if userspace retries
3431                          * KVM_RUN instead of killing the VM.
3432                          */
3433                         r = memslot_rmap_alloc(slot, slot->npages);
3434                         if (r)
3435                                 goto out_unlock;
3436                         r = kvm_page_track_write_tracking_alloc(slot);
3437                         if (r)
3438                                 goto out_unlock;
3439                 }
3440         }
3441
3442         /*
3443          * Ensure that shadow_root_allocated becomes true strictly after
3444          * all the related pointers are set.
3445          */
3446 out_success:
3447         smp_store_release(&kvm->arch.shadow_root_allocated, true);
3448
3449 out_unlock:
3450         mutex_unlock(&kvm->slots_arch_lock);
3451         return r;
3452 }
3453
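/*
 * Allocate the root(s) used to shadow the guest's own page tables: a single
 * root that write-protects the guest root for 4/5-level guests, or four PAE
 * roots (wrapped in pml4/pml5 dummy tables as needed) for 32-bit and PAE
 * guests.
 */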
3454 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3455 {
3456         struct kvm_mmu *mmu = vcpu->arch.mmu;
3457         u64 pdptrs[4], pm_mask;
3458         gfn_t root_gfn, root_pgd;
3459         hpa_t root;
3460         unsigned i;
3461         int r;
3462
3463         root_pgd = mmu->get_guest_pgd(vcpu);
3464         root_gfn = root_pgd >> PAGE_SHIFT;
3465
3466         if (mmu_check_root(vcpu, root_gfn))
3467                 return 1;
3468
3469         /*
3470          * On SVM, reading PDPTRs might access guest memory, which might fault
3471          * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
3472          */
3473         if (mmu->root_level == PT32E_ROOT_LEVEL) {
3474                 for (i = 0; i < 4; ++i) {
3475                         pdptrs[i] = mmu->get_pdptr(vcpu, i);
3476                         if (!(pdptrs[i] & PT_PRESENT_MASK))
3477                                 continue;
3478
3479                         if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
3480                                 return 1;
3481                 }
3482         }
3483
3484         r = mmu_first_shadow_root_alloc(vcpu->kvm);
3485         if (r)
3486                 return r;
3487
3488         write_lock(&vcpu->kvm->mmu_lock);
3489         r = make_mmu_pages_available(vcpu);
3490         if (r < 0)
3491                 goto out_unlock;
3492
3493         /*
3494          * Do we shadow a long mode page table? If so we need to
3495          * write-protect the guest's page table root.
3496          */
3497         if (mmu->root_level >= PT64_ROOT_4LEVEL) {
3498                 root = mmu_alloc_root(vcpu, root_gfn, 0,
3499                                       mmu->shadow_root_level, false);
3500                 mmu->root.hpa = root;
3501                 goto set_root_pgd;
3502         }
3503
3504         if (WARN_ON_ONCE(!mmu->pae_root)) {
3505                 r = -EIO;
3506                 goto out_unlock;
3507         }
3508
3509         /*
3510          * We shadow a 32-bit page table. This may be a legacy 2-level
3511          * or a PAE 3-level page table. In either case we need to be aware that
3512          * the shadow page table may be a PAE or a long mode page table.
3513          */
3514         pm_mask = PT_PRESENT_MASK | shadow_me_mask;
3515         if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3516                 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3517
3518                 if (WARN_ON_ONCE(!mmu->pml4_root)) {
3519                         r = -EIO;
3520                         goto out_unlock;
3521                 }
3522                 mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
3523
3524                 if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
3525                         if (WARN_ON_ONCE(!mmu->pml5_root)) {
3526                                 r = -EIO;
3527                                 goto out_unlock;
3528                         }
3529                         mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
3530                 }
3531         }
3532
3533         for (i = 0; i < 4; ++i) {
3534                 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3535
3536                 if (mmu->root_level == PT32E_ROOT_LEVEL) {
3537                         if (!(pdptrs[i] & PT_PRESENT_MASK)) {
3538                                 mmu->pae_root[i] = INVALID_PAE_ROOT;
3539                                 continue;
3540                         }
3541                         root_gfn = pdptrs[i] >> PAGE_SHIFT;
3542                 }
3543
3544                 root = mmu_alloc_root(vcpu, root_gfn, i << 30,
3545                                       PT32_ROOT_LEVEL, false);
3546                 mmu->pae_root[i] = root | pm_mask;
3547         }
3548
3549         if (mmu->shadow_root_level == PT64_ROOT_5LEVEL)
3550                 mmu->root.hpa = __pa(mmu->pml5_root);
3551         else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3552                 mmu->root.hpa = __pa(mmu->pml4_root);
3553         else
3554                 mmu->root.hpa = __pa(mmu->pae_root);
3555
3556 set_root_pgd:
3557         mmu->root.pgd = root_pgd;
3558 out_unlock:
3559         write_unlock(&vcpu->kvm->mmu_lock);
3560
3561         return r;
3562 }
3563
3564 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
3565 {
3566         struct kvm_mmu *mmu = vcpu->arch.mmu;
3567         bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL;
3568         u64 *pml5_root = NULL;
3569         u64 *pml4_root = NULL;
3570         u64 *pae_root;
3571
3572         /*
3573          * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
3574          * tables are allocated and initialized at root creation as there is no
3575          * equivalent level in the guest's NPT to shadow.  Allocate the tables
3576          * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
3577          */
3578         if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
3579             mmu->shadow_root_level < PT64_ROOT_4LEVEL)
3580                 return 0;
3581
3582         /*
3583          * NPT, the only paging mode that uses this horror, uses a fixed number
3584          * of levels for the shadow page tables, e.g. all MMUs are 4-level or
3585          * all MMUs are 5-level.  Thus, this can safely require that pml5_root
3586          * is allocated if the other roots are valid and pml5 is needed, as any
3587          * prior MMU would also have required pml5.
3588          */
3589         if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
3590                 return 0;
3591
3592         /*
3593          * The special roots should always be allocated in concert.  Yell and
3594          * bail if KVM ends up in a state where only one of the roots is valid.
3595          */
3596         if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
3597                          (need_pml5 && mmu->pml5_root)))
3598                 return -EIO;
3599
3600         /*
3601          * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
3602          * doesn't need to be decrypted.
3603          */
3604         pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3605         if (!pae_root)
3606                 return -ENOMEM;
3607
3608 #ifdef CONFIG_X86_64
3609         pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3610         if (!pml4_root)
3611                 goto err_pml4;
3612
3613         if (need_pml5) {
3614                 pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3615                 if (!pml5_root)
3616                         goto err_pml5;
3617         }
3618 #endif
3619
3620         mmu->pae_root = pae_root;
3621         mmu->pml4_root = pml4_root;
3622         mmu->pml5_root = pml5_root;
3623
3624         return 0;
3625
3626 #ifdef CONFIG_X86_64
3627 err_pml5:
3628         free_page((unsigned long)pml4_root);
3629 err_pml4:
3630         free_page((unsigned long)pae_root);
3631         return -ENOMEM;
3632 #endif
3633 }
3634
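/*
 * Check whether a root has unsynchronized children, i.e. whether the guest
 * may have modified page tables shadowed by this root that must be synced
 * before the root can be trusted again.
 */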
3635 static bool is_unsync_root(hpa_t root)
3636 {
3637         struct kvm_mmu_page *sp;
3638
3639         if (!VALID_PAGE(root))
3640                 return false;
3641
3642         /*
3643          * The read barrier orders the CPU's read of SPTE.W during the page table
3644          * walk before the reads of sp->unsync/sp->unsync_children here.
3645          *
3646          * Even if another CPU was marking the SP as unsync-ed simultaneously,
3647          * any guest page table changes are not guaranteed to be visible anyway
3648          * until this VCPU issues a TLB flush strictly after those changes are
3649          * made.  We only need to ensure that the other CPU sets these flags
3650          * before any actual changes to the page tables are made.  The comments
3651          * in mmu_try_to_unsync_pages() describe what could go wrong if this
3652          * requirement isn't satisfied.
3653          */
3654         smp_rmb();
3655         sp = to_shadow_page(root);
3656
3657         /*
3658          * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, so the
3659          * PDPTEs for a given PAE root need to be synchronized individually.
3660          */
3661         if (WARN_ON_ONCE(!sp))
3662                 return false;
3663
3664         if (sp->unsync || sp->unsync_children)
3665                 return true;
3666
3667         return false;
3668 }
3669
3670 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3671 {
3672         int i;
3673         struct kvm_mmu_page *sp;
3674
3675         if (vcpu->arch.mmu->direct_map)
3676                 return;
3677
3678         if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
3679                 return;
3680
3681         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3682
3683         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3684                 hpa_t root = vcpu->arch.mmu->root.hpa;
3685                 sp = to_shadow_page(root);
3686
3687                 if (!is_unsync_root(root))
3688                         return;
3689
3690                 write_lock(&vcpu->kvm->mmu_lock);
3691                 mmu_sync_children(vcpu, sp, true);
3692                 write_unlock(&vcpu->kvm->mmu_lock);
3693                 return;
3694         }
3695
3696         write_lock(&vcpu->kvm->mmu_lock);
3697
3698         for (i = 0; i < 4; ++i) {
3699                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3700
3701                 if (IS_VALID_PAE_ROOT(root)) {
3702                         root &= PT64_BASE_ADDR_MASK;
3703                         sp = to_shadow_page(root);
3704                         mmu_sync_children(vcpu, sp, true);
3705                 }
3706         }
3707
3708         write_unlock(&vcpu->kvm->mmu_lock);
3709 }
3710
3711 void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
3712 {
3713         unsigned long roots_to_free = 0;
3714         int i;
3715
3716         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3717                 if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
3718                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3719
3720         /* sync prev_roots by simply freeing them */
3721         kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
3722 }
3723
3724 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3725                                   gpa_t vaddr, u64 access,
3726                                   struct x86_exception *exception)
3727 {
3728         if (exception)
3729                 exception->error_code = 0;
3730         return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
3731 }
3732
3733 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3734 {
3735         /*
3736          * A nested guest cannot use the MMIO cache if it is using nested
3737          * page tables, because cr2 is a nGPA while the cache stores GPAs.
3738          */
3739         if (mmu_is_nested(vcpu))
3740                 return false;
3741
3742         if (direct)
3743                 return vcpu_match_mmio_gpa(vcpu, addr);
3744
3745         return vcpu_match_mmio_gva(vcpu, addr);
3746 }
3747
3748 /*
3749  * Return the level of the lowest level SPTE added to sptes.
3750  * That SPTE may be non-present.
3751  *
3752  * Must be called between walk_shadow_page_lockless_{begin,end}.
3753  */
3754 static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
3755 {
3756         struct kvm_shadow_walk_iterator iterator;
3757         int leaf = -1;
3758         u64 spte;
3759
3760         for (shadow_walk_init(&iterator, vcpu, addr),
3761              *root_level = iterator.level;
3762              shadow_walk_okay(&iterator);
3763              __shadow_walk_next(&iterator, spte)) {
3764                 leaf = iterator.level;
3765                 spte = mmu_spte_get_lockless(iterator.sptep);
3766
3767                 sptes[leaf] = spte;
3768         }
3769
3770         return leaf;
3771 }
3772
3773 /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
3774 static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3775 {
3776         u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
3777         struct rsvd_bits_validate *rsvd_check;
3778         int root, leaf, level;
3779         bool reserved = false;
3780
3781         walk_shadow_page_lockless_begin(vcpu);
3782
3783         if (is_tdp_mmu(vcpu->arch.mmu))
3784                 leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
3785         else
3786                 leaf = get_walk(vcpu, addr, sptes, &root);
3787
3788         walk_shadow_page_lockless_end(vcpu);
3789
3790         if (unlikely(leaf < 0)) {
3791                 *sptep = 0ull;
3792                 return reserved;
3793         }
3794
3795         *sptep = sptes[leaf];
3796
3797         /*
3798          * Skip reserved bits checks on the terminal leaf if it's not a valid
3799          * SPTE.  Note, this also (intentionally) skips MMIO SPTEs, which, by
3800          * design, always have reserved bits set.  The purpose of the checks is
3801          * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
3802          */
3803         if (!is_shadow_present_pte(sptes[leaf]))
3804                 leaf++;
3805
3806         rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
3807
3808         for (level = root; level >= leaf; level--)
3809                 reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
3810
3811         if (reserved) {
3812                 pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
3813                        __func__, addr);
3814                 for (level = root; level >= leaf; level--)
3815                         pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
3816                                sptes[level], level,
3817                                get_rsvd_bits(rsvd_check, sptes[level], level));
3818         }
3819
3820         return reserved;
3821 }
3822
3823 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3824 {
3825         u64 spte;
3826         bool reserved;
3827
3828         if (mmio_info_in_cache(vcpu, addr, direct))
3829                 return RET_PF_EMULATE;
3830
3831         reserved = get_mmio_spte(vcpu, addr, &spte);
3832         if (WARN_ON(reserved))
3833                 return -EINVAL;
3834
3835         if (is_mmio_spte(spte)) {
3836                 gfn_t gfn = get_mmio_spte_gfn(spte);
3837                 unsigned int access = get_mmio_spte_access(spte);
3838
3839                 if (!check_mmio_spte(vcpu, spte))
3840                         return RET_PF_INVALID;
3841
3842                 if (direct)
3843                         addr = 0;
3844
3845                 trace_handle_mmio_page_fault(addr, gfn, access);
3846                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3847                 return RET_PF_EMULATE;
3848         }
3849
3850         /*
3851          * If the page table has been zapped by another CPU, let the CPU fault
3852          * again on the address.
3853          */
3854         return RET_PF_RETRY;
3855 }
3856
3857 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
3858                                          struct kvm_page_fault *fault)
3859 {
3860         if (unlikely(fault->rsvd))
3861                 return false;
3862
3863         if (!fault->present || !fault->write)
3864                 return false;
3865
3866         /*
3867          * The guest is writing a page that is write-tracked, which cannot
3868          * be fixed by the page fault handler.
3869          */
3870         if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
3871                 return true;
3872
3873         return false;
3874 }
3875
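/*
 * Clear the write-flooding counters of the shadow pages visited by a lockless
 * walk of @addr, resetting the heuristic that zaps shadow pages which see too
 * many emulated writes.
 */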
3876 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
3877 {
3878         struct kvm_shadow_walk_iterator iterator;
3879         u64 spte;
3880
3881         walk_shadow_page_lockless_begin(vcpu);
3882         for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
3883                 clear_sp_write_flooding_count(iterator.sptep);
3884         walk_shadow_page_lockless_end(vcpu);
3885 }
3886
3887 static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
3888 {
3889         /* make sure the token value is not 0 */
3890         u32 id = vcpu->arch.apf.id;
3891
3892         if (id << 12 == 0)
3893                 vcpu->arch.apf.id = 1;
3894
3895         return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
3896 }
3897
3898 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
3899                                     gfn_t gfn)
3900 {
3901         struct kvm_arch_async_pf arch;
3902
3903         arch.token = alloc_apf_token(vcpu);
3904         arch.gfn = gfn;
3905         arch.direct_map = vcpu->arch.mmu->direct_map;
3906         arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
3907
3908         return kvm_setup_async_pf(vcpu, cr2_or_gpa,
3909                                   kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
3910 }
3911
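/*
 * Resolve fault->gfn to a host pfn, handling invalid and non-visible memslots
 * and async page faults along the way.  Returns true if the fault was
 * resolved without a usable pfn (retry or emulate), with *r set to the
 * RET_PF_* value; returns false if the caller should proceed with fault->pfn.
 */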
3912 static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r)
3913 {
3914         struct kvm_memory_slot *slot = fault->slot;
3915         bool async;
3916
3917         /*
3918          * Retry the page fault if the gfn hit a memslot that is being deleted
3919          * or moved.  This ensures any existing SPTEs for the old memslot will
3920          * be zapped before KVM inserts a new MMIO SPTE for the gfn.
3921          */
3922         if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
3923                 goto out_retry;
3924
3925         if (!kvm_is_visible_memslot(slot)) {
3926                 /* Don't expose private memslots to L2. */
3927                 if (is_guest_mode(vcpu)) {
3928                         fault->slot = NULL;
3929                         fault->pfn = KVM_PFN_NOSLOT;
3930                         fault->map_writable = false;
3931                         return false;
3932                 }
3933                 /*
3934                  * If the APIC access page exists but is disabled, go directly
3935                  * to emulation without caching the MMIO access or creating a
3936                  * to emulation without caching the MMIO access or creating an
3937                  * when the AVIC is re-enabled.
3938                  */
3939                 if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
3940                     !kvm_apicv_activated(vcpu->kvm)) {
3941                         *r = RET_PF_EMULATE;
3942                         return true;
3943                 }
3944         }
3945
3946         async = false;
3947         fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
3948                                           fault->write, &fault->map_writable,
3949                                           &fault->hva);
3950         if (!async)
3951                 return false; /* fault->pfn already points at the correct page */
3952
3953         if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
3954                 trace_kvm_try_async_get_page(fault->addr, fault->gfn);
3955                 if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
3956                         trace_kvm_async_pf_doublefault(fault->addr, fault->gfn);
3957                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
3958                         goto out_retry;
3959                 } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn))
3960                         goto out_retry;
3961         }
3962
3963         fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
3964                                           fault->write, &fault->map_writable,
3965                                           &fault->hva);
3966         return false;
3967
3968 out_retry:
3969         *r = RET_PF_RETRY;
3970         return true;
3971 }
3972
3973 /*
3974  * Returns true if the page fault is stale and needs to be retried, i.e. if the
3975  * root was invalidated by a memslot update or a relevant mmu_notifier fired.
3976  */
3977 static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
3978                                 struct kvm_page_fault *fault, int mmu_seq)
3979 {
3980         struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
3981
3982         /* Special roots, e.g. pae_root, are not backed by shadow pages. */
3983         if (sp && is_obsolete_sp(vcpu->kvm, sp))
3984                 return true;
3985
3986         /*
3987          * Roots without an associated shadow page are considered invalid if
3988          * there is a pending request to free obsolete roots.  The request is
3989          * only a hint that the current root _may_ be obsolete and needs to be
3990          * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
3991          * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
3992          * to reload even if no vCPU is actively using the root.
3993          */
3994         if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
3995                 return true;
3996
3997         return fault->slot &&
3998                mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
3999 }
4000
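/*
 * Common fault handler for direct MMUs (TDP and nonpaging): try the lockless
 * fast path first, then resolve the pfn and install the mapping under
 * mmu_lock, taken for read for the TDP MMU and for write otherwise.
 */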
4001 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4002 {
4003         bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
4004
4005         unsigned long mmu_seq;
4006         int r;
4007
4008         fault->gfn = fault->addr >> PAGE_SHIFT;
4009         fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
4010
4011         if (page_fault_handle_page_track(vcpu, fault))
4012                 return RET_PF_EMULATE;
4013
4014         r = fast_page_fault(vcpu, fault);
4015         if (r != RET_PF_INVALID)
4016                 return r;
4017
4018         r = mmu_topup_memory_caches(vcpu, false);
4019         if (r)
4020                 return r;
4021
4022         mmu_seq = vcpu->kvm->mmu_notifier_seq;
4023         smp_rmb();
4024
4025         if (kvm_faultin_pfn(vcpu, fault, &r))
4026                 return r;
4027
4028         if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r))
4029                 return r;
4030
4031         r = RET_PF_RETRY;
4032
4033         if (is_tdp_mmu_fault)
4034                 read_lock(&vcpu->kvm->mmu_lock);
4035         else
4036                 write_lock(&vcpu->kvm->mmu_lock);
4037
4038         if (is_page_fault_stale(vcpu, fault, mmu_seq))
4039                 goto out_unlock;
4040
4041         r = make_mmu_pages_available(vcpu);
4042         if (r)
4043                 goto out_unlock;
4044
4045         if (is_tdp_mmu_fault)
4046                 r = kvm_tdp_mmu_map(vcpu, fault);
4047         else
4048                 r = __direct_map(vcpu, fault);
4049
4050 out_unlock:
4051         if (is_tdp_mmu_fault)
4052                 read_unlock(&vcpu->kvm->mmu_lock);
4053         else
4054                 write_unlock(&vcpu->kvm->mmu_lock);
4055         kvm_release_pfn_clean(fault->pfn);
4056         return r;
4057 }
4058
4059 static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
4060                                 struct kvm_page_fault *fault)
4061 {
4062         pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
4063
4064         /* This path builds a PAE pagetable; we can map 2MB pages at most. */
4065         fault->max_level = PG_LEVEL_2M;
4066         return direct_page_fault(vcpu, fault);
4067 }
4068
4069 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4070                                 u64 fault_address, char *insn, int insn_len)
4071 {
4072         int r = 1;
4073         u32 flags = vcpu->arch.apf.host_apf_flags;
4074
4075 #ifndef CONFIG_X86_64
4076         /* A 64-bit CR2 should be impossible on 32-bit KVM. */
4077         if (WARN_ON_ONCE(fault_address >> 32))
4078                 return -EFAULT;
4079 #endif
4080
4081         vcpu->arch.l1tf_flush_l1d = true;
4082         if (!flags) {
4083                 trace_kvm_page_fault(fault_address, error_code);
4084
4085                 if (kvm_event_needs_reinjection(vcpu))
4086                         kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4087                 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4088                                 insn_len);
4089         } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
4090                 vcpu->arch.apf.host_apf_flags = 0;
4091                 local_irq_disable();
4092                 kvm_async_pf_task_wait_schedule(fault_address);
4093                 local_irq_enable();
4094         } else {
4095                 WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
4096         }
4097
4098         return r;
4099 }
4100 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4101
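/*
 * TDP fault handler: lower fault->max_level until the prospective huge page
 * no longer spans guest MTRR ranges with conflicting memory types, then hand
 * off to the common direct fault path.
 */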
4102 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4103 {
4104         while (fault->max_level > PG_LEVEL_4K) {
4105                 int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
4106                 gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
4107
4108                 if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
4109                         break;
4110
4111                 --fault->max_level;
4112         }
4113
4114         return direct_page_fault(vcpu, fault);
4115 }
4116
4117 static void nonpaging_init_context(struct kvm_mmu *context)
4118 {
4119         context->page_fault = nonpaging_page_fault;
4120         context->gva_to_gpa = nonpaging_gva_to_gpa;
4121         context->sync_page = nonpaging_sync_page;
4122         context->invlpg = NULL;
4123         context->direct_map = true;
4124 }
4125
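/*
 * A cached root is reusable only if it is valid, was created for the same
 * guest pgd (the pgd check is skipped for direct roots), and its role matches
 * the new role exactly.
 */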
4126 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
4127                                   union kvm_mmu_page_role role)
4128 {
4129         return (role.direct || pgd == root->pgd) &&
4130                VALID_PAGE(root->hpa) &&
4131                role.word == to_shadow_page(root->hpa)->role.word;
4132 }
4133
4134 /*
4135  * Find out if a previously cached root matching the new pgd/role is available,
4136  * and insert the current root as the MRU in the cache.
4137  * If a matching root is found, it is assigned to kvm_mmu->root and
4138  * true is returned.
4139  * If no match is found, kvm_mmu->root is left invalid, the LRU root is
4140  * evicted to make room for the current root, and false is returned.
4141  */
4142 static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
4143                                               gpa_t new_pgd,
4144                                               union kvm_mmu_page_role new_role)
4145 {
4146         uint i;
4147
4148         if (is_root_usable(&mmu->root, new_pgd, new_role))
4149                 return true;
4150
4151         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4152                 /*
4153                  * The swaps end up rotating the cache like this:
4154                  *   C   0 1 2 3   (on entry to the function)
4155                  *   0   C 1 2 3
4156                  *   1   C 0 2 3
4157                  *   2   C 0 1 3
4158                  *   3   C 0 1 2   (on exit from the loop)
4159                  */
4160                 swap(mmu->root, mmu->prev_roots[i]);
4161                 if (is_root_usable(&mmu->root, new_pgd, new_role))
4162                         return true;
4163         }
4164
4165         kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4166         return false;
4167 }
4168
4169 /*
4170  * Find out if a previously cached root matching the new pgd/role is available.
4171  * On entry, mmu->root is invalid.
4172  * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
4173  * of the cache becomes invalid, and true is returned.
4174  * If no match is found, kvm_mmu->root is left invalid and false is returned.
4175  */
4176 static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
4177                                              gpa_t new_pgd,
4178                                              union kvm_mmu_page_role new_role)
4179 {
4180         uint i;
4181
4182         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4183                 if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
4184                         goto hit;
4185
4186         return false;
4187
4188 hit:
4189         swap(mmu->root, mmu->prev_roots[i]);
4190         /* Bubble up the remaining roots.  */
4191         for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
4192                 mmu->prev_roots[i] = mmu->prev_roots[i + 1];
4193         mmu->prev_roots[i].hpa = INVALID_PAGE;
4194         return true;
4195 }
4196
4197 static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
4198                             gpa_t new_pgd, union kvm_mmu_page_role new_role)
4199 {
4200         /*
4201          * For now, limit the caching to 64-bit hosts+VMs in order to avoid
4202          * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4203          * later if necessary.
4204          */
4205         if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
4206                 kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4207
4208         if (VALID_PAGE(mmu->root.hpa))
4209                 return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
4210         else
4211                 return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
4212 }
4213
4214 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
4215 {
4216         struct kvm_mmu *mmu = vcpu->arch.mmu;
4217         union kvm_mmu_page_role new_role = mmu->mmu_role.base;
4218
4219         if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
4220                 /* kvm_mmu_ensure_valid_pgd will set up a new root.  */
4221                 return;
4222         }
4223
4224         /*
4225          * It's possible that the cached previous root page is obsolete because
4226          * of a change in the MMU generation number. However, changing the
4227          * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
4228          * which will free the root set here and allocate a new one.
4229          */
4230         kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
4231
4232         if (force_flush_and_sync_on_reuse) {
4233                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4234                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
4235         }
4236
4237         /*
4238          * The last MMIO access's GVA and GPA are cached in the VCPU. When
4239          * switching to a new CR3, that GVA->GPA mapping may no longer be
4240          * valid. So clear any cached MMIO info even when we don't need to sync
4241          * the shadow page tables.
4242          */
4243         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4244
4245         /*
4246          * If this is a direct root page, it doesn't have a write flooding
4247          * count. Otherwise, clear the write flooding count.
4248          */
4249         if (!new_role.direct)
4250                 __clear_sp_write_flooding_count(
4251                                 to_shadow_page(vcpu->arch.mmu->root.hpa));
4252 }
4253 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
4254
4255 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4256 {
4257         return kvm_read_cr3(vcpu);
4258 }
4259
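/*
 * When syncing a shadow page, refresh an MMIO SPTE in place if its gfn still
 * matches, or drop it so a new one is created on the next access.  Returns
 * true if the SPTE was an MMIO SPTE and has been handled.
 */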
4260 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4261                            unsigned int access)
4262 {
4263         if (unlikely(is_mmio_spte(*sptep))) {
4264                 if (gfn != get_mmio_spte_gfn(*sptep)) {
4265                         mmu_spte_clear_no_track(sptep);
4266                         return true;
4267                 }
4268
4269                 mark_mmio_spte(vcpu, sptep, gfn, access);
4270                 return true;
4271         }
4272
4273         return false;
4274 }
4275
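/*
 * paging_tmpl.h is included three times to stamp out the guest page table
 * walker and shadow fault handlers for each guest paging format; PTTYPE
 * selects between the EPT, 64-bit and 32-bit PTE layouts.
 */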
4276 #define PTTYPE_EPT 18 /* arbitrary */
4277 #define PTTYPE PTTYPE_EPT
4278 #include "paging_tmpl.h"
4279 #undef PTTYPE
4280
4281 #define PTTYPE 64
4282 #include "paging_tmpl.h"
4283 #undef PTTYPE
4284
4285 #define PTTYPE 32
4286 #include "paging_tmpl.h"
4287 #undef PTTYPE
4288
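/*
 * Compute, for each guest paging level, which PTE bits are reserved given the
 * guest's physical address width and its NX/GBPAGES/PSE/AMD properties.
 * rsvd_bits_mask[0][N] applies to entries with bit 7 clear, while
 * rsvd_bits_mask[1][N] applies to entries with bit 7 (PS) set.
 */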
4289 static void
4290 __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
4291                         u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
4292                         bool pse, bool amd)
4293 {
4294         u64 gbpages_bit_rsvd = 0;
4295         u64 nonleaf_bit8_rsvd = 0;
4296         u64 high_bits_rsvd;
4297
4298         rsvd_check->bad_mt_xwr = 0;
4299
4300         if (!gbpages)
4301                 gbpages_bit_rsvd = rsvd_bits(7, 7);
4302
4303         if (level == PT32E_ROOT_LEVEL)
4304                 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
4305         else
4306                 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4307
4308         /* Note, NX doesn't exist in PDPTEs, this is handled below. */
4309         if (!nx)
4310                 high_bits_rsvd |= rsvd_bits(63, 63);
4311
4312         /*
4313          * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4314          * leaf entries) on AMD CPUs only.
4315          */
4316         if (amd)
4317                 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4318
4319         switch (level) {
4320         case PT32_ROOT_LEVEL:
4321                 /* no rsvd bits for 2 level 4K page table entries */
4322                 rsvd_check->rsvd_bits_mask[0][1] = 0;
4323                 rsvd_check->rsvd_bits_mask[0][0] = 0;
4324                 rsvd_check->rsvd_bits_mask[1][0] =
4325                         rsvd_check->rsvd_bits_mask[0][0];
4326
4327                 if (!pse) {
4328                         rsvd_check->rsvd_bits_mask[1][1] = 0;
4329                         break;
4330                 }
4331
4332                 if (is_cpuid_PSE36())
4333                         /* 36-bit PSE 4MB page */
4334                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4335                 else
4336                         /* 32-bit PSE 4MB page */
4337                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4338                 break;
4339         case PT32E_ROOT_LEVEL:
4340                 rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
4341                                                    high_bits_rsvd |
4342                                                    rsvd_bits(5, 8) |
4343                                                    rsvd_bits(1, 2);     /* PDPTE */
4344                 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;      /* PDE */
4345                 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;      /* PTE */
4346                 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4347                                                    rsvd_bits(13, 20);   /* large page */
4348                 rsvd_check->rsvd_bits_mask[1][0] =
4349                         rsvd_check->rsvd_bits_mask[0][0];
4350                 break;
4351         case PT64_ROOT_5LEVEL:
4352                 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
4353                                                    nonleaf_bit8_rsvd |
4354                                                    rsvd_bits(7, 7);
4355                 rsvd_check->rsvd_bits_mask[1][4] =
4356                         rsvd_check->rsvd_bits_mask[0][4];
4357                 fallthrough;
4358         case PT64_ROOT_4LEVEL:
4359                 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
4360                                                    nonleaf_bit8_rsvd |
4361                                                    rsvd_bits(7, 7);
4362                 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
4363                                                    gbpages_bit_rsvd;
4364                 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
4365                 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4366                 rsvd_check->rsvd_bits_mask[1][3] =
4367                         rsvd_check->rsvd_bits_mask[0][3];
4368                 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
4369                                                    gbpages_bit_rsvd |
4370                                                    rsvd_bits(13, 29);
4371                 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4372                                                    rsvd_bits(13, 20); /* large page */
4373                 rsvd_check->rsvd_bits_mask[1][0] =
4374                         rsvd_check->rsvd_bits_mask[0][0];
4375                 break;
4376         }
4377 }
4378
4379 static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
4380 {
4381         /*
4382          * If TDP is enabled, let the guest use GBPAGES if they're supported in
4383          * hardware.  The hardware page walker doesn't let KVM disable GBPAGES,
4384          * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
4385          * walk for performance and complexity reasons.  Not to mention KVM
4386          * _can't_ solve the problem because GVA->GPA walks aren't visible to
4387          * KVM once a TDP translation is installed.  Mimic hardware behavior so
4388          * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
4389          */
4390         return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
4391                              guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
4392 }
4393
4394 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4395                                   struct kvm_mmu *context)
4396 {
4397         __reset_rsvds_bits_mask(&context->guest_rsvd_check,
4398                                 vcpu->arch.reserved_gpa_bits,
4399                                 context->root_level, is_efer_nx(context),
4400                                 guest_can_use_gbpages(vcpu),
4401                                 is_cr4_pse(context),
4402                                 guest_cpuid_is_amd_or_hygon(vcpu));
4403 }
4404
4405 static void
4406 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4407                             u64 pa_bits_rsvd, bool execonly, int huge_page_level)
4408 {
4409         u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4410         u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
4411         u64 bad_mt_xwr;
4412
4413         if (huge_page_level < PG_LEVEL_1G)
4414                 large_1g_rsvd = rsvd_bits(7, 7);
4415         if (huge_page_level < PG_LEVEL_2M)
4416                 large_2m_rsvd = rsvd_bits(7, 7);
4417
4418         rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
4419         rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
4420         rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
4421         rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
4422         rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4423
4424         /* large page */
4425         rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4426         rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4427         rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
4428         rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
4429         rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4430
4431         bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
4432         bad_mt_xwr |= 0xFFull << (3 * 8);       /* bits 3..5 must not be 3 */
4433         bad_mt_xwr |= 0xFFull << (7 * 8);       /* bits 3..5 must not be 7 */
4434         bad_mt_xwr |= REPEAT_BYTE(1ull << 2);   /* bits 0..2 must not be 010 */
4435         bad_mt_xwr |= REPEAT_BYTE(1ull << 6);   /* bits 0..2 must not be 110 */
4436         if (!execonly) {
4437                 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4438                 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4439         }
4440         rsvd_check->bad_mt_xwr = bad_mt_xwr;
4441 }
4442
4443 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4444                 struct kvm_mmu *context, bool execonly, int huge_page_level)
4445 {
4446         __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4447                                     vcpu->arch.reserved_gpa_bits, execonly,
4448                                     huge_page_level);
4449 }
4450
4451 static inline u64 reserved_hpa_bits(void)
4452 {
4453         return rsvd_bits(shadow_phys_bits, 63);
4454 }
4455
4456 /*
4457  * The page table on the host is the shadow page table for the page
4458  * table in the guest or an AMD nested guest, so its MMU features
4459  * completely follow the features in the guest.
4460  */
4461 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4462                                         struct kvm_mmu *context)
4463 {
4464         /*
4465          * KVM uses NX when TDP is disabled to handle a variety of scenarios,
4466          * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
4467          * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
4468          * The iTLB multi-hit workaround can be toggled at any time, so assume
4469          * NX can be used by any non-nested shadow MMU to avoid having to reset
4470          * MMU contexts.  Note, KVM forces EFER.NX=1 when TDP is disabled.
4471          */
4472         bool uses_nx = is_efer_nx(context) || !tdp_enabled;
4473
4474         /* @amd adds a reserved-bit check on bit 8 of non-leaf SPTEs, which KVM shouldn't set anyway. */
4475         bool is_amd = true;
4476         /* KVM doesn't use 2-level page tables for the shadow MMU. */
4477         bool is_pse = false;
4478         struct rsvd_bits_validate *shadow_zero_check;
4479         int i;
4480
4481         WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL);
4482
4483         shadow_zero_check = &context->shadow_zero_check;
4484         __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4485                                 context->shadow_root_level, uses_nx,
4486                                 guest_can_use_gbpages(vcpu), is_pse, is_amd);
4487
4488         if (!shadow_me_mask)
4489                 return;
4490
4491         for (i = context->shadow_root_level; --i >= 0;) {
4492                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4493                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4494         }
4496 }
4497
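/*
 * With TDP enabled, shadow_x_mask is non-zero only when EPT is in use, so a
 * zero mask implies the host MMU is AMD-style NPT.
 */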
4498 static inline bool boot_cpu_is_amd(void)
4499 {
4500         WARN_ON_ONCE(!tdp_enabled);
4501         return shadow_x_mask == 0;
4502 }
4503
4504 /*
4505  * The direct page table on the host uses as many MMU features as
4506  * possible; however, KVM currently does not do execution-protection.
4507  */
4508 static void
4509 reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
4510 {
4511         struct rsvd_bits_validate *shadow_zero_check;
4512         int i;
4513
4514         shadow_zero_check = &context->shadow_zero_check;
4515
4516         if (boot_cpu_is_amd())
4517                 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4518                                         context->shadow_root_level, false,
4519                                         boot_cpu_has(X86_FEATURE_GBPAGES),
4520                                         false, true);
4521         else
4522                 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4523                                             reserved_hpa_bits(), false,
4524                                             max_huge_page_level);
4525
4526         if (!shadow_me_mask)
4527                 return;
4528
4529         for (i = context->shadow_root_level; --i >= 0;) {
4530                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4531                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4532         }
4533 }
4534
4535 /*
4536  * Same as the comments in reset_shadow_zero_bits_mask(), except this is
4537  * the shadow page table for an Intel nested (EPT) guest.
4538  */
4539 static void
4540 reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
4541 {
4542         __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4543                                     reserved_hpa_bits(), execonly,
4544                                     max_huge_page_level);
4545 }
4546
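/*
 * BYTE_MASK(access) builds a byte in which bit N is set iff permission
 * combination N (an OR of ACC_EXEC/ACC_WRITE/ACC_USER bits) includes
 * @access, e.g. BYTE_MASK(ACC_EXEC_MASK) == 0xaa.
 */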
4547 #define BYTE_MASK(access) \
4548         ((1 & (access) ? 2 : 0) | \
4549          (2 & (access) ? 4 : 0) | \
4550          (3 & (access) ? 8 : 0) | \
4551          (4 & (access) ? 16 : 0) | \
4552          (5 & (access) ? 32 : 0) | \
4553          (6 & (access) ? 64 : 0) | \
4554          (7 & (access) ? 128 : 0))
4555
4556
4557 static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
4558 {
4559         unsigned byte;
4560
4561         const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4562         const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4563         const u8 u = BYTE_MASK(ACC_USER_MASK);
4564
4565         bool cr4_smep = is_cr4_smep(mmu);
4566         bool cr4_smap = is_cr4_smap(mmu);
4567         bool cr0_wp = is_cr0_wp(mmu);
4568         bool efer_nx = is_efer_nx(mmu);
4569
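        /*
         * mmu->permissions[] is indexed by the page-fault error code with the
         * Present bit stripped, i.e. by pfec >> 1 (see permission_fault()).
         */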
4570         for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4571                 unsigned pfec = byte << 1;
4572
4573                 /*
4574                  * Each "*f" variable has a 1 bit for each UWX value
4575                  * that causes a fault with the given PFEC.
4576                  */
4577
4578                 /* Faults from writes to non-writable pages */
4579                 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4580                 /* Faults from user mode accesses to supervisor pages */
4581                 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4582                 /* Faults from fetches of non-executable pages */
4583                 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4584                 /* Faults from kernel mode fetches of user pages */
4585                 u8 smepf = 0;
4586                 /* Faults from kernel mode accesses of user pages */
4587                 u8 smapf = 0;
4588
4589                 if (!ept) {
4590                         /* Faults from kernel mode accesses to user pages */
4591                         u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4592
4593                         /* Not really needed: !nx will cause pte.nx to fault */
4594                         if (!efer_nx)
4595                                 ff = 0;
4596
4597                         /* Allow supervisor writes if !cr0.wp */
4598                         if (!cr0_wp)
4599                                 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4600
4601                         /* Disallow supervisor fetches of user code if cr4.smep */
4602                         if (cr4_smep)
4603                                 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4604
4605                         /*
4606                          * SMAP:kernel-mode data accesses from user-mode
4607                          * mappings should fault. A fault is considered
4608                          * as a SMAP violation if all of the following
4609                          * conditions are true:
4610                          *   - X86_CR4_SMAP is set in CR4
4611                          *   - A user page is accessed
4612                          *   - The access is not a fetch
4613                          *   - The access is supervisor mode
4614                          *   - If implicit supervisor access or X86_EFLAGS_AC is clear
4615                          *
4616                          * Here, we cover the first four conditions.
4617                          * The fifth is computed dynamically in permission_fault();
4618                          * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4619                          * *not* subject to SMAP restrictions.
4620                          */
4621                         if (cr4_smap)
4622                                 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4623                 }
4624
4625                 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4626         }
4627 }
4628
4629 /*
4630 * PKU is an additional mechanism by which the paging controls access to
4631 * user-mode addresses based on the value in the PKRU register.  Protection
4632 * key violations are reported through a bit in the page fault error code.
4633 * Unlike other bits of the error code, the PK bit is not known at the
4634 * call site of e.g. gva_to_gpa; it must be computed directly in
4635 * permission_fault based on two bits of PKRU, on some machine state (CR4,
4636 * CR0, EFER, CPL), and on other bits of the error code and the page tables.
4637 *
4638 * In particular the following conditions come from the error code, the
4639 * page tables and the machine state:
4640 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4641 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4642 * - PK is always zero if U=0 in the page tables
4643 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4644 *
4645 * The PKRU bitmask caches the result of these four conditions.  The error
4646 * code (minus the P bit) and the page table's U bit form an index into the
4647 * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
4648 * with the two bits of the PKRU register corresponding to the protection key.
4649 * For the first three conditions above the bits will be 00, thus masking
4650 * away both AD and WD.  For all reads or if the last condition holds, WD
4651 * only will be masked away.
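*
* For example (illustrative): for a supervisor write to a user page with
* CR0.WP=1, both bits of the bitmask entry are set, so both PKRU.AD and
* PKRU.WD of the page's key are checked; for a fetch, or if the page is a
* supervisor page, neither is checked.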
4652 */
4653 static void update_pkru_bitmask(struct kvm_mmu *mmu)
4654 {
4655         unsigned bit;
4656         bool wp;
4657
4658         mmu->pkru_mask = 0;
4659
4660         if (!is_cr4_pke(mmu))
4661                 return;
4662
4663         wp = is_cr0_wp(mmu);
4664
4665         for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4666                 unsigned pfec, pkey_bits;
4667                 bool check_pkey, check_write, ff, uf, wf, pte_user;
4668
4669                 pfec = bit << 1;
4670                 ff = pfec & PFERR_FETCH_MASK;
4671                 uf = pfec & PFERR_USER_MASK;
4672                 wf = pfec & PFERR_WRITE_MASK;
4673
4674                 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4675                 pte_user = pfec & PFERR_RSVD_MASK;
4676
4677                 /*
4678                  * Only need to check the access which is not an
4679                  * instruction fetch and is to a user page.
4680                  */
4681                 check_pkey = (!ff && pte_user);
4682                 /*
4683                  * write access is controlled by PKRU if it is a
4684                  * user access or CR0.WP = 1.
4685                  */
4686                 check_write = check_pkey && wf && (uf || wp);
4687
4688                 /* PKRU.AD stops both read and write access. */
4689                 pkey_bits = !!check_pkey;
4690                 /* PKRU.WD stops write access. */
4691                 pkey_bits |= (!!check_write) << 1;
4692
4693                 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4694         }
4695 }
4696
4697 static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
4698                                         struct kvm_mmu *mmu)
4699 {
4700         if (!is_cr0_pg(mmu))
4701                 return;
4702
4703         reset_rsvds_bits_mask(vcpu, mmu);
4704         update_permission_bitmask(mmu, false);
4705         update_pkru_bitmask(mmu);
4706 }
4707
4708 static void paging64_init_context(struct kvm_mmu *context)
4709 {
4710         context->page_fault = paging64_page_fault;
4711         context->gva_to_gpa = paging64_gva_to_gpa;
4712         context->sync_page = paging64_sync_page;
4713         context->invlpg = paging64_invlpg;
4714         context->direct_map = false;
4715 }
4716
4717 static void paging32_init_context(struct kvm_mmu *context)
4718 {
4719         context->page_fault = paging32_page_fault;
4720         context->gva_to_gpa = paging32_gva_to_gpa;
4721         context->sync_page = paging32_sync_page;
4722         context->invlpg = paging32_invlpg;
4723         context->direct_map = false;
4724 }
4725
4726 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
4727                                                          struct kvm_mmu_role_regs *regs)
4728 {
4729         union kvm_mmu_extended_role ext = {0};
4730
4731         if (____is_cr0_pg(regs)) {
4732                 ext.cr0_pg = 1;
4733                 ext.cr4_pae = ____is_cr4_pae(regs);
4734                 ext.cr4_smep = ____is_cr4_smep(regs);
4735                 ext.cr4_smap = ____is_cr4_smap(regs);
4736                 ext.cr4_pse = ____is_cr4_pse(regs);
4737
4738                 /* PKEY and LA57 are active iff long mode is active. */
4739                 ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
4740                 ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
4741                 ext.efer_lma = ____is_efer_lma(regs);
4742         }
4743
4744         ext.valid = 1;
4745
4746         return ext;
4747 }
4748
4749 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4750                                                    struct kvm_mmu_role_regs *regs,
4751                                                    bool base_only)
4752 {
4753         union kvm_mmu_role role = {0};
4754
4755         role.base.access = ACC_ALL;
4756         if (____is_cr0_pg(regs)) {
4757                 role.base.efer_nx = ____is_efer_nx(regs);
4758                 role.base.cr0_wp = ____is_cr0_wp(regs);
4759         }
4760         role.base.smm = is_smm(vcpu);
4761         role.base.guest_mode = is_guest_mode(vcpu);
4762
4763         if (base_only)
4764                 return role;
4765
4766         role.ext = kvm_calc_mmu_role_ext(vcpu, regs);
4767
4768         return role;
4769 }
4770
4771 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
4772 {
4773         /* tdp_root_level is the architecture-forced level, use it if nonzero */
4774         if (tdp_root_level)
4775                 return tdp_root_level;
4776
4777         /* Use 5-level TDP if and only if it's useful/necessary. */
4778         if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
4779                 return 4;
4780
4781         return max_tdp_level;
4782 }
4783
4784 static union kvm_mmu_role
4785 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
4786                                 struct kvm_mmu_role_regs *regs, bool base_only)
4787 {
4788         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
4789
4790         role.base.ad_disabled = (shadow_accessed_mask == 0);
4791         role.base.level = kvm_mmu_get_tdp_level(vcpu);
4792         role.base.direct = true;
4793         role.base.has_4_byte_gpte = false;
4794
4795         return role;
4796 }
4797
4798 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
4799 {
4800         struct kvm_mmu *context = &vcpu->arch.root_mmu;
4801         struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
4802         union kvm_mmu_role new_role =
4803                 kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, false);
4804
4805         if (new_role.as_u64 == context->mmu_role.as_u64)
4806                 return;
4807
4808         context->mmu_role.as_u64 = new_role.as_u64;
4809         context->page_fault = kvm_tdp_page_fault;
4810         context->sync_page = nonpaging_sync_page;
4811         context->invlpg = NULL;
4812         context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
4813         context->direct_map = true;
4814         context->get_guest_pgd = get_cr3;
4815         context->get_pdptr = kvm_pdptr_read;
4816         context->inject_page_fault = kvm_inject_page_fault;
4817         context->root_level = role_regs_to_root_level(&regs);
4818
4819         if (!is_cr0_pg(context))
4820                 context->gva_to_gpa = nonpaging_gva_to_gpa;
4821         else if (is_cr4_pae(context))
4822                 context->gva_to_gpa = paging64_gva_to_gpa;
4823         else
4824                 context->gva_to_gpa = paging32_gva_to_gpa;
4825
4826         reset_guest_paging_metadata(vcpu, context);
4827         reset_tdp_shadow_zero_bits_mask(context);
4828 }
4829
4830 static union kvm_mmu_role
4831 kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
4832                                       struct kvm_mmu_role_regs *regs, bool base_only)
4833 {
4834         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
4835
4836         role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
4837         role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
4838         role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs);
4839
4840         return role;
4841 }
4842
4843 static union kvm_mmu_role
4844 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
4845                                    struct kvm_mmu_role_regs *regs, bool base_only)
4846 {
4847         union kvm_mmu_role role =
4848                 kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only);
4849
4850         role.base.direct = !____is_cr0_pg(regs);
4851
4852         if (!____is_efer_lma(regs))
4853                 role.base.level = PT32E_ROOT_LEVEL;
4854         else if (____is_cr4_la57(regs))
4855                 role.base.level = PT64_ROOT_5LEVEL;
4856         else
4857                 role.base.level = PT64_ROOT_4LEVEL;
4858
4859         return role;
4860 }
4861
4862 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
4863                                     struct kvm_mmu_role_regs *regs,
4864                                     union kvm_mmu_role new_role)
4865 {
4866         if (new_role.as_u64 == context->mmu_role.as_u64)
4867                 return;
4868
4869         context->mmu_role.as_u64 = new_role.as_u64;
4870
4871         if (!is_cr0_pg(context))
4872                 nonpaging_init_context(context);
4873         else if (is_cr4_pae(context))
4874                 paging64_init_context(context);
4875         else
4876                 paging32_init_context(context);
4877         context->root_level = role_regs_to_root_level(regs);
4878
4879         reset_guest_paging_metadata(vcpu, context);
4880         context->shadow_root_level = new_role.base.level;
4881
4882         reset_shadow_zero_bits_mask(vcpu, context);
4883 }
4884
4885 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
4886                                 struct kvm_mmu_role_regs *regs)
4887 {
4888         struct kvm_mmu *context = &vcpu->arch.root_mmu;
4889         union kvm_mmu_role new_role =
4890                 kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false);
4891
4892         shadow_mmu_init_context(vcpu, context, regs, new_role);
4893 }
4894
4895 static union kvm_mmu_role
4896 kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
4897                                    struct kvm_mmu_role_regs *regs)
4898 {
4899         union kvm_mmu_role role =
4900                 kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
4901
4902         role.base.direct = false;
4903         role.base.level = kvm_mmu_get_tdp_level(vcpu);
4904
4905         return role;
4906 }
4907
4908 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
4909                              unsigned long cr4, u64 efer, gpa_t nested_cr3)
4910 {
4911         struct kvm_mmu *context = &vcpu->arch.guest_mmu;
4912         struct kvm_mmu_role_regs regs = {
4913                 .cr0 = cr0,
4914                 .cr4 = cr4 & ~X86_CR4_PKE,
4915                 .efer = efer,
4916         };
4917         union kvm_mmu_role new_role;
4918
4919         new_role = kvm_calc_shadow_npt_root_page_role(vcpu, &regs);
4920
4921         shadow_mmu_init_context(vcpu, context, &regs, new_role);
4922         kvm_mmu_new_pgd(vcpu, nested_cr3);
4923 }
4924 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
4925
4926 static union kvm_mmu_role
4927 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
4928                                    bool execonly, u8 level)
4929 {
4930         union kvm_mmu_role role = {0};
4931
4932         /* SMM flag is inherited from root_mmu */
4933         role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
4934
4935         role.base.level = level;
4936         role.base.has_4_byte_gpte = false;
4937         role.base.direct = false;
4938         role.base.ad_disabled = !accessed_dirty;
4939         role.base.guest_mode = true;
4940         role.base.access = ACC_ALL;
4941
4942         /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
4943         role.ext.word = 0;
4944         role.ext.execonly = execonly;
4945         role.ext.valid = 1;
4946
4947         return role;
4948 }
4949
4950 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
4951                              int huge_page_level, bool accessed_dirty,
4952                              gpa_t new_eptp)
4953 {
4954         struct kvm_mmu *context = &vcpu->arch.guest_mmu;
4955         u8 level = vmx_eptp_page_walk_level(new_eptp);
4956         union kvm_mmu_role new_role =
4957                 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
4958                                                    execonly, level);
4959
4960         if (new_role.as_u64 != context->mmu_role.as_u64) {
4961                 context->mmu_role.as_u64 = new_role.as_u64;
4962
4963                 context->shadow_root_level = level;
4964
4965                 context->ept_ad = accessed_dirty;
4966                 context->page_fault = ept_page_fault;
4967                 context->gva_to_gpa = ept_gva_to_gpa;
4968                 context->sync_page = ept_sync_page;
4969                 context->invlpg = ept_invlpg;
4970                 context->root_level = level;
4971                 context->direct_map = false;
4972                 update_permission_bitmask(context, true);
4973                 context->pkru_mask = 0;
4974                 reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
4975                 reset_ept_shadow_zero_bits_mask(context, execonly);
4976         }
4977
4978         kvm_mmu_new_pgd(vcpu, new_eptp);
4979 }
4980 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
4981
4982 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
4983 {
4984         struct kvm_mmu *context = &vcpu->arch.root_mmu;
4985         struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
4986
4987         kvm_init_shadow_mmu(vcpu, &regs);
4988
4989         context->get_guest_pgd     = get_cr3;
4990         context->get_pdptr         = kvm_pdptr_read;
4991         context->inject_page_fault = kvm_inject_page_fault;
4992 }
4993
4994 static union kvm_mmu_role
4995 kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs)
4996 {
4997         union kvm_mmu_role role;
4998
4999         role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
5000
5001         /*
5002          * Nested MMUs are used only for walking L2's gva->gpa, they never have
5003          * shadow pages of their own and so "direct" has no meaning.   Set it
5004          * to "true" to try to detect bogus usage of the nested MMU.
5005          */
5006         role.base.direct = true;
5007         role.base.level = role_regs_to_root_level(regs);
5008         return role;
5009 }
5010
5011 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
5012 {
5013         struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
5014         union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, &regs);
5015         struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5016
5017         if (new_role.as_u64 == g_context->mmu_role.as_u64)
5018                 return;
5019
5020         g_context->mmu_role.as_u64 = new_role.as_u64;
5021         g_context->get_guest_pgd     = get_cr3;
5022         g_context->get_pdptr         = kvm_pdptr_read;
5023         g_context->inject_page_fault = kvm_inject_page_fault;
5024         g_context->root_level        = new_role.base.level;
5025
5026         /*
5027          * L2 page tables are never shadowed, so there is no need to sync
5028          * SPTEs.
5029          */
5030         g_context->invlpg            = NULL;
5031
5032         /*
5033          * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5034          * L1's nested page tables (e.g. EPT12). The nested translation
5035          * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5036          * L2's page tables as the first level of translation and L1's
5037          * nested page tables as the second level of translation. Basically
5038          * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5039          */
5040         if (!is_paging(vcpu))
5041                 g_context->gva_to_gpa = nonpaging_gva_to_gpa;
5042         else if (is_long_mode(vcpu))
5043                 g_context->gva_to_gpa = paging64_gva_to_gpa;
5044         else if (is_pae(vcpu))
5045                 g_context->gva_to_gpa = paging64_gva_to_gpa;
5046         else
5047                 g_context->gva_to_gpa = paging32_gva_to_gpa;
5048
5049         reset_guest_paging_metadata(vcpu, g_context);
5050 }
5051
5052 void kvm_init_mmu(struct kvm_vcpu *vcpu)
5053 {
5054         if (mmu_is_nested(vcpu))
5055                 init_kvm_nested_mmu(vcpu);
5056         else if (tdp_enabled)
5057                 init_kvm_tdp_mmu(vcpu);
5058         else
5059                 init_kvm_softmmu(vcpu);
5060 }
5061 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5062
5063 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
5064 {
5065         /*
5066          * Invalidate all MMU roles to force them to reinitialize as CPUID
5067          * information is factored into reserved bit calculations.
5068          *
5069          * Correctly handling multiple vCPU models (with respect to paging and
5070          * physical address properties) in a single VM would require tracking
5071          * all relevant CPUID information in kvm_mmu_page_role. That is very
5072          * undesirable as it would increase the memory requirements for
5073          * gfn_track (see struct kvm_mmu_page_role comments).  For now that
5074          * problem is swept under the rug; KVM's CPUID API is horrific and
5075          * it's all but impossible to solve it without introducing a new API.
5076          */
5077         vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
5078         vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
5079         vcpu->arch.nested_mmu.mmu_role.ext.valid = 0;
5080         kvm_mmu_reset_context(vcpu);
5081
5082         /*
5083          * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
5084          * kvm_arch_vcpu_ioctl().
5085          */
5086         KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
5087 }
5088
5089 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5090 {
5091         kvm_mmu_unload(vcpu);
5092         kvm_init_mmu(vcpu);
5093 }
5094 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5095
5096 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5097 {
5098         int r;
5099
5100         r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
5101         if (r)
5102                 goto out;
5103         r = mmu_alloc_special_roots(vcpu);
5104         if (r)
5105                 goto out;
5106         if (vcpu->arch.mmu->direct_map)
5107                 r = mmu_alloc_direct_roots(vcpu);
5108         else
5109                 r = mmu_alloc_shadow_roots(vcpu);
5110         if (r)
5111                 goto out;
5112
5113         kvm_mmu_sync_roots(vcpu);
5114
5115         kvm_mmu_load_pgd(vcpu);
5116
5117         /*
5118          * Flush any TLB entries for the new root, the provenance of the root
5119          * is unknown.  Even if KVM ensures there are no stale TLB entries
5120          * for a freed root, in theory another hypervisor could have left
5121          * stale entries.  Flushing on alloc also allows KVM to skip the TLB
5122          * flush when freeing a root (see kvm_tdp_mmu_put_root()).
5123          */
5124         static_call(kvm_x86_flush_tlb_current)(vcpu);
5125 out:
5126         return r;
5127 }
5128
5129 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5130 {
5131         struct kvm *kvm = vcpu->kvm;
5132
5133         kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5134         WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
5135         kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5136         WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
5137         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
5138 }
5139
5140 static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
5141 {
5142         struct kvm_mmu_page *sp;
5143
5144         if (!VALID_PAGE(root_hpa))
5145                 return false;
5146
5147         /*
5148          * When freeing obsolete roots, treat roots as obsolete if they don't
5149          * have an associated shadow page.  This does mean KVM will get false
5150          * positives and free roots that don't strictly need to be freed, but
5151          * such false positives are relatively rare:
5152          *
5153          *  (a) only PAE paging and nested NPT has roots without shadow pages
5154          *  (b) remote reloads due to a memslot update obsoletes _all_ roots
5155          *  (c) KVM doesn't track previous roots for PAE paging, and the guest
5156          *      is unlikely to zap an in-use PGD.
5157          */
5158         sp = to_shadow_page(root_hpa);
5159         return !sp || is_obsolete_sp(kvm, sp);
5160 }
5161
5162 static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
5163 {
5164         unsigned long roots_to_free = 0;
5165         int i;
5166
5167         if (is_obsolete_root(kvm, mmu->root.hpa))
5168                 roots_to_free |= KVM_MMU_ROOT_CURRENT;
5169
5170         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5171                 if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
5172                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5173         }
5174
5175         if (roots_to_free)
5176                 kvm_mmu_free_roots(kvm, mmu, roots_to_free);
5177 }
5178
5179 void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
5180 {
5181         __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
5182         __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
5183 }
5184
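/*
 * Return true if updating the SPTE from @old to @new requires a remote TLB
 * flush: the old SPTE was present and the new SPTE drops the mapping,
 * changes the target PFN, or removes permissions (NX is inverted so that
 * setting NX also counts as removing a permission).
 */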
5185 static bool need_remote_flush(u64 old, u64 new)
5186 {
5187         if (!is_shadow_present_pte(old))
5188                 return false;
5189         if (!is_shadow_present_pte(new))
5190                 return true;
5191         if ((old ^ new) & PT64_BASE_ADDR_MASK)
5192                 return true;
5193         old ^= shadow_nx_mask;
5194         new ^= shadow_nx_mask;
5195         return (old & ~new & PT64_PERM_MASK) != 0;
5196 }
5197
5198 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5199                                     int *bytes)
5200 {
5201         u64 gentry = 0;
5202         int r;
5203
5204         /*
5205          * Assume that the pte write is to a page table of the same type
5206          * as the current vcpu paging mode, since the sptes are updated only
5207          * when guest and shadow paging modes match.
5208          */
5209         if (is_pae(vcpu) && *bytes == 4) {
5210                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5211                 *gpa &= ~(gpa_t)7;
5212                 *bytes = 8;
5213         }
5214
5215         if (*bytes == 4 || *bytes == 8) {
5216                 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5217                 if (r)
5218                         gentry = 0;
5219         }
5220
5221         return gentry;
5222 }
5223
5224 /*
5225  * If we're seeing too many writes to a page, it may no longer be a page table,
5226  * or we may be forking, in which case it is better to unmap the page.
5227  */
5228 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5229 {
5230         /*
5231          * Skip write-flooding detection for last-level (4K) sps, because they
5232          * can become unsync, in which case the guest page is no longer write-protected.
5233          */
5234         if (sp->role.level == PG_LEVEL_4K)
5235                 return false;
5236
5237         atomic_inc(&sp->write_flooding_count);
5238         return atomic_read(&sp->write_flooding_count) >= 3;
5239 }
5240
5241 /*
5242  * Misaligned accesses are too much trouble to fix up; also, they usually
5243  * indicate a page is not used as a page table.
5244  */
5245 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5246                                     int bytes)
5247 {
5248         unsigned offset, pte_size, misaligned;
5249
5250         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5251                  gpa, bytes, sp->role.word);
5252
5253         offset = offset_in_page(gpa);
5254         pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
5255
5256         /*
5257          * Sometimes the guest OS writes only the last byte to update status
5258          * bits; for example, Linux's clear_bit() uses an andb instruction.
5259          */
5260         if (!(offset & (pte_size - 1)) && bytes == 1)
5261                 return false;
5262
5263         misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5264         misaligned |= bytes < 4;
5265
5266         return misaligned;
5267 }
5268
5269 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5270 {
5271         unsigned page_offset, quadrant;
5272         u64 *spte;
5273         int level;
5274
5275         page_offset = offset_in_page(gpa);
5276         level = sp->role.level;
5277         *nspte = 1;
5278         if (sp->role.has_4_byte_gpte) {
5279                 page_offset <<= 1;      /* 32->64 */
5280                 /*
5281                  * A 32-bit pde maps 4MB while the shadow pdes map
5282                  * only 2MB.  So we need to double the offset again
5283                  * and zap two pdes instead of one.
5284                  */
5285                 if (level == PT32_ROOT_LEVEL) {
5286                         page_offset &= ~7; /* kill rounding error */
5287                         page_offset <<= 1;
5288                         *nspte = 2;
5289                 }
5290                 quadrant = page_offset >> PAGE_SHIFT;
5291                 page_offset &= ~PAGE_MASK;
5292                 if (quadrant != sp->role.quadrant)
5293                         return NULL;
5294         }
5295
5296         spte = &sp->spt[page_offset / sizeof(*spte)];
5297         return spte;
5298 }
5299
5300 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
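/*
 * Page-track write callback: the guest wrote to a gfn that is shadowed by
 * KVM, i.e. a write-protected guest page table.  Zap the affected SPTEs,
 * or the entire shadow page if the write looks misaligned or flooded.
 */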
5301                               const u8 *new, int bytes,
5302                               struct kvm_page_track_notifier_node *node)
5303 {
5304         gfn_t gfn = gpa >> PAGE_SHIFT;
5305         struct kvm_mmu_page *sp;
5306         LIST_HEAD(invalid_list);
5307         u64 entry, gentry, *spte;
5308         int npte;
5309         bool flush = false;
5310
5311         /*
5312          * If we don't have indirect shadow pages, it means no page is
5313          * write-protected, so we can simply return.
5314          */
5315         if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5316                 return;
5317
5318         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5319
5320         /*
5321          * No need to care whether the memory allocation is successful
5322          * or not, since pte prefetch is skipped if the cache does not
5323          * have enough objects.
5324          */
5325         mmu_topup_memory_caches(vcpu, true);
5326
5327         write_lock(&vcpu->kvm->mmu_lock);
5328
5329         gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5330
5331         ++vcpu->kvm->stat.mmu_pte_write;
5332
5333         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5334                 if (detect_write_misaligned(sp, gpa, bytes) ||
5335                       detect_write_flooding(sp)) {
5336                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5337                         ++vcpu->kvm->stat.mmu_flooded;
5338                         continue;
5339                 }
5340
5341                 spte = get_written_sptes(sp, gpa, &npte);
5342                 if (!spte)
5343                         continue;
5344
5345                 while (npte--) {
5346                         entry = *spte;
5347                         mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
5348                         if (gentry && sp->role.level != PG_LEVEL_4K)
5349                                 ++vcpu->kvm->stat.mmu_pde_zapped;
5350                         if (need_remote_flush(entry, *spte))
5351                                 flush = true;
5352                         ++spte;
5353                 }
5354         }
5355         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
5356         write_unlock(&vcpu->kvm->mmu_lock);
5357 }
5358
5359 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5360                        void *insn, int insn_len)
5361 {
5362         int r, emulation_type = EMULTYPE_PF;
5363         bool direct = vcpu->arch.mmu->direct_map;
5364
5365         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
5366                 return RET_PF_RETRY;
5367
5368         r = RET_PF_INVALID;
5369         if (unlikely(error_code & PFERR_RSVD_MASK)) {
5370                 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5371                 if (r == RET_PF_EMULATE)
5372                         goto emulate;
5373         }
5374
5375         if (r == RET_PF_INVALID) {
5376                 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5377                                           lower_32_bits(error_code), false);
5378                 if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
5379                         return -EIO;
5380         }
5381
5382         if (r < 0)
5383                 return r;
5384         if (r != RET_PF_EMULATE)
5385                 return 1;
5386
5387         /*
5388          * Before emulating the instruction, check if the error code
5389          * was due to a RO violation while translating the guest page.
5390          * This can occur when using nested virtualization with nested
5391          * paging in both guests. If true, we simply unprotect the page
5392          * and resume the guest.
5393          */
5394         if (vcpu->arch.mmu->direct_map &&
5395             (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5396                 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5397                 return 1;
5398         }
5399
5400         /*
5401          * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5402          * optimistically try to just unprotect the page and let the processor
5403          * re-execute the instruction that caused the page fault.  Do not allow
5404          * retrying MMIO emulation, as it's not only pointless but could also
5405          * cause us to enter an infinite loop because the processor will keep
5406          * faulting on the non-existent MMIO address.  Retrying an instruction
5407          * from a nested guest is also pointless and dangerous as we are only
5408          * explicitly shadowing L1's page tables, i.e. unprotecting something
5409          * for L1 isn't going to magically fix whatever issue caused L2 to fail.
5410          */
5411         if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5412                 emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
5413 emulate:
5414         return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5415                                        insn_len);
5416 }
5417 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5418
5419 void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5420                             gva_t gva, hpa_t root_hpa)
5421 {
5422         int i;
5423
5424         /* It's actually a GPA for vcpu->arch.guest_mmu.  */
5425         if (mmu != &vcpu->arch.guest_mmu) {
5426                 /* INVLPG on a non-canonical address is a NOP according to the SDM.  */
5427                 if (is_noncanonical_address(gva, vcpu))
5428                         return;
5429
5430                 static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5431         }
5432
5433         if (!mmu->invlpg)
5434                 return;
5435
5436         if (root_hpa == INVALID_PAGE) {
5437                 mmu->invlpg(vcpu, gva, mmu->root.hpa);
5438
5439                 /*
5440                  * INVLPG is required to invalidate any global mappings for the VA,
5441                  * irrespective of PCID.  Since it would take roughly the same amount
5442                  * of work to determine whether any prev_root mapping of the VA is
5443                  * marked global as it would to just sync it blindly, we might as
5444                  * well always sync it.
5445                  *
5446                  * Mappings not reachable via the current cr3 or the prev_roots will be
5447                  * synced when switching to that cr3, so nothing needs to be done here
5448                  * for them.
5449                  */
5450                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5451                         if (VALID_PAGE(mmu->prev_roots[i].hpa))
5452                                 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5453         } else {
5454                 mmu->invlpg(vcpu, gva, root_hpa);
5455         }
5456 }
5457
5458 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5459 {
5460         kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
5461         ++vcpu->stat.invlpg;
5462 }
5463 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5464
5465
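/*
 * Handle a single-address invalidation for the given PCID (e.g. INVPCID
 * type 0): flush @gva from the current root if it uses @pcid, and from any
 * cached previous root whose PGD is tagged with @pcid.
 */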
5466 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5467 {
5468         struct kvm_mmu *mmu = vcpu->arch.mmu;
5469         bool tlb_flush = false;
5470         uint i;
5471
5472         if (pcid == kvm_get_active_pcid(vcpu)) {
5473                 mmu->invlpg(vcpu, gva, mmu->root.hpa);
5474                 tlb_flush = true;
5475         }
5476
5477         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5478                 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5479                     pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
5480                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5481                         tlb_flush = true;
5482                 }
5483         }
5484
5485         if (tlb_flush)
5486                 static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5487
5488         ++vcpu->stat.invlpg;
5489
5490         /*
5491          * Mappings not reachable via the current cr3 or the prev_roots will be
5492          * synced when switching to that cr3, so nothing needs to be done here
5493          * for them.
5494          */
5495 }
5496
5497 void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
5498                        int tdp_max_root_level, int tdp_huge_page_level)
5499 {
5500         tdp_enabled = enable_tdp;
5501         tdp_root_level = tdp_forced_root_level;
5502         max_tdp_level = tdp_max_root_level;
5503
5504         /*
5505          * max_huge_page_level reflects KVM's MMU capabilities irrespective
5506          * of kernel support, e.g. KVM may be capable of using 1GB pages when
5507          * the kernel is not.  But, KVM never creates a page size greater than
5508          * what is used by the kernel for any given HVA, i.e. the kernel's
5509          * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
5510          */
5511         if (tdp_enabled)
5512                 max_huge_page_level = tdp_huge_page_level;
5513         else if (boot_cpu_has(X86_FEATURE_GBPAGES))
5514                 max_huge_page_level = PG_LEVEL_1G;
5515         else
5516                 max_huge_page_level = PG_LEVEL_2M;
5517 }
5518 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
5519
5520 /* The return value indicates whether a TLB flush on all vCPUs is needed. */
5521 typedef bool (*slot_level_handler) (struct kvm *kvm,
5522                                     struct kvm_rmap_head *rmap_head,
5523                                     const struct kvm_memory_slot *slot);
5524
5525 /* The caller must hold mmu_lock for write before calling this function. */
5526 static __always_inline bool
5527 slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5528                         slot_level_handler fn, int start_level, int end_level,
5529                         gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
5530                         bool flush)
5531 {
5532         struct slot_rmap_walk_iterator iterator;
5533
5534         for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5535                         end_gfn, &iterator) {
5536                 if (iterator.rmap)
5537                         flush |= fn(kvm, iterator.rmap, memslot);
5538
5539                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
5540                         if (flush && flush_on_yield) {
5541                                 kvm_flush_remote_tlbs_with_address(kvm,
5542                                                 start_gfn,
5543                                                 iterator.gfn - start_gfn + 1);
5544                                 flush = false;
5545                         }
5546                         cond_resched_rwlock_write(&kvm->mmu_lock);
5547                 }
5548         }
5549
5550         return flush;
5551 }
5552
5553 static __always_inline bool
5554 slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5555                   slot_level_handler fn, int start_level, int end_level,
5556                   bool flush_on_yield)
5557 {
5558         return slot_handle_level_range(kvm, memslot, fn, start_level,
5559                         end_level, memslot->base_gfn,
5560                         memslot->base_gfn + memslot->npages - 1,
5561                         flush_on_yield, false);
5562 }
5563
5564 static __always_inline bool
5565 slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5566                      slot_level_handler fn, bool flush_on_yield)
5567 {
5568         return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
5569                                  PG_LEVEL_4K, flush_on_yield);
5570 }
5571
5572 static void free_mmu_pages(struct kvm_mmu *mmu)
5573 {
5574         if (!tdp_enabled && mmu->pae_root)
5575                 set_memory_encrypted((unsigned long)mmu->pae_root, 1);
5576         free_page((unsigned long)mmu->pae_root);
5577         free_page((unsigned long)mmu->pml4_root);
5578         free_page((unsigned long)mmu->pml5_root);
5579 }
5580
5581 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5582 {
5583         struct page *page;
5584         int i;
5585
5586         mmu->root.hpa = INVALID_PAGE;
5587         mmu->root.pgd = 0;
5588         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5589                 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5590
5591         /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
5592         if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
5593                 return 0;
5594
5595         /*
5596          * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5597          * while the PDP table is a per-vCPU construct that's allocated at MMU
5598          * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
5599          * x86_64.  Therefore we need to allocate the PDP table in the first
5600          * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
5601          * generally doesn't use PAE paging and can skip allocating the PDP
5602          * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
5603          * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
5604          * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
5605          */
5606         if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5607                 return 0;
5608
5609         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5610         if (!page)
5611                 return -ENOMEM;
5612
5613         mmu->pae_root = page_address(page);
5614
5615         /*
5616          * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
5617          * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
5618          * that KVM's writes and the CPU's reads get along.  Note, this is
5619          * only necessary when using shadow paging, as 64-bit NPT can get at
5620          * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
5621          * by 32-bit kernels (when KVM itself uses 32-bit NPT).
5622          */
5623         if (!tdp_enabled)
5624                 set_memory_decrypted((unsigned long)mmu->pae_root, 1);
5625         else
5626                 WARN_ON_ONCE(shadow_me_mask);
5627
5628         for (i = 0; i < 4; ++i)
5629                 mmu->pae_root[i] = INVALID_PAE_ROOT;
5630
5631         return 0;
5632 }
5633
5634 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5635 {
5636         int ret;
5637
5638         vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
5639         vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
5640
5641         vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
5642         vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
5643
5644         vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
5645
5646         vcpu->arch.mmu = &vcpu->arch.root_mmu;
5647         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5648
5649         ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
5650         if (ret)
5651                 return ret;
5652
5653         ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
5654         if (ret)
5655                 goto fail_allocate_root;
5656
5657         return ret;
5658  fail_allocate_root:
5659         free_mmu_pages(&vcpu->arch.guest_mmu);
5660         return ret;
5661 }
5662
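/* Yield mmu_lock, if needed, after zapping this many obsolete pages. */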
5663 #define BATCH_ZAP_PAGES 10
5664 static void kvm_zap_obsolete_pages(struct kvm *kvm)
5665 {
5666         struct kvm_mmu_page *sp, *node;
5667         int nr_zapped, batch = 0;
5668
5669 restart:
5670         list_for_each_entry_safe_reverse(sp, node,
5671               &kvm->arch.active_mmu_pages, link) {
5672                 /*
5673                  * No obsolete valid page exists before a newly created page
5674                  * since active_mmu_pages is a FIFO list.
5675                  */
5676                 if (!is_obsolete_sp(kvm, sp))
5677                         break;
5678
5679                 /*
5680                  * Invalid pages should never land back on the list of active
5681                  * pages.  Skip the bogus page, otherwise we'll get stuck in an
5682                  * infinite loop if the page gets put back on the list (again).
5683                  */
5684                 if (WARN_ON(sp->role.invalid))
5685                         continue;
5686
5687                 /*
5688                  * No need to flush the TLB since we're only zapping shadow
5689                  * pages with an obsolete generation number and all vCPUS have
5690                  * loaded a new root, i.e. the shadow pages being zapped cannot
5691                  * be in active use by the guest.
5692                  */
5693                 if (batch >= BATCH_ZAP_PAGES &&
5694                     cond_resched_rwlock_write(&kvm->mmu_lock)) {
5695                         batch = 0;
5696                         goto restart;
5697                 }
5698
5699                 if (__kvm_mmu_prepare_zap_page(kvm, sp,
5700                                 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
5701                         batch += nr_zapped;
5702                         goto restart;
5703                 }
5704         }
5705
5706         /*
5707          * Kick all vCPUs (via remote TLB flush) before freeing the page tables
5708          * to ensure KVM is not in the middle of a lockless shadow page table
5709          * walk, which may reference the pages.  The remote TLB flush itself is
5710          * not required and is simply a convenient way to kick vCPUs as needed.
5711          * KVM performs a local TLB flush when allocating a new root (see
5712          * kvm_mmu_load()), and the reload in the caller ensures no vCPUs are
5713          * running with an obsolete MMU.
5714          */
5715         kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5716 }
5717
5718 /*
5719  * Fast-invalidate all shadow pages, using a lock-break technique
5720  * to zap obsolete pages.
5721  *
5722  * This is required when a memslot is being deleted or the VM is being
5723  * destroyed; in those cases KVM must ensure that the MMU does not use
5724  * any resource of the slot being deleted (or of any slot, on VM
5725  * destruction) after this function returns.
5726  */
5727 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5728 {
5729         lockdep_assert_held(&kvm->slots_lock);
5730
5731         write_lock(&kvm->mmu_lock);
5732         trace_kvm_mmu_zap_all_fast(kvm);
5733
5734         /*
5735          * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
5736          * held for the entire duration of zapping obsolete pages, it's
5737          * impossible for there to be multiple invalid generations associated
5738          * with *valid* shadow pages at any given time, i.e. there is exactly
5739          * one valid generation and (at most) one invalid generation.
5740          */
5741         kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5742
5743         /*
5744          * In order to ensure all vCPUs drop their soon-to-be invalid roots,
5745          * invalidating TDP MMU roots must be done while holding mmu_lock for
5746          * write and in the same critical section as making the reload request,
5747          * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
5748          */
5749         if (is_tdp_mmu_enabled(kvm))
5750                 kvm_tdp_mmu_invalidate_all_roots(kvm);
5751
5752         /*
5753          * Notify all vcpus to reload their shadow page tables and flush the
5754          * TLB.  All vcpus will then switch to a new shadow page table with the
5755          * new mmu_valid_gen.
5756          *
5757          * Note: this must be done under the protection of mmu_lock; otherwise
5758          * a vcpu could purge a shadow page but miss the TLB flush.
5759          */
5760         kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
5761
5762         kvm_zap_obsolete_pages(kvm);
5763
5764         write_unlock(&kvm->mmu_lock);
5765
5766         /*
5767          * Zap the invalidated TDP MMU roots; all SPTEs must be dropped before
5768          * returning to the caller.  E.g. if the zap is in response to a memslot
5769          * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
5770          * associated with the deleted memslot once the update completes.
5771          * Deferring the zap until the final reference to the root is put would
5772          * lead to use-after-free.
5773          */
5774         if (is_tdp_mmu_enabled(kvm))
5775                 kvm_tdp_mmu_zap_invalidated_roots(kvm);
5776 }
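
/*
 * Editorial sketch (not part of the kernel source): the fast-invalidate
 * scheme above relies on every shadow page recording the mmu_valid_gen it
 * was created with.  Roughly, a shadow page is treated as obsolete once its
 * generation no longer matches the VM-wide value, along the lines of:
 *
 *	static bool sp_is_obsolete(struct kvm *kvm, struct kvm_mmu_page *sp)
 *	{
 *		// Hypothetical helper name; the real check lives elsewhere in
 *		// this file and also accounts for invalid and TDP MMU pages.
 *		return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
 *	}
 *
 * Obsolete pages are then zapped lazily by kvm_zap_obsolete_pages() above.
 */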
5777
5778 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5779 {
5780         return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5781 }
5782
5783 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5784                         struct kvm_memory_slot *slot,
5785                         struct kvm_page_track_notifier_node *node)
5786 {
5787         kvm_mmu_zap_all_fast(kvm);
5788 }
5789
5790 int kvm_mmu_init_vm(struct kvm *kvm)
5791 {
5792         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5793         int r;
5794
5795         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5796         INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
5797         INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
5798         spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
5799
5800         r = kvm_mmu_init_tdp_mmu(kvm);
5801         if (r < 0)
5802                 return r;
5803
5804         node->track_write = kvm_mmu_pte_write;
5805         node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5806         kvm_page_track_register_notifier(kvm, node);
5807         return 0;
5808 }
5809
5810 void kvm_mmu_uninit_vm(struct kvm *kvm)
5811 {
5812         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5813
5814         kvm_page_track_unregister_notifier(kvm, node);
5815
5816         kvm_mmu_uninit_tdp_mmu(kvm);
5817 }
5818
5819 static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5820 {
5821         const struct kvm_memory_slot *memslot;
5822         struct kvm_memslots *slots;
5823         struct kvm_memslot_iter iter;
5824         bool flush = false;
5825         gfn_t start, end;
5826         int i;
5827
5828         if (!kvm_memslots_have_rmaps(kvm))
5829                 return flush;
5830
5831         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5832                 slots = __kvm_memslots(kvm, i);
5833
5834                 kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
5835                         memslot = iter.slot;
5836                         start = max(gfn_start, memslot->base_gfn);
5837                         end = min(gfn_end, memslot->base_gfn + memslot->npages);
5838                         if (WARN_ON_ONCE(start >= end))
5839                                 continue;
5840
5841                         flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5843                                                         PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
5844                                                         start, end - 1, true, flush);
5845                 }
5846         }
5847
5848         return flush;
5849 }
5850
5851 /*
5852  * Invalidate (zap) SPTEs that cover GFNs from gfn_start up to, but not
5853  * including, gfn_end.
5854  */
5855 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5856 {
5857         bool flush;
5858         int i;
5859
5860         if (WARN_ON_ONCE(gfn_end <= gfn_start))
5861                 return;
5862
5863         write_lock(&kvm->mmu_lock);
5864
5865         kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
5866
5867         flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end);
5868
5869         if (is_tdp_mmu_enabled(kvm)) {
5870                 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
5871                         flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
5872                                                       gfn_end, true, flush);
5873         }
5874
5875         if (flush)
5876                 kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
5877                                                    gfn_end - gfn_start);
5878
5879         kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
5880
5881         write_unlock(&kvm->mmu_lock);
5882 }
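
/*
 * Editorial usage note (not part of the kernel source): the range is
 * half-open, i.e. [gfn_start, gfn_end).  For example, zapping a single 2MiB
 * region starting at GFN 0x100000 would look like:
 *
 *	kvm_zap_gfn_range(kvm, 0x100000, 0x100000 + 512);
 *
 * since a 2MiB region spans 512 4KiB GFNs.
 */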
5883
5884 static bool slot_rmap_write_protect(struct kvm *kvm,
5885                                     struct kvm_rmap_head *rmap_head,
5886                                     const struct kvm_memory_slot *slot)
5887 {
5888         return rmap_write_protect(rmap_head, false);
5889 }
5890
5891 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5892                                       const struct kvm_memory_slot *memslot,
5893                                       int start_level)
5894 {
5895         bool flush = false;
5896
5897         if (kvm_memslots_have_rmaps(kvm)) {
5898                 write_lock(&kvm->mmu_lock);
5899                 flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
5900                                           start_level, KVM_MAX_HUGEPAGE_LEVEL,
5901                                           false);
5902                 write_unlock(&kvm->mmu_lock);
5903         }
5904
5905         if (is_tdp_mmu_enabled(kvm)) {
5906                 read_lock(&kvm->mmu_lock);
5907                 flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
5908                 read_unlock(&kvm->mmu_lock);
5909         }
5910
5911         /*
5912          * Flush TLBs if any SPTEs had to be write-protected to ensure that
5913          * guest writes are reflected in the dirty bitmap before the memslot
5914          * update completes, i.e. before enabling dirty logging is visible to
5915          * userspace.
5916          *
5917          * Perform the TLB flush outside the mmu_lock to reduce the amount of
5918          * time the lock is held. However, this does mean that another CPU can
5919          * now grab mmu_lock and encounter a write-protected SPTE while CPUs
5920          * still have a writable mapping for the associated GFN in their TLB.
5921          *
5922          * This is safe but requires KVM to be careful when making decisions
5923          * based on the write-protection status of an SPTE. Specifically, KVM
5924          * also write-protects SPTEs to monitor changes to guest page tables
5925          * during shadow paging, and must guarantee no CPUs can write to those
5926          * pages before the lock is dropped. As mentioned in the previous
5927          * paragraph, a write-protected SPTE is no guarantee that a CPU cannot
5928          * perform writes. So to determine if a TLB flush is truly required, KVM
5929          * will clear a separate software-only bit (MMU-writable) and skip the
5930          * flush if-and-only-if this bit was already clear.
5931          *
5932          * See is_writable_pte() for more details.
5933          */
5934         if (flush)
5935                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
5936 }
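
/*
 * Editorial sketch (not part of the kernel source): conceptually, the flush
 * decision described above reduces to something like
 *
 *	if (spte_was_mmu_writable && !spte_is_mmu_writable)
 *		flush = true;	// hypothetical names for the bit transition
 *
 * i.e. only clearing the software-only MMU-writable bit forces a TLB flush;
 * if that bit was already clear, the SPTE cannot be backed by a stale
 * writable TLB entry and the flush can be skipped.  See is_writable_pte()
 * for the real rules.
 */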
5937
5938 /* Must be called with the mmu_lock held in write-mode. */
5939 void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
5940                                    const struct kvm_memory_slot *memslot,
5941                                    u64 start, u64 end,
5942                                    int target_level)
5943 {
5944         if (is_tdp_mmu_enabled(kvm))
5945                 kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
5946                                                  target_level, false);
5947
5948         /*
5949          * A TLB flush is unnecessary at this point for the same reasons as in
5950          * kvm_mmu_slot_try_split_huge_pages().
5951          */
5952 }
5953
5954 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
5955                                         const struct kvm_memory_slot *memslot,
5956                                         int target_level)
5957 {
5958         u64 start = memslot->base_gfn;
5959         u64 end = start + memslot->npages;
5960
5961         if (is_tdp_mmu_enabled(kvm)) {
5962                 read_lock(&kvm->mmu_lock);
5963                 kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
5964                 read_unlock(&kvm->mmu_lock);
5965         }
5966
5967         /*
5968          * No TLB flush is necessary here. KVM will flush TLBs after
5969          * write-protecting and/or clearing dirty on the newly split SPTEs to
5970          * ensure that guest writes are reflected in the dirty log before the
5971          * ioctl to enable dirty logging on this memslot completes. Since the
5972          * split SPTEs retain the write and dirty bits of the huge SPTE, it is
5973          * safe for KVM to decide if a TLB flush is necessary based on the split
5974          * SPTEs.
5975          */
5976 }
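
/*
 * Editorial note (not part of the kernel source): target_level selects how
 * far huge SPTEs are split; e.g. splitting to PG_LEVEL_4K turns each 2MiB
 * SPTE into 512 4KiB SPTEs.  A hypothetical call site splitting an entire
 * memslot ahead of dirty logging might look like:
 *
 *	kvm_mmu_slot_try_split_huge_pages(kvm, memslot, PG_LEVEL_4K);
 */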
5977
5978 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
5979                                          struct kvm_rmap_head *rmap_head,
5980                                          const struct kvm_memory_slot *slot)
5981 {
5982         u64 *sptep;
5983         struct rmap_iterator iter;
5984         int need_tlb_flush = 0;
5985         kvm_pfn_t pfn;
5986         struct kvm_mmu_page *sp;
5987
5988 restart:
5989         for_each_rmap_spte(rmap_head, &iter, sptep) {
5990                 sp = sptep_to_sp(sptep);
5991                 pfn = spte_to_pfn(*sptep);
5992
5993                 /*
5994                  * We cannot do huge page mapping for indirect shadow pages,
5995                  * which are found on the last rmap (level = 1) when not using
5996                  * tdp; such shadow pages are synced with the page table in
5997                  * the guest, and the guest page table uses 4K page size
5998                  * mappings if the indirect sp has level = 1.
5999                  */
6000                 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
6001                     sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
6002                                                                pfn, PG_LEVEL_NUM)) {
6003                         pte_list_remove(kvm, rmap_head, sptep);
6004
6005                         if (kvm_available_flush_tlb_with_range())
6006                                 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
6007                                         KVM_PAGES_PER_HPAGE(sp->role.level));
6008                         else
6009                                 need_tlb_flush = 1;
6010
6011                         goto restart;
6012                 }
6013         }
6014
6015         return need_tlb_flush;
6016 }
6017
6018 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
6019                                    const struct kvm_memory_slot *slot)
6020 {
6021         if (kvm_memslots_have_rmaps(kvm)) {
6022                 write_lock(&kvm->mmu_lock);
6023                 /*
6024                  * Zap only 4k SPTEs since the legacy MMU only supports dirty
6025                  * logging at a 4k granularity and never creates collapsible
6026                  * 2m SPTEs during dirty logging.
6027                  */
6028                 if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
6029                         kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
6030                 write_unlock(&kvm->mmu_lock);
6031         }
6032
6033         if (is_tdp_mmu_enabled(kvm)) {
6034                 read_lock(&kvm->mmu_lock);
6035                 kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
6036                 read_unlock(&kvm->mmu_lock);
6037         }
6038 }
6039
6040 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
6041                                         const struct kvm_memory_slot *memslot)
6042 {
6043         /*
6044          * All current use cases for flushing the TLBs for a specific memslot
6045          * are related to dirty logging, and many do the TLB flush out of mmu_lock.
6046          * The interaction between the various operations on the memslot must be
6047          * serialized by slots_lock to ensure the TLB flush from one operation
6048          * is observed by any other operation on the same memslot.
6049          */
6050         lockdep_assert_held(&kvm->slots_lock);
6051         kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6052                                            memslot->npages);
6053 }
6054
6055 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6056                                    const struct kvm_memory_slot *memslot)
6057 {
6058         bool flush = false;
6059
6060         if (kvm_memslots_have_rmaps(kvm)) {
6061                 write_lock(&kvm->mmu_lock);
6062                 /*
6063                  * Clear dirty bits only on 4k SPTEs since the legacy MMU only
6064                  * supports dirty logging at a 4k granularity.
6065                  */
6066                 flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
6067                 write_unlock(&kvm->mmu_lock);
6068         }
6069
6070         if (is_tdp_mmu_enabled(kvm)) {
6071                 read_lock(&kvm->mmu_lock);
6072                 flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
6073                 read_unlock(&kvm->mmu_lock);
6074         }
6075
6076         /*
6077          * It's also safe to flush TLBs out of mmu_lock here as currently this
6078          * function is only used for dirty logging, in which case flushing TLBs
6079          * out of mmu_lock also guarantees no dirty pages will be lost in the
6080          * dirty_bitmap.
6081          */
6082         if (flush)
6083                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
6084 }
6085
6086 void kvm_mmu_zap_all(struct kvm *kvm)
6087 {
6088         struct kvm_mmu_page *sp, *node;
6089         LIST_HEAD(invalid_list);
6090         int ign;
6091
6092         write_lock(&kvm->mmu_lock);
6093 restart:
6094         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6095                 if (WARN_ON(sp->role.invalid))
6096                         continue;
6097                 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6098                         goto restart;
6099                 if (cond_resched_rwlock_write(&kvm->mmu_lock))
6100                         goto restart;
6101         }
6102
6103         kvm_mmu_commit_zap_page(kvm, &invalid_list);
6104
6105         if (is_tdp_mmu_enabled(kvm))
6106                 kvm_tdp_mmu_zap_all(kvm);
6107
6108         write_unlock(&kvm->mmu_lock);
6109 }
6110
6111 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6112 {
6113         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6114
6115         gen &= MMIO_SPTE_GEN_MASK;
6116
6117         /*
6118          * Generation numbers are incremented in multiples of the number of
6119          * address spaces in order to provide unique generations across all
6120          * address spaces.  Strip what is effectively the address space
6121          * modifier prior to checking for a wrap of the MMIO generation so
6122          * that a wrap in any address space is detected.
6123          */
6124         gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6125
6126         /*
6127          * The very rare case: if the MMIO generation number has wrapped,
6128          * zap all shadow pages.
6129          */
6130         if (unlikely(gen == 0)) {
6131                 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
6132                 kvm_mmu_zap_all_fast(kvm);
6133         }
6134 }
6135
6136 static unsigned long
6137 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
6138 {
6139         struct kvm *kvm;
6140         int nr_to_scan = sc->nr_to_scan;
6141         unsigned long freed = 0;
6142
6143         mutex_lock(&kvm_lock);
6144
6145         list_for_each_entry(kvm, &vm_list, vm_list) {
6146                 int idx;
6147                 LIST_HEAD(invalid_list);
6148
6149                 /*
6150                  * Never scan more than sc->nr_to_scan VM instances.
6151                  * In practice this condition is never hit, since we do not try
6152                  * to shrink more than one VM and it is very unlikely to see
6153                  * !n_used_mmu_pages so many times.
6154                  */
6155                 if (!nr_to_scan--)
6156                         break;
6157                 /*
6158                  * n_used_mmu_pages is accessed without holding kvm->mmu_lock
6159                  * here. We may skip a VM instance erroneously, but we do not
6160                  * want to shrink a VM that only started to populate its MMU
6161                  * anyway.
6162                  */
6163                 if (!kvm->arch.n_used_mmu_pages &&
6164                     !kvm_has_zapped_obsolete_pages(kvm))
6165                         continue;
6166
6167                 idx = srcu_read_lock(&kvm->srcu);
6168                 write_lock(&kvm->mmu_lock);
6169
6170                 if (kvm_has_zapped_obsolete_pages(kvm)) {
6171                         kvm_mmu_commit_zap_page(kvm,
6172                               &kvm->arch.zapped_obsolete_pages);
6173                         goto unlock;
6174                 }
6175
6176                 freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
6177
6178 unlock:
6179                 write_unlock(&kvm->mmu_lock);
6180                 srcu_read_unlock(&kvm->srcu, idx);
6181
6182                 /*
6183                  * unfair on small ones
6184                  * per-vm shrinkers cry out
6185                  * sadness comes quickly
6186                  */
6187                 list_move_tail(&kvm->vm_list, &vm_list);
6188                 break;
6189         }
6190
6191         mutex_unlock(&kvm_lock);
6192         return freed;
6193 }
6194
6195 static unsigned long
6196 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6197 {
6198         return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6199 }
6200
6201 static struct shrinker mmu_shrinker = {
6202         .count_objects = mmu_shrink_count,
6203         .scan_objects = mmu_shrink_scan,
6204         .seeks = DEFAULT_SEEKS * 10,
6205 };
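
/*
 * Editorial note (not part of the kernel source): DEFAULT_SEEKS * 10 tells
 * the core shrinker code that MMU pages are relatively expensive to
 * recreate, so the MMU shrinker is asked to reclaim comparatively few
 * objects under memory pressure.
 */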
6206
6207 static void mmu_destroy_caches(void)
6208 {
6209         kmem_cache_destroy(pte_list_desc_cache);
6210         kmem_cache_destroy(mmu_page_header_cache);
6211 }
6212
6213 static bool get_nx_auto_mode(void)
6214 {
6215         /* Return true when CPU has the bug, and mitigations are ON */
6216         return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6217 }
6218
6219 static void __set_nx_huge_pages(bool val)
6220 {
6221         nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6222 }
6223
6224 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6225 {
6226         bool old_val = nx_huge_pages;
6227         bool new_val;
6228
6229         /* In "auto" mode, deploy the workaround only if the CPU has the bug. */
6230         if (sysfs_streq(val, "off"))
6231                 new_val = 0;
6232         else if (sysfs_streq(val, "force"))
6233                 new_val = 1;
6234         else if (sysfs_streq(val, "auto"))
6235                 new_val = get_nx_auto_mode();
6236         else if (strtobool(val, &new_val) < 0)
6237                 return -EINVAL;
6238
6239         __set_nx_huge_pages(new_val);
6240
6241         if (new_val != old_val) {
6242                 struct kvm *kvm;
6243
6244                 mutex_lock(&kvm_lock);
6245
6246                 list_for_each_entry(kvm, &vm_list, vm_list) {
6247                         mutex_lock(&kvm->slots_lock);
6248                         kvm_mmu_zap_all_fast(kvm);
6249                         mutex_unlock(&kvm->slots_lock);
6250
6251                         wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6252                 }
6253                 mutex_unlock(&kvm_lock);
6254         }
6255
6256         return 0;
6257 }
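
/*
 * Editorial usage note (not part of the kernel source): nx_huge_pages is a
 * runtime-writable module parameter, so the handler above can be exercised
 * from userspace, e.g.:
 *
 *	echo force > /sys/module/kvm/parameters/nx_huge_pages
 *
 * When the effective value changes, every VM's shadow pages are zapped (so
 * that the new NX huge page policy takes effect) and each VM's recovery
 * thread is woken.
 */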
6258
6259 /*
6260  * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
6261  * its default value of -1 is technically undefined behavior for a boolean.
6262  */
6263 void kvm_mmu_x86_module_init(void)
6264 {
6265         if (nx_huge_pages == -1)
6266                 __set_nx_huge_pages(get_nx_auto_mode());
6267 }
6268
6269 /*
6270  * The bulk of the MMU initialization is deferred until the vendor module is
6271  * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
6272  * to be reset when a potentially different vendor module is loaded.
6273  */
6274 int kvm_mmu_vendor_module_init(void)
6275 {
6276         int ret = -ENOMEM;
6277
6278         /*
6279          * MMU roles use union aliasing which is, generally speaking, an
6280          * MMU roles use union aliasing which is, generally speaking,
6281          * undefined behavior. However, we supposedly know how compilers behave
6282          * and the current status quo is unlikely to change. The guards below
6283          * are there to let us know if that assumption ever becomes false.
6284         BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6285         BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6286         BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
6287
6288         kvm_mmu_reset_all_pte_masks();
6289
6290         pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6291                                             sizeof(struct pte_list_desc),
6292                                             0, SLAB_ACCOUNT, NULL);
6293         if (!pte_list_desc_cache)
6294                 goto out;
6295
6296         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6297                                                   sizeof(struct kvm_mmu_page),
6298                                                   0, SLAB_ACCOUNT, NULL);
6299         if (!mmu_page_header_cache)
6300                 goto out;
6301
6302         if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6303                 goto out;
6304
6305         ret = register_shrinker(&mmu_shrinker);
6306         if (ret)
6307                 goto out;
6308
6309         return 0;
6310
6311 out:
6312         mmu_destroy_caches();
6313         return ret;
6314 }
6315
6316 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6317 {
6318         kvm_mmu_unload(vcpu);
6319         free_mmu_pages(&vcpu->arch.root_mmu);
6320         free_mmu_pages(&vcpu->arch.guest_mmu);
6321         mmu_free_memory_caches(vcpu);
6322 }
6323
6324 void kvm_mmu_vendor_module_exit(void)
6325 {
6326         mmu_destroy_caches();
6327         percpu_counter_destroy(&kvm_total_used_mmu_pages);
6328         unregister_shrinker(&mmu_shrinker);
6329 }
6330
6331 /*
6332  * Calculate the effective recovery period, accounting for '0' meaning "let KVM
6333  * select a halving time of 1 hour".  Returns true if recovery is enabled.
6334  */
6335 static bool calc_nx_huge_pages_recovery_period(uint *period)
6336 {
6337         /*
6338          * Use READ_ONCE to get the params, this may be called outside of the
6339          * param setters, e.g. by the kthread to compute its next timeout.
6340          */
6341         bool enabled = READ_ONCE(nx_huge_pages);
6342         uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6343
6344         if (!enabled || !ratio)
6345                 return false;
6346
6347         *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
6348         if (!*period) {
6349                 /* Make sure the period is not less than one second.  */
6350                 ratio = min(ratio, 3600u);
6351                 *period = 60 * 60 * 1000 / ratio;
6352         }
6353         return true;
6354 }
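
/*
 * Editorial worked example (not part of the kernel source): with the default
 * ratio of 60 and nx_huge_pages_recovery_period_ms left at 0, the period
 * works out to 60 * 60 * 1000 / 60 = 60000 ms, i.e. the recovery worker runs
 * once per minute and each run targets 1/60th of the currently split NX huge
 * pages.  Clamping the ratio to 3600 keeps the computed period >= 1000 ms.
 */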
6355
6356 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
6357 {
6358         bool was_recovery_enabled, is_recovery_enabled;
6359         uint old_period, new_period;
6360         int err;
6361
6362         was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
6363
6364         err = param_set_uint(val, kp);
6365         if (err)
6366                 return err;
6367
6368         is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
6369
6370         if (is_recovery_enabled &&
6371             (!was_recovery_enabled || old_period > new_period)) {
6372                 struct kvm *kvm;
6373
6374                 mutex_lock(&kvm_lock);
6375
6376                 list_for_each_entry(kvm, &vm_list, vm_list)
6377                         wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6378
6379                 mutex_unlock(&kvm_lock);
6380         }
6381
6382         return err;
6383 }
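
/*
 * Editorial usage note (not part of the kernel source): both recovery
 * parameters are runtime-writable, e.g.:
 *
 *	echo 10    > /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio
 *	echo 30000 > /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms
 *
 * The handler above wakes the per-VM recovery threads only when the change
 * enables recovery or shortens the effective period, so a worker that is
 * already sleeping does not overshoot the new, shorter deadline.
 */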
6384
6385 static void kvm_recover_nx_lpages(struct kvm *kvm)
6386 {
6387         unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
6388         int rcu_idx;
6389         struct kvm_mmu_page *sp;
6390         unsigned int ratio;
6391         LIST_HEAD(invalid_list);
6392         bool flush = false;
6393         ulong to_zap;
6394
6395         rcu_idx = srcu_read_lock(&kvm->srcu);
6396         write_lock(&kvm->mmu_lock);
6397
6398         /*
6399          * Zapping TDP MMU shadow pages, including the remote TLB flush, must
6400          * be done under RCU protection, because the pages are freed via RCU
6401          * callback.
6402          */
6403         rcu_read_lock();
6404
6405         ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6406         to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
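        /*
         * Editorial note: e.g. with the default ratio of 60 and 1200
         * NX-disallowed huge pages currently split, each pass zaps
         * DIV_ROUND_UP(1200, 60) = 20 shadow pages.
         */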
6407         for ( ; to_zap; --to_zap) {
6408                 if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
6409                         break;
6410
6411                 /*
6412                  * We use a separate list instead of just using active_mmu_pages
6413                  * because the number of lpage_disallowed pages is expected to
6414                  * be relatively small compared to the total.
6415                  */
6416                 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6417                                       struct kvm_mmu_page,
6418                                       lpage_disallowed_link);
6419                 WARN_ON_ONCE(!sp->lpage_disallowed);
6420                 if (is_tdp_mmu_page(sp)) {
6421                         flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
6422                 } else {
6423                         kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6424                         WARN_ON_ONCE(sp->lpage_disallowed);
6425                 }
6426
6427                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
6428                         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6429                         rcu_read_unlock();
6430
6431                         cond_resched_rwlock_write(&kvm->mmu_lock);
6432                         flush = false;
6433
6434                         rcu_read_lock();
6435                 }
6436         }
6437         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6438
6439         rcu_read_unlock();
6440
6441         write_unlock(&kvm->mmu_lock);
6442         srcu_read_unlock(&kvm->srcu, rcu_idx);
6443 }
6444
6445 static long get_nx_lpage_recovery_timeout(u64 start_time)
6446 {
6447         bool enabled;
6448         uint period;
6449
6450         enabled = calc_nx_huge_pages_recovery_period(&period);
6451
6452         return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
6453                        : MAX_SCHEDULE_TIMEOUT;
6454 }
6455
6456 static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6457 {
6458         u64 start_time;
6459         long remaining_time;
6460
6461         while (true) {
6462                 start_time = get_jiffies_64();
6463                 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6464
6465                 set_current_state(TASK_INTERRUPTIBLE);
6466                 while (!kthread_should_stop() && remaining_time > 0) {
6467                         schedule_timeout(remaining_time);
6468                         remaining_time = get_nx_lpage_recovery_timeout(start_time);
6469                         set_current_state(TASK_INTERRUPTIBLE);
6470                 }
6471
6472                 set_current_state(TASK_RUNNING);
6473
6474                 if (kthread_should_stop())
6475                         return 0;
6476
6477                 kvm_recover_nx_lpages(kvm);
6478         }
6479 }
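
/*
 * Editorial note (not part of the kernel source): the worker sleeps in
 * TASK_INTERRUPTIBLE so that wake_up_process() from the parameter setters
 * above cuts the sleep short; the timeout is then recomputed against the
 * original start_time, so a shortened period that has already elapsed makes
 * the recovery pass run immediately.
 */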
6480
6481 int kvm_mmu_post_init_vm(struct kvm *kvm)
6482 {
6483         int err;
6484
6485         err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6486                                           "kvm-nx-lpage-recovery",
6487                                           &kvm->arch.nx_lpage_recovery_thread);
6488         if (!err)
6489                 kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6490
6491         return err;
6492 }
6493
6494 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6495 {
6496         if (kvm->arch.nx_lpage_recovery_thread)
6497                 kthread_stop(kvm->arch.nx_lpage_recovery_thread);
6498 }