KVM: x86/mmu: Add sptep_to_sp() helper to wrap shadow page lookup
[linux-2.6-microblaze.git] / arch / x86 / kvm / mmu / mmu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * This module enables machines with Intel VT-x extensions to run virtual
6  * machines without emulation or binary translation.
7  *
8  * MMU support
9  *
10  * Copyright (C) 2006 Qumranet, Inc.
11  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12  *
13  * Authors:
14  *   Yaniv Kamay  <yaniv@qumranet.com>
15  *   Avi Kivity   <avi@qumranet.com>
16  */
17
18 #include "irq.h"
19 #include "ioapic.h"
20 #include "mmu.h"
21 #include "mmu_internal.h"
22 #include "x86.h"
23 #include "kvm_cache_regs.h"
24 #include "kvm_emulate.h"
25 #include "cpuid.h"
26
27 #include <linux/kvm_host.h>
28 #include <linux/types.h>
29 #include <linux/string.h>
30 #include <linux/mm.h>
31 #include <linux/highmem.h>
32 #include <linux/moduleparam.h>
33 #include <linux/export.h>
34 #include <linux/swap.h>
35 #include <linux/hugetlb.h>
36 #include <linux/compiler.h>
37 #include <linux/srcu.h>
38 #include <linux/slab.h>
39 #include <linux/sched/signal.h>
40 #include <linux/uaccess.h>
41 #include <linux/hash.h>
42 #include <linux/kern_levels.h>
43 #include <linux/kthread.h>
44
45 #include <asm/page.h>
46 #include <asm/memtype.h>
47 #include <asm/cmpxchg.h>
48 #include <asm/e820/api.h>
49 #include <asm/io.h>
50 #include <asm/vmx.h>
51 #include <asm/kvm_page_track.h>
52 #include "trace.h"
53
54 extern bool itlb_multihit_kvm_mitigation;
55
56 static int __read_mostly nx_huge_pages = -1;
57 #ifdef CONFIG_PREEMPT_RT
58 /* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
59 static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
60 #else
61 static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
62 #endif
63
64 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
65 static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
66
67 static struct kernel_param_ops nx_huge_pages_ops = {
68         .set = set_nx_huge_pages,
69         .get = param_get_bool,
70 };
71
72 static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
73         .set = set_nx_huge_pages_recovery_ratio,
74         .get = param_get_uint,
75 };
76
77 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
78 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
79 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
80                 &nx_huge_pages_recovery_ratio, 0644);
81 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
82
83 static bool __read_mostly force_flush_and_sync_on_reuse;
84 module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
85
86 /*
87  * When setting this variable to true it enables Two-Dimensional-Paging
88  * where the hardware walks 2 page tables:
89  * 1. the guest-virtual to guest-physical
90  * 2. while doing 1. it walks guest-physical to host-physical
91  * If the hardware supports that we don't need to do shadow paging.
92  */
93 bool tdp_enabled = false;
94
95 static int max_page_level __read_mostly;
96
97 enum {
98         AUDIT_PRE_PAGE_FAULT,
99         AUDIT_POST_PAGE_FAULT,
100         AUDIT_PRE_PTE_WRITE,
101         AUDIT_POST_PTE_WRITE,
102         AUDIT_PRE_SYNC,
103         AUDIT_POST_SYNC
104 };
105
106 #undef MMU_DEBUG
107
108 #ifdef MMU_DEBUG
109 static bool dbg = 0;
110 module_param(dbg, bool, 0644);
111
112 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
113 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
114 #define MMU_WARN_ON(x) WARN_ON(x)
115 #else
116 #define pgprintk(x...) do { } while (0)
117 #define rmap_printk(x...) do { } while (0)
118 #define MMU_WARN_ON(x) do { } while (0)
119 #endif
120
121 #define PTE_PREFETCH_NUM                8
122
123 #define PT_FIRST_AVAIL_BITS_SHIFT 10
124 #define PT64_SECOND_AVAIL_BITS_SHIFT 54
125
126 /*
127  * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
128  * Access Tracking SPTEs.
129  */
130 #define SPTE_SPECIAL_MASK (3ULL << 52)
131 #define SPTE_AD_ENABLED_MASK (0ULL << 52)
132 #define SPTE_AD_DISABLED_MASK (1ULL << 52)
133 #define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
134 #define SPTE_MMIO_MASK (3ULL << 52)
135
136 #define PT64_LEVEL_BITS 9
137
138 #define PT64_LEVEL_SHIFT(level) \
139                 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
140
141 #define PT64_INDEX(address, level)\
142         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
143
144
145 #define PT32_LEVEL_BITS 10
146
147 #define PT32_LEVEL_SHIFT(level) \
148                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
149
150 #define PT32_LVL_OFFSET_MASK(level) \
151         (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
152                                                 * PT32_LEVEL_BITS))) - 1))
153
154 #define PT32_INDEX(address, level)\
155         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
156
157
158 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
159 #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
160 #else
161 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
162 #endif
163 #define PT64_LVL_ADDR_MASK(level) \
164         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
165                                                 * PT64_LEVEL_BITS))) - 1))
166 #define PT64_LVL_OFFSET_MASK(level) \
167         (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
168                                                 * PT64_LEVEL_BITS))) - 1))
169
170 #define PT32_BASE_ADDR_MASK PAGE_MASK
171 #define PT32_DIR_BASE_ADDR_MASK \
172         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
173 #define PT32_LVL_ADDR_MASK(level) \
174         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
175                                             * PT32_LEVEL_BITS))) - 1))
176
177 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
178                         | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
179
180 #define ACC_EXEC_MASK    1
181 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
182 #define ACC_USER_MASK    PT_USER_MASK
183 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
184
185 /* The mask for the R/X bits in EPT PTEs */
186 #define PT64_EPT_READABLE_MASK                  0x1ull
187 #define PT64_EPT_EXECUTABLE_MASK                0x4ull
188
189 #include <trace/events/kvm.h>
190
191 #define SPTE_HOST_WRITEABLE     (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
192 #define SPTE_MMU_WRITEABLE      (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
193
194 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
195
196 /* make pte_list_desc fit well in cache line */
197 #define PTE_LIST_EXT 3
198
199 /*
200  * Return values of handle_mmio_page_fault and mmu.page_fault:
201  * RET_PF_RETRY: let CPU fault again on the address.
202  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
203  *
204  * For handle_mmio_page_fault only:
205  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
206  */
207 enum {
208         RET_PF_RETRY = 0,
209         RET_PF_EMULATE = 1,
210         RET_PF_INVALID = 2,
211 };
212
213 struct pte_list_desc {
214         u64 *sptes[PTE_LIST_EXT];
215         struct pte_list_desc *more;
216 };
217
218 struct kvm_shadow_walk_iterator {
219         u64 addr;
220         hpa_t shadow_addr;
221         u64 *sptep;
222         int level;
223         unsigned index;
224 };
225
226 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
227         for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
228                                          (_root), (_addr));                \
229              shadow_walk_okay(&(_walker));                                 \
230              shadow_walk_next(&(_walker)))
231
232 #define for_each_shadow_entry(_vcpu, _addr, _walker)            \
233         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
234              shadow_walk_okay(&(_walker));                      \
235              shadow_walk_next(&(_walker)))
236
237 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)     \
238         for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
239              shadow_walk_okay(&(_walker)) &&                            \
240                 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
241              __shadow_walk_next(&(_walker), spte))
242
243 static struct kmem_cache *pte_list_desc_cache;
244 static struct kmem_cache *mmu_page_header_cache;
245 static struct percpu_counter kvm_total_used_mmu_pages;
246
247 static u64 __read_mostly shadow_nx_mask;
248 static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
249 static u64 __read_mostly shadow_user_mask;
250 static u64 __read_mostly shadow_accessed_mask;
251 static u64 __read_mostly shadow_dirty_mask;
252 static u64 __read_mostly shadow_mmio_value;
253 static u64 __read_mostly shadow_mmio_access_mask;
254 static u64 __read_mostly shadow_present_mask;
255 static u64 __read_mostly shadow_me_mask;
256
257 /*
258  * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
259  * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
260  * pages.
261  */
262 static u64 __read_mostly shadow_acc_track_mask;
263
264 /*
265  * The mask/shift to use for saving the original R/X bits when marking the PTE
266  * as not-present for access tracking purposes. We do not save the W bit as the
267  * PTEs being access tracked also need to be dirty tracked, so the W bit will be
268  * restored only when a write is attempted to the page.
269  */
270 static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
271                                                     PT64_EPT_EXECUTABLE_MASK;
272 static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
273
274 /*
275  * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
276  * to guard against L1TF attacks.
277  */
278 static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
279
280 /*
281  * The number of high-order 1 bits to use in the mask above.
282  */
283 static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
284
285 /*
286  * In some cases, we need to preserve the GFN of a non-present or reserved
287  * SPTE when we usurp the upper five bits of the physical address space to
288  * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
289  * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
290  * left into the reserved bits, i.e. the GFN in the SPTE will be split into
291  * high and low parts.  This mask covers the lower bits of the GFN.
292  */
293 static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
294
295 /*
296  * The number of non-reserved physical address bits irrespective of features
297  * that repurpose legal bits, e.g. MKTME.
298  */
299 static u8 __read_mostly shadow_phys_bits;
300
301 static void mmu_spte_set(u64 *sptep, u64 spte);
302 static bool is_executable_pte(u64 spte);
303 static union kvm_mmu_page_role
304 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
305
306 #define CREATE_TRACE_POINTS
307 #include "mmutrace.h"
308
309
310 static inline bool kvm_available_flush_tlb_with_range(void)
311 {
312         return kvm_x86_ops.tlb_remote_flush_with_range;
313 }
314
315 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
316                 struct kvm_tlb_range *range)
317 {
318         int ret = -ENOTSUPP;
319
320         if (range && kvm_x86_ops.tlb_remote_flush_with_range)
321                 ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range);
322
323         if (ret)
324                 kvm_flush_remote_tlbs(kvm);
325 }
326
327 static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
328                 u64 start_gfn, u64 pages)
329 {
330         struct kvm_tlb_range range;
331
332         range.start_gfn = start_gfn;
333         range.pages = pages;
334
335         kvm_flush_remote_tlbs_with_range(kvm, &range);
336 }
337
338 void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
339 {
340         BUG_ON((u64)(unsigned)access_mask != access_mask);
341         WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
342         WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
343         shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
344         shadow_mmio_access_mask = access_mask;
345 }
346 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
347
348 static bool is_mmio_spte(u64 spte)
349 {
350         return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
351 }
352
353 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
354 {
355         return sp->role.ad_disabled;
356 }
357
358 static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
359 {
360         /*
361          * When using the EPT page-modification log, the GPAs in the log
362          * would come from L2 rather than L1.  Therefore, we need to rely
363          * on write protection to record dirty pages.  This also bypasses
364          * PML, since writes now result in a vmexit.
365          */
366         return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
367 }
368
369 static inline bool spte_ad_enabled(u64 spte)
370 {
371         MMU_WARN_ON(is_mmio_spte(spte));
372         return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
373 }
374
375 static inline bool spte_ad_need_write_protect(u64 spte)
376 {
377         MMU_WARN_ON(is_mmio_spte(spte));
378         return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
379 }
380
381 static bool is_nx_huge_page_enabled(void)
382 {
383         return READ_ONCE(nx_huge_pages);
384 }
385
386 static inline u64 spte_shadow_accessed_mask(u64 spte)
387 {
388         MMU_WARN_ON(is_mmio_spte(spte));
389         return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
390 }
391
392 static inline u64 spte_shadow_dirty_mask(u64 spte)
393 {
394         MMU_WARN_ON(is_mmio_spte(spte));
395         return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
396 }
397
398 static inline bool is_access_track_spte(u64 spte)
399 {
400         return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
401 }
402
403 /*
404  * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
405  * the memslots generation and is derived as follows:
406  *
407  * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
408  * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
409  *
410  * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
411  * the MMIO generation number, as doing so would require stealing a bit from
412  * the "real" generation number and thus effectively halve the maximum number
413  * of MMIO generations that can be handled before encountering a wrap (which
414  * requires a full MMU zap).  The flag is instead explicitly queried when
415  * checking for MMIO spte cache hits.
416  */
417 #define MMIO_SPTE_GEN_MASK              GENMASK_ULL(17, 0)
418
419 #define MMIO_SPTE_GEN_LOW_START         3
420 #define MMIO_SPTE_GEN_LOW_END           11
421 #define MMIO_SPTE_GEN_LOW_MASK          GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
422                                                     MMIO_SPTE_GEN_LOW_START)
423
424 #define MMIO_SPTE_GEN_HIGH_START        PT64_SECOND_AVAIL_BITS_SHIFT
425 #define MMIO_SPTE_GEN_HIGH_END          62
426 #define MMIO_SPTE_GEN_HIGH_MASK         GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
427                                                     MMIO_SPTE_GEN_HIGH_START)
428
429 static u64 generation_mmio_spte_mask(u64 gen)
430 {
431         u64 mask;
432
433         WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
434         BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
435
436         mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
437         mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
438         return mask;
439 }
440
441 static u64 get_mmio_spte_generation(u64 spte)
442 {
443         u64 gen;
444
445         gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
446         gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
447         return gen;
448 }
449
450 static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
451 {
452
453         u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
454         u64 mask = generation_mmio_spte_mask(gen);
455         u64 gpa = gfn << PAGE_SHIFT;
456
457         access &= shadow_mmio_access_mask;
458         mask |= shadow_mmio_value | access;
459         mask |= gpa | shadow_nonpresent_or_rsvd_mask;
460         mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
461                 << shadow_nonpresent_or_rsvd_mask_len;
462
463         return mask;
464 }
465
466 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
467                            unsigned int access)
468 {
469         u64 mask = make_mmio_spte(vcpu, gfn, access);
470         unsigned int gen = get_mmio_spte_generation(mask);
471
472         access = mask & ACC_ALL;
473
474         trace_mark_mmio_spte(sptep, gfn, access, gen);
475         mmu_spte_set(sptep, mask);
476 }
477
478 static gfn_t get_mmio_spte_gfn(u64 spte)
479 {
480         u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
481
482         gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
483                & shadow_nonpresent_or_rsvd_mask;
484
485         return gpa >> PAGE_SHIFT;
486 }
487
488 static unsigned get_mmio_spte_access(u64 spte)
489 {
490         return spte & shadow_mmio_access_mask;
491 }
492
493 static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
494                           kvm_pfn_t pfn, unsigned int access)
495 {
496         if (unlikely(is_noslot_pfn(pfn))) {
497                 mark_mmio_spte(vcpu, sptep, gfn, access);
498                 return true;
499         }
500
501         return false;
502 }
503
504 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
505 {
506         u64 kvm_gen, spte_gen, gen;
507
508         gen = kvm_vcpu_memslots(vcpu)->generation;
509         if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
510                 return false;
511
512         kvm_gen = gen & MMIO_SPTE_GEN_MASK;
513         spte_gen = get_mmio_spte_generation(spte);
514
515         trace_check_mmio_spte(spte, kvm_gen, spte_gen);
516         return likely(kvm_gen == spte_gen);
517 }
518
519 /*
520  * Sets the shadow PTE masks used by the MMU.
521  *
522  * Assumptions:
523  *  - Setting either @accessed_mask or @dirty_mask requires setting both
524  *  - At least one of @accessed_mask or @acc_track_mask must be set
525  */
526 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
527                 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
528                 u64 acc_track_mask, u64 me_mask)
529 {
530         BUG_ON(!dirty_mask != !accessed_mask);
531         BUG_ON(!accessed_mask && !acc_track_mask);
532         BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
533
534         shadow_user_mask = user_mask;
535         shadow_accessed_mask = accessed_mask;
536         shadow_dirty_mask = dirty_mask;
537         shadow_nx_mask = nx_mask;
538         shadow_x_mask = x_mask;
539         shadow_present_mask = p_mask;
540         shadow_acc_track_mask = acc_track_mask;
541         shadow_me_mask = me_mask;
542 }
543 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
544
545 static u8 kvm_get_shadow_phys_bits(void)
546 {
547         /*
548          * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
549          * in CPU detection code, but the processor treats those reduced bits as
550          * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
551          * the physical address bits reported by CPUID.
552          */
553         if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
554                 return cpuid_eax(0x80000008) & 0xff;
555
556         /*
557          * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
558          * custom CPUID.  Proceed with whatever the kernel found since these features
559          * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
560          */
561         return boot_cpu_data.x86_phys_bits;
562 }
563
564 static void kvm_mmu_reset_all_pte_masks(void)
565 {
566         u8 low_phys_bits;
567
568         shadow_user_mask = 0;
569         shadow_accessed_mask = 0;
570         shadow_dirty_mask = 0;
571         shadow_nx_mask = 0;
572         shadow_x_mask = 0;
573         shadow_present_mask = 0;
574         shadow_acc_track_mask = 0;
575
576         shadow_phys_bits = kvm_get_shadow_phys_bits();
577
578         /*
579          * If the CPU has 46 or less physical address bits, then set an
580          * appropriate mask to guard against L1TF attacks. Otherwise, it is
581          * assumed that the CPU is not vulnerable to L1TF.
582          *
583          * Some Intel CPUs address the L1 cache using more PA bits than are
584          * reported by CPUID. Use the PA width of the L1 cache when possible
585          * to achieve more effective mitigation, e.g. if system RAM overlaps
586          * the most significant bits of legal physical address space.
587          */
588         shadow_nonpresent_or_rsvd_mask = 0;
589         low_phys_bits = boot_cpu_data.x86_phys_bits;
590         if (boot_cpu_has_bug(X86_BUG_L1TF) &&
591             !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
592                           52 - shadow_nonpresent_or_rsvd_mask_len)) {
593                 low_phys_bits = boot_cpu_data.x86_cache_bits
594                         - shadow_nonpresent_or_rsvd_mask_len;
595                 shadow_nonpresent_or_rsvd_mask =
596                         rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
597         }
598
599         shadow_nonpresent_or_rsvd_lower_gfn_mask =
600                 GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
601 }
602
603 static int is_cpuid_PSE36(void)
604 {
605         return 1;
606 }
607
608 static int is_nx(struct kvm_vcpu *vcpu)
609 {
610         return vcpu->arch.efer & EFER_NX;
611 }
612
613 static int is_shadow_present_pte(u64 pte)
614 {
615         return (pte != 0) && !is_mmio_spte(pte);
616 }
617
618 static int is_large_pte(u64 pte)
619 {
620         return pte & PT_PAGE_SIZE_MASK;
621 }
622
623 static int is_last_spte(u64 pte, int level)
624 {
625         if (level == PG_LEVEL_4K)
626                 return 1;
627         if (is_large_pte(pte))
628                 return 1;
629         return 0;
630 }
631
632 static bool is_executable_pte(u64 spte)
633 {
634         return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
635 }
636
637 static kvm_pfn_t spte_to_pfn(u64 pte)
638 {
639         return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
640 }
641
642 static gfn_t pse36_gfn_delta(u32 gpte)
643 {
644         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
645
646         return (gpte & PT32_DIR_PSE36_MASK) << shift;
647 }
648
649 #ifdef CONFIG_X86_64
650 static void __set_spte(u64 *sptep, u64 spte)
651 {
652         WRITE_ONCE(*sptep, spte);
653 }
654
655 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
656 {
657         WRITE_ONCE(*sptep, spte);
658 }
659
660 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
661 {
662         return xchg(sptep, spte);
663 }
664
665 static u64 __get_spte_lockless(u64 *sptep)
666 {
667         return READ_ONCE(*sptep);
668 }
669 #else
670 union split_spte {
671         struct {
672                 u32 spte_low;
673                 u32 spte_high;
674         };
675         u64 spte;
676 };
677
678 static void count_spte_clear(u64 *sptep, u64 spte)
679 {
680         struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
681
682         if (is_shadow_present_pte(spte))
683                 return;
684
685         /* Ensure the spte is completely set before we increase the count */
686         smp_wmb();
687         sp->clear_spte_count++;
688 }
689
690 static void __set_spte(u64 *sptep, u64 spte)
691 {
692         union split_spte *ssptep, sspte;
693
694         ssptep = (union split_spte *)sptep;
695         sspte = (union split_spte)spte;
696
697         ssptep->spte_high = sspte.spte_high;
698
699         /*
700          * If we map the spte from nonpresent to present, We should store
701          * the high bits firstly, then set present bit, so cpu can not
702          * fetch this spte while we are setting the spte.
703          */
704         smp_wmb();
705
706         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
707 }
708
709 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
710 {
711         union split_spte *ssptep, sspte;
712
713         ssptep = (union split_spte *)sptep;
714         sspte = (union split_spte)spte;
715
716         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
717
718         /*
719          * If we map the spte from present to nonpresent, we should clear
720          * present bit firstly to avoid vcpu fetch the old high bits.
721          */
722         smp_wmb();
723
724         ssptep->spte_high = sspte.spte_high;
725         count_spte_clear(sptep, spte);
726 }
727
728 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
729 {
730         union split_spte *ssptep, sspte, orig;
731
732         ssptep = (union split_spte *)sptep;
733         sspte = (union split_spte)spte;
734
735         /* xchg acts as a barrier before the setting of the high bits */
736         orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
737         orig.spte_high = ssptep->spte_high;
738         ssptep->spte_high = sspte.spte_high;
739         count_spte_clear(sptep, spte);
740
741         return orig.spte;
742 }
743
744 /*
745  * The idea using the light way get the spte on x86_32 guest is from
746  * gup_get_pte (mm/gup.c).
747  *
748  * An spte tlb flush may be pending, because kvm_set_pte_rmapp
749  * coalesces them and we are running out of the MMU lock.  Therefore
750  * we need to protect against in-progress updates of the spte.
751  *
752  * Reading the spte while an update is in progress may get the old value
753  * for the high part of the spte.  The race is fine for a present->non-present
754  * change (because the high part of the spte is ignored for non-present spte),
755  * but for a present->present change we must reread the spte.
756  *
757  * All such changes are done in two steps (present->non-present and
758  * non-present->present), hence it is enough to count the number of
759  * present->non-present updates: if it changed while reading the spte,
760  * we might have hit the race.  This is done using clear_spte_count.
761  */
762 static u64 __get_spte_lockless(u64 *sptep)
763 {
764         struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
765         union split_spte spte, *orig = (union split_spte *)sptep;
766         int count;
767
768 retry:
769         count = sp->clear_spte_count;
770         smp_rmb();
771
772         spte.spte_low = orig->spte_low;
773         smp_rmb();
774
775         spte.spte_high = orig->spte_high;
776         smp_rmb();
777
778         if (unlikely(spte.spte_low != orig->spte_low ||
779               count != sp->clear_spte_count))
780                 goto retry;
781
782         return spte.spte;
783 }
784 #endif
785
786 static bool spte_can_locklessly_be_made_writable(u64 spte)
787 {
788         return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
789                 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
790 }
791
792 static bool spte_has_volatile_bits(u64 spte)
793 {
794         if (!is_shadow_present_pte(spte))
795                 return false;
796
797         /*
798          * Always atomically update spte if it can be updated
799          * out of mmu-lock, it can ensure dirty bit is not lost,
800          * also, it can help us to get a stable is_writable_pte()
801          * to ensure tlb flush is not missed.
802          */
803         if (spte_can_locklessly_be_made_writable(spte) ||
804             is_access_track_spte(spte))
805                 return true;
806
807         if (spte_ad_enabled(spte)) {
808                 if ((spte & shadow_accessed_mask) == 0 ||
809                     (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
810                         return true;
811         }
812
813         return false;
814 }
815
816 static bool is_accessed_spte(u64 spte)
817 {
818         u64 accessed_mask = spte_shadow_accessed_mask(spte);
819
820         return accessed_mask ? spte & accessed_mask
821                              : !is_access_track_spte(spte);
822 }
823
824 static bool is_dirty_spte(u64 spte)
825 {
826         u64 dirty_mask = spte_shadow_dirty_mask(spte);
827
828         return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
829 }
830
831 /* Rules for using mmu_spte_set:
832  * Set the sptep from nonpresent to present.
833  * Note: the sptep being assigned *must* be either not present
834  * or in a state where the hardware will not attempt to update
835  * the spte.
836  */
837 static void mmu_spte_set(u64 *sptep, u64 new_spte)
838 {
839         WARN_ON(is_shadow_present_pte(*sptep));
840         __set_spte(sptep, new_spte);
841 }
842
843 /*
844  * Update the SPTE (excluding the PFN), but do not track changes in its
845  * accessed/dirty status.
846  */
847 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
848 {
849         u64 old_spte = *sptep;
850
851         WARN_ON(!is_shadow_present_pte(new_spte));
852
853         if (!is_shadow_present_pte(old_spte)) {
854                 mmu_spte_set(sptep, new_spte);
855                 return old_spte;
856         }
857
858         if (!spte_has_volatile_bits(old_spte))
859                 __update_clear_spte_fast(sptep, new_spte);
860         else
861                 old_spte = __update_clear_spte_slow(sptep, new_spte);
862
863         WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
864
865         return old_spte;
866 }
867
868 /* Rules for using mmu_spte_update:
869  * Update the state bits, it means the mapped pfn is not changed.
870  *
871  * Whenever we overwrite a writable spte with a read-only one we
872  * should flush remote TLBs. Otherwise rmap_write_protect
873  * will find a read-only spte, even though the writable spte
874  * might be cached on a CPU's TLB, the return value indicates this
875  * case.
876  *
877  * Returns true if the TLB needs to be flushed
878  */
879 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
880 {
881         bool flush = false;
882         u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
883
884         if (!is_shadow_present_pte(old_spte))
885                 return false;
886
887         /*
888          * For the spte updated out of mmu-lock is safe, since
889          * we always atomically update it, see the comments in
890          * spte_has_volatile_bits().
891          */
892         if (spte_can_locklessly_be_made_writable(old_spte) &&
893               !is_writable_pte(new_spte))
894                 flush = true;
895
896         /*
897          * Flush TLB when accessed/dirty states are changed in the page tables,
898          * to guarantee consistency between TLB and page tables.
899          */
900
901         if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
902                 flush = true;
903                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
904         }
905
906         if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
907                 flush = true;
908                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
909         }
910
911         return flush;
912 }
913
914 /*
915  * Rules for using mmu_spte_clear_track_bits:
916  * It sets the sptep from present to nonpresent, and track the
917  * state bits, it is used to clear the last level sptep.
918  * Returns non-zero if the PTE was previously valid.
919  */
920 static int mmu_spte_clear_track_bits(u64 *sptep)
921 {
922         kvm_pfn_t pfn;
923         u64 old_spte = *sptep;
924
925         if (!spte_has_volatile_bits(old_spte))
926                 __update_clear_spte_fast(sptep, 0ull);
927         else
928                 old_spte = __update_clear_spte_slow(sptep, 0ull);
929
930         if (!is_shadow_present_pte(old_spte))
931                 return 0;
932
933         pfn = spte_to_pfn(old_spte);
934
935         /*
936          * KVM does not hold the refcount of the page used by
937          * kvm mmu, before reclaiming the page, we should
938          * unmap it from mmu first.
939          */
940         WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
941
942         if (is_accessed_spte(old_spte))
943                 kvm_set_pfn_accessed(pfn);
944
945         if (is_dirty_spte(old_spte))
946                 kvm_set_pfn_dirty(pfn);
947
948         return 1;
949 }
950
951 /*
952  * Rules for using mmu_spte_clear_no_track:
953  * Directly clear spte without caring the state bits of sptep,
954  * it is used to set the upper level spte.
955  */
956 static void mmu_spte_clear_no_track(u64 *sptep)
957 {
958         __update_clear_spte_fast(sptep, 0ull);
959 }
960
961 static u64 mmu_spte_get_lockless(u64 *sptep)
962 {
963         return __get_spte_lockless(sptep);
964 }
965
966 static u64 mark_spte_for_access_track(u64 spte)
967 {
968         if (spte_ad_enabled(spte))
969                 return spte & ~shadow_accessed_mask;
970
971         if (is_access_track_spte(spte))
972                 return spte;
973
974         /*
975          * Making an Access Tracking PTE will result in removal of write access
976          * from the PTE. So, verify that we will be able to restore the write
977          * access in the fast page fault path later on.
978          */
979         WARN_ONCE((spte & PT_WRITABLE_MASK) &&
980                   !spte_can_locklessly_be_made_writable(spte),
981                   "kvm: Writable SPTE is not locklessly dirty-trackable\n");
982
983         WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
984                           shadow_acc_track_saved_bits_shift),
985                   "kvm: Access Tracking saved bit locations are not zero\n");
986
987         spte |= (spte & shadow_acc_track_saved_bits_mask) <<
988                 shadow_acc_track_saved_bits_shift;
989         spte &= ~shadow_acc_track_mask;
990
991         return spte;
992 }
993
994 /* Restore an acc-track PTE back to a regular PTE */
995 static u64 restore_acc_track_spte(u64 spte)
996 {
997         u64 new_spte = spte;
998         u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
999                          & shadow_acc_track_saved_bits_mask;
1000
1001         WARN_ON_ONCE(spte_ad_enabled(spte));
1002         WARN_ON_ONCE(!is_access_track_spte(spte));
1003
1004         new_spte &= ~shadow_acc_track_mask;
1005         new_spte &= ~(shadow_acc_track_saved_bits_mask <<
1006                       shadow_acc_track_saved_bits_shift);
1007         new_spte |= saved_bits;
1008
1009         return new_spte;
1010 }
1011
1012 /* Returns the Accessed status of the PTE and resets it at the same time. */
1013 static bool mmu_spte_age(u64 *sptep)
1014 {
1015         u64 spte = mmu_spte_get_lockless(sptep);
1016
1017         if (!is_accessed_spte(spte))
1018                 return false;
1019
1020         if (spte_ad_enabled(spte)) {
1021                 clear_bit((ffs(shadow_accessed_mask) - 1),
1022                           (unsigned long *)sptep);
1023         } else {
1024                 /*
1025                  * Capture the dirty status of the page, so that it doesn't get
1026                  * lost when the SPTE is marked for access tracking.
1027                  */
1028                 if (is_writable_pte(spte))
1029                         kvm_set_pfn_dirty(spte_to_pfn(spte));
1030
1031                 spte = mark_spte_for_access_track(spte);
1032                 mmu_spte_update_no_track(sptep, spte);
1033         }
1034
1035         return true;
1036 }
1037
1038 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
1039 {
1040         /*
1041          * Prevent page table teardown by making any free-er wait during
1042          * kvm_flush_remote_tlbs() IPI to all active vcpus.
1043          */
1044         local_irq_disable();
1045
1046         /*
1047          * Make sure a following spte read is not reordered ahead of the write
1048          * to vcpu->mode.
1049          */
1050         smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
1051 }
1052
1053 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
1054 {
1055         /*
1056          * Make sure the write to vcpu->mode is not reordered in front of
1057          * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
1058          * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
1059          */
1060         smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
1061         local_irq_enable();
1062 }
1063
1064 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
1065                                   struct kmem_cache *base_cache, int min)
1066 {
1067         void *obj;
1068
1069         if (cache->nobjs >= min)
1070                 return 0;
1071         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1072                 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
1073                 if (!obj)
1074                         return cache->nobjs >= min ? 0 : -ENOMEM;
1075                 cache->objects[cache->nobjs++] = obj;
1076         }
1077         return 0;
1078 }
1079
1080 static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
1081 {
1082         return cache->nobjs;
1083 }
1084
1085 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
1086                                   struct kmem_cache *cache)
1087 {
1088         while (mc->nobjs)
1089                 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
1090 }
1091
1092 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
1093                                        int min)
1094 {
1095         void *page;
1096
1097         if (cache->nobjs >= min)
1098                 return 0;
1099         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1100                 page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
1101                 if (!page)
1102                         return cache->nobjs >= min ? 0 : -ENOMEM;
1103                 cache->objects[cache->nobjs++] = page;
1104         }
1105         return 0;
1106 }
1107
1108 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
1109 {
1110         while (mc->nobjs)
1111                 free_page((unsigned long)mc->objects[--mc->nobjs]);
1112 }
1113
1114 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
1115 {
1116         int r;
1117
1118         r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1119                                    pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
1120         if (r)
1121                 goto out;
1122         r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
1123         if (r)
1124                 goto out;
1125         r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
1126                                    mmu_page_header_cache, 4);
1127 out:
1128         return r;
1129 }
1130
1131 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1132 {
1133         mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1134                                 pte_list_desc_cache);
1135         mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
1136         mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
1137                                 mmu_page_header_cache);
1138 }
1139
1140 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
1141 {
1142         void *p;
1143
1144         BUG_ON(!mc->nobjs);
1145         p = mc->objects[--mc->nobjs];
1146         return p;
1147 }
1148
1149 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
1150 {
1151         return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
1152 }
1153
1154 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
1155 {
1156         kmem_cache_free(pte_list_desc_cache, pte_list_desc);
1157 }
1158
1159 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
1160 {
1161         if (!sp->role.direct)
1162                 return sp->gfns[index];
1163
1164         return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
1165 }
1166
1167 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
1168 {
1169         if (!sp->role.direct) {
1170                 sp->gfns[index] = gfn;
1171                 return;
1172         }
1173
1174         if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
1175                 pr_err_ratelimited("gfn mismatch under direct page %llx "
1176                                    "(expected %llx, got %llx)\n",
1177                                    sp->gfn,
1178                                    kvm_mmu_page_get_gfn(sp, index), gfn);
1179 }
1180
1181 /*
1182  * Return the pointer to the large page information for a given gfn,
1183  * handling slots that are not large page aligned.
1184  */
1185 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
1186                                               struct kvm_memory_slot *slot,
1187                                               int level)
1188 {
1189         unsigned long idx;
1190
1191         idx = gfn_to_index(gfn, slot->base_gfn, level);
1192         return &slot->arch.lpage_info[level - 2][idx];
1193 }
1194
1195 static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
1196                                             gfn_t gfn, int count)
1197 {
1198         struct kvm_lpage_info *linfo;
1199         int i;
1200
1201         for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
1202                 linfo = lpage_info_slot(gfn, slot, i);
1203                 linfo->disallow_lpage += count;
1204                 WARN_ON(linfo->disallow_lpage < 0);
1205         }
1206 }
1207
1208 void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1209 {
1210         update_gfn_disallow_lpage_count(slot, gfn, 1);
1211 }
1212
1213 void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1214 {
1215         update_gfn_disallow_lpage_count(slot, gfn, -1);
1216 }
1217
1218 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1219 {
1220         struct kvm_memslots *slots;
1221         struct kvm_memory_slot *slot;
1222         gfn_t gfn;
1223
1224         kvm->arch.indirect_shadow_pages++;
1225         gfn = sp->gfn;
1226         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1227         slot = __gfn_to_memslot(slots, gfn);
1228
1229         /* the non-leaf shadow pages are keeping readonly. */
1230         if (sp->role.level > PG_LEVEL_4K)
1231                 return kvm_slot_page_track_add_page(kvm, slot, gfn,
1232                                                     KVM_PAGE_TRACK_WRITE);
1233
1234         kvm_mmu_gfn_disallow_lpage(slot, gfn);
1235 }
1236
1237 static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1238 {
1239         if (sp->lpage_disallowed)
1240                 return;
1241
1242         ++kvm->stat.nx_lpage_splits;
1243         list_add_tail(&sp->lpage_disallowed_link,
1244                       &kvm->arch.lpage_disallowed_mmu_pages);
1245         sp->lpage_disallowed = true;
1246 }
1247
1248 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1249 {
1250         struct kvm_memslots *slots;
1251         struct kvm_memory_slot *slot;
1252         gfn_t gfn;
1253
1254         kvm->arch.indirect_shadow_pages--;
1255         gfn = sp->gfn;
1256         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1257         slot = __gfn_to_memslot(slots, gfn);
1258         if (sp->role.level > PG_LEVEL_4K)
1259                 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
1260                                                        KVM_PAGE_TRACK_WRITE);
1261
1262         kvm_mmu_gfn_allow_lpage(slot, gfn);
1263 }
1264
1265 static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1266 {
1267         --kvm->stat.nx_lpage_splits;
1268         sp->lpage_disallowed = false;
1269         list_del(&sp->lpage_disallowed_link);
1270 }
1271
1272 static struct kvm_memory_slot *
1273 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
1274                             bool no_dirty_log)
1275 {
1276         struct kvm_memory_slot *slot;
1277
1278         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1279         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1280                 return NULL;
1281         if (no_dirty_log && slot->dirty_bitmap)
1282                 return NULL;
1283
1284         return slot;
1285 }
1286
1287 /*
1288  * About rmap_head encoding:
1289  *
1290  * If the bit zero of rmap_head->val is clear, then it points to the only spte
1291  * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
1292  * pte_list_desc containing more mappings.
1293  */
1294
1295 /*
1296  * Returns the number of pointers in the rmap chain, not counting the new one.
1297  */
1298 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
1299                         struct kvm_rmap_head *rmap_head)
1300 {
1301         struct pte_list_desc *desc;
1302         int i, count = 0;
1303
1304         if (!rmap_head->val) {
1305                 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
1306                 rmap_head->val = (unsigned long)spte;
1307         } else if (!(rmap_head->val & 1)) {
1308                 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
1309                 desc = mmu_alloc_pte_list_desc(vcpu);
1310                 desc->sptes[0] = (u64 *)rmap_head->val;
1311                 desc->sptes[1] = spte;
1312                 rmap_head->val = (unsigned long)desc | 1;
1313                 ++count;
1314         } else {
1315                 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
1316                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1317                 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
1318                         desc = desc->more;
1319                         count += PTE_LIST_EXT;
1320                 }
1321                 if (desc->sptes[PTE_LIST_EXT-1]) {
1322                         desc->more = mmu_alloc_pte_list_desc(vcpu);
1323                         desc = desc->more;
1324                 }
1325                 for (i = 0; desc->sptes[i]; ++i)
1326                         ++count;
1327                 desc->sptes[i] = spte;
1328         }
1329         return count;
1330 }
1331
1332 static void
1333 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
1334                            struct pte_list_desc *desc, int i,
1335                            struct pte_list_desc *prev_desc)
1336 {
1337         int j;
1338
1339         for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
1340                 ;
1341         desc->sptes[i] = desc->sptes[j];
1342         desc->sptes[j] = NULL;
1343         if (j != 0)
1344                 return;
1345         if (!prev_desc && !desc->more)
1346                 rmap_head->val = 0;
1347         else
1348                 if (prev_desc)
1349                         prev_desc->more = desc->more;
1350                 else
1351                         rmap_head->val = (unsigned long)desc->more | 1;
1352         mmu_free_pte_list_desc(desc);
1353 }
1354
1355 static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
1356 {
1357         struct pte_list_desc *desc;
1358         struct pte_list_desc *prev_desc;
1359         int i;
1360
1361         if (!rmap_head->val) {
1362                 pr_err("%s: %p 0->BUG\n", __func__, spte);
1363                 BUG();
1364         } else if (!(rmap_head->val & 1)) {
1365                 rmap_printk("%s:  %p 1->0\n", __func__, spte);
1366                 if ((u64 *)rmap_head->val != spte) {
1367                         pr_err("%s:  %p 1->BUG\n", __func__, spte);
1368                         BUG();
1369                 }
1370                 rmap_head->val = 0;
1371         } else {
1372                 rmap_printk("%s:  %p many->many\n", __func__, spte);
1373                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1374                 prev_desc = NULL;
1375                 while (desc) {
1376                         for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
1377                                 if (desc->sptes[i] == spte) {
1378                                         pte_list_desc_remove_entry(rmap_head,
1379                                                         desc, i, prev_desc);
1380                                         return;
1381                                 }
1382                         }
1383                         prev_desc = desc;
1384                         desc = desc->more;
1385                 }
1386                 pr_err("%s: %p many->many\n", __func__, spte);
1387                 BUG();
1388         }
1389 }
1390
1391 static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
1392 {
1393         mmu_spte_clear_track_bits(sptep);
1394         __pte_list_remove(sptep, rmap_head);
1395 }
1396
1397 static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
1398                                            struct kvm_memory_slot *slot)
1399 {
1400         unsigned long idx;
1401
1402         idx = gfn_to_index(gfn, slot->base_gfn, level);
1403         return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
1404 }
1405
1406 static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
1407                                          struct kvm_mmu_page *sp)
1408 {
1409         struct kvm_memslots *slots;
1410         struct kvm_memory_slot *slot;
1411
1412         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1413         slot = __gfn_to_memslot(slots, gfn);
1414         return __gfn_to_rmap(gfn, sp->role.level, slot);
1415 }
1416
1417 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1418 {
1419         struct kvm_mmu_memory_cache *cache;
1420
1421         cache = &vcpu->arch.mmu_pte_list_desc_cache;
1422         return mmu_memory_cache_free_objects(cache);
1423 }
1424
1425 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1426 {
1427         struct kvm_mmu_page *sp;
1428         struct kvm_rmap_head *rmap_head;
1429
1430         sp = sptep_to_sp(spte);
1431         kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1432         rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1433         return pte_list_add(vcpu, spte, rmap_head);
1434 }
1435
1436 static void rmap_remove(struct kvm *kvm, u64 *spte)
1437 {
1438         struct kvm_mmu_page *sp;
1439         gfn_t gfn;
1440         struct kvm_rmap_head *rmap_head;
1441
1442         sp = sptep_to_sp(spte);
1443         gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1444         rmap_head = gfn_to_rmap(kvm, gfn, sp);
1445         __pte_list_remove(spte, rmap_head);
1446 }
1447
1448 /*
1449  * Used by the following functions to iterate through the sptes linked by a
1450  * rmap.  All fields are private and not assumed to be used outside.
1451  */
1452 struct rmap_iterator {
1453         /* private fields */
1454         struct pte_list_desc *desc;     /* holds the sptep if not NULL */
1455         int pos;                        /* index of the sptep */
1456 };
1457
1458 /*
1459  * Iteration must be started by this function.  This should also be used after
1460  * removing/dropping sptes from the rmap link because in such cases the
1461  * information in the iterator may not be valid.
1462  *
1463  * Returns sptep if found, NULL otherwise.
1464  */
1465 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1466                            struct rmap_iterator *iter)
1467 {
1468         u64 *sptep;
1469
1470         if (!rmap_head->val)
1471                 return NULL;
1472
1473         if (!(rmap_head->val & 1)) {
1474                 iter->desc = NULL;
1475                 sptep = (u64 *)rmap_head->val;
1476                 goto out;
1477         }
1478
1479         iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1480         iter->pos = 0;
1481         sptep = iter->desc->sptes[iter->pos];
1482 out:
1483         BUG_ON(!is_shadow_present_pte(*sptep));
1484         return sptep;
1485 }
1486
1487 /*
1488  * Must be used with a valid iterator: e.g. after rmap_get_first().
1489  *
1490  * Returns sptep if found, NULL otherwise.
1491  */
1492 static u64 *rmap_get_next(struct rmap_iterator *iter)
1493 {
1494         u64 *sptep;
1495
1496         if (iter->desc) {
1497                 if (iter->pos < PTE_LIST_EXT - 1) {
1498                         ++iter->pos;
1499                         sptep = iter->desc->sptes[iter->pos];
1500                         if (sptep)
1501                                 goto out;
1502                 }
1503
1504                 iter->desc = iter->desc->more;
1505
1506                 if (iter->desc) {
1507                         iter->pos = 0;
1508                         /* desc->sptes[0] cannot be NULL */
1509                         sptep = iter->desc->sptes[iter->pos];
1510                         goto out;
1511                 }
1512         }
1513
1514         return NULL;
1515 out:
1516         BUG_ON(!is_shadow_present_pte(*sptep));
1517         return sptep;
1518 }
1519
1520 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)                 \
1521         for (_spte_ = rmap_get_first(_rmap_head_, _iter_);              \
1522              _spte_; _spte_ = rmap_get_next(_iter_))
1523
1524 static void drop_spte(struct kvm *kvm, u64 *sptep)
1525 {
1526         if (mmu_spte_clear_track_bits(sptep))
1527                 rmap_remove(kvm, sptep);
1528 }
1529
1530
1531 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1532 {
1533         if (is_large_pte(*sptep)) {
1534                 WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
1535                 drop_spte(kvm, sptep);
1536                 --kvm->stat.lpages;
1537                 return true;
1538         }
1539
1540         return false;
1541 }
1542
1543 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1544 {
1545         if (__drop_large_spte(vcpu->kvm, sptep)) {
1546                 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
1547
1548                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1549                         KVM_PAGES_PER_HPAGE(sp->role.level));
1550         }
1551 }
1552
1553 /*
1554  * Write-protect on the specified @sptep, @pt_protect indicates whether
1555  * spte write-protection is caused by protecting shadow page table.
1556  *
1557  * Note: write protection is difference between dirty logging and spte
1558  * protection:
1559  * - for dirty logging, the spte can be set to writable at anytime if
1560  *   its dirty bitmap is properly set.
1561  * - for spte protection, the spte can be writable only after unsync-ing
1562  *   shadow page.
1563  *
1564  * Return true if tlb need be flushed.
1565  */
1566 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1567 {
1568         u64 spte = *sptep;
1569
1570         if (!is_writable_pte(spte) &&
1571               !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1572                 return false;
1573
1574         rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1575
1576         if (pt_protect)
1577                 spte &= ~SPTE_MMU_WRITEABLE;
1578         spte = spte & ~PT_WRITABLE_MASK;
1579
1580         return mmu_spte_update(sptep, spte);
1581 }
1582
1583 static bool __rmap_write_protect(struct kvm *kvm,
1584                                  struct kvm_rmap_head *rmap_head,
1585                                  bool pt_protect)
1586 {
1587         u64 *sptep;
1588         struct rmap_iterator iter;
1589         bool flush = false;
1590
1591         for_each_rmap_spte(rmap_head, &iter, sptep)
1592                 flush |= spte_write_protect(sptep, pt_protect);
1593
1594         return flush;
1595 }
1596
1597 static bool spte_clear_dirty(u64 *sptep)
1598 {
1599         u64 spte = *sptep;
1600
1601         rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1602
1603         MMU_WARN_ON(!spte_ad_enabled(spte));
1604         spte &= ~shadow_dirty_mask;
1605         return mmu_spte_update(sptep, spte);
1606 }
1607
1608 static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1609 {
1610         bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1611                                                (unsigned long *)sptep);
1612         if (was_writable && !spte_ad_enabled(*sptep))
1613                 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1614
1615         return was_writable;
1616 }
1617
1618 /*
1619  * Gets the GFN ready for another round of dirty logging by clearing the
1620  *      - D bit on ad-enabled SPTEs, and
1621  *      - W bit on ad-disabled SPTEs.
1622  * Returns true iff any D or W bits were cleared.
1623  */
1624 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1625 {
1626         u64 *sptep;
1627         struct rmap_iterator iter;
1628         bool flush = false;
1629
1630         for_each_rmap_spte(rmap_head, &iter, sptep)
1631                 if (spte_ad_need_write_protect(*sptep))
1632                         flush |= spte_wrprot_for_clear_dirty(sptep);
1633                 else
1634                         flush |= spte_clear_dirty(sptep);
1635
1636         return flush;
1637 }
1638
1639 static bool spte_set_dirty(u64 *sptep)
1640 {
1641         u64 spte = *sptep;
1642
1643         rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1644
1645         /*
1646          * Similar to the !kvm_x86_ops.slot_disable_log_dirty case,
1647          * do not bother adding back write access to pages marked
1648          * SPTE_AD_WRPROT_ONLY_MASK.
1649          */
1650         spte |= shadow_dirty_mask;
1651
1652         return mmu_spte_update(sptep, spte);
1653 }
1654
1655 static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1656 {
1657         u64 *sptep;
1658         struct rmap_iterator iter;
1659         bool flush = false;
1660
1661         for_each_rmap_spte(rmap_head, &iter, sptep)
1662                 if (spte_ad_enabled(*sptep))
1663                         flush |= spte_set_dirty(sptep);
1664
1665         return flush;
1666 }
1667
1668 /**
1669  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1670  * @kvm: kvm instance
1671  * @slot: slot to protect
1672  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1673  * @mask: indicates which pages we should protect
1674  *
1675  * Used when we do not need to care about huge page mappings: e.g. during dirty
1676  * logging we do not have any such mappings.
1677  */
1678 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1679                                      struct kvm_memory_slot *slot,
1680                                      gfn_t gfn_offset, unsigned long mask)
1681 {
1682         struct kvm_rmap_head *rmap_head;
1683
1684         while (mask) {
1685                 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1686                                           PG_LEVEL_4K, slot);
1687                 __rmap_write_protect(kvm, rmap_head, false);
1688
1689                 /* clear the first set bit */
1690                 mask &= mask - 1;
1691         }
1692 }
1693
1694 /**
1695  * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1696  * protect the page if the D-bit isn't supported.
1697  * @kvm: kvm instance
1698  * @slot: slot to clear D-bit
1699  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1700  * @mask: indicates which pages we should clear D-bit
1701  *
1702  * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1703  */
1704 void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1705                                      struct kvm_memory_slot *slot,
1706                                      gfn_t gfn_offset, unsigned long mask)
1707 {
1708         struct kvm_rmap_head *rmap_head;
1709
1710         while (mask) {
1711                 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1712                                           PG_LEVEL_4K, slot);
1713                 __rmap_clear_dirty(kvm, rmap_head);
1714
1715                 /* clear the first set bit */
1716                 mask &= mask - 1;
1717         }
1718 }
1719 EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1720
1721 /**
1722  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1723  * PT level pages.
1724  *
1725  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1726  * enable dirty logging for them.
1727  *
1728  * Used when we do not need to care about huge page mappings: e.g. during dirty
1729  * logging we do not have any such mappings.
1730  */
1731 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1732                                 struct kvm_memory_slot *slot,
1733                                 gfn_t gfn_offset, unsigned long mask)
1734 {
1735         if (kvm_x86_ops.enable_log_dirty_pt_masked)
1736                 kvm_x86_ops.enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1737                                 mask);
1738         else
1739                 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1740 }
1741
1742 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1743                                     struct kvm_memory_slot *slot, u64 gfn)
1744 {
1745         struct kvm_rmap_head *rmap_head;
1746         int i;
1747         bool write_protected = false;
1748
1749         for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
1750                 rmap_head = __gfn_to_rmap(gfn, i, slot);
1751                 write_protected |= __rmap_write_protect(kvm, rmap_head, true);
1752         }
1753
1754         return write_protected;
1755 }
1756
1757 static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
1758 {
1759         struct kvm_memory_slot *slot;
1760
1761         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1762         return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
1763 }
1764
1765 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1766 {
1767         u64 *sptep;
1768         struct rmap_iterator iter;
1769         bool flush = false;
1770
1771         while ((sptep = rmap_get_first(rmap_head, &iter))) {
1772                 rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
1773
1774                 pte_list_remove(rmap_head, sptep);
1775                 flush = true;
1776         }
1777
1778         return flush;
1779 }
1780
1781 static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1782                            struct kvm_memory_slot *slot, gfn_t gfn, int level,
1783                            unsigned long data)
1784 {
1785         return kvm_zap_rmapp(kvm, rmap_head);
1786 }
1787
1788 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1789                              struct kvm_memory_slot *slot, gfn_t gfn, int level,
1790                              unsigned long data)
1791 {
1792         u64 *sptep;
1793         struct rmap_iterator iter;
1794         int need_flush = 0;
1795         u64 new_spte;
1796         pte_t *ptep = (pte_t *)data;
1797         kvm_pfn_t new_pfn;
1798
1799         WARN_ON(pte_huge(*ptep));
1800         new_pfn = pte_pfn(*ptep);
1801
1802 restart:
1803         for_each_rmap_spte(rmap_head, &iter, sptep) {
1804                 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
1805                             sptep, *sptep, gfn, level);
1806
1807                 need_flush = 1;
1808
1809                 if (pte_write(*ptep)) {
1810                         pte_list_remove(rmap_head, sptep);
1811                         goto restart;
1812                 } else {
1813                         new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1814                         new_spte |= (u64)new_pfn << PAGE_SHIFT;
1815
1816                         new_spte &= ~PT_WRITABLE_MASK;
1817                         new_spte &= ~SPTE_HOST_WRITEABLE;
1818
1819                         new_spte = mark_spte_for_access_track(new_spte);
1820
1821                         mmu_spte_clear_track_bits(sptep);
1822                         mmu_spte_set(sptep, new_spte);
1823                 }
1824         }
1825
1826         if (need_flush && kvm_available_flush_tlb_with_range()) {
1827                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1828                 return 0;
1829         }
1830
1831         return need_flush;
1832 }
1833
1834 struct slot_rmap_walk_iterator {
1835         /* input fields. */
1836         struct kvm_memory_slot *slot;
1837         gfn_t start_gfn;
1838         gfn_t end_gfn;
1839         int start_level;
1840         int end_level;
1841
1842         /* output fields. */
1843         gfn_t gfn;
1844         struct kvm_rmap_head *rmap;
1845         int level;
1846
1847         /* private field. */
1848         struct kvm_rmap_head *end_rmap;
1849 };
1850
1851 static void
1852 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1853 {
1854         iterator->level = level;
1855         iterator->gfn = iterator->start_gfn;
1856         iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
1857         iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
1858                                            iterator->slot);
1859 }
1860
1861 static void
1862 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1863                     struct kvm_memory_slot *slot, int start_level,
1864                     int end_level, gfn_t start_gfn, gfn_t end_gfn)
1865 {
1866         iterator->slot = slot;
1867         iterator->start_level = start_level;
1868         iterator->end_level = end_level;
1869         iterator->start_gfn = start_gfn;
1870         iterator->end_gfn = end_gfn;
1871
1872         rmap_walk_init_level(iterator, iterator->start_level);
1873 }
1874
1875 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1876 {
1877         return !!iterator->rmap;
1878 }
1879
1880 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1881 {
1882         if (++iterator->rmap <= iterator->end_rmap) {
1883                 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1884                 return;
1885         }
1886
1887         if (++iterator->level > iterator->end_level) {
1888                 iterator->rmap = NULL;
1889                 return;
1890         }
1891
1892         rmap_walk_init_level(iterator, iterator->level);
1893 }
1894
1895 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,    \
1896            _start_gfn, _end_gfn, _iter_)                                \
1897         for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,         \
1898                                  _end_level_, _start_gfn, _end_gfn);    \
1899              slot_rmap_walk_okay(_iter_);                               \
1900              slot_rmap_walk_next(_iter_))
1901
1902 static int kvm_handle_hva_range(struct kvm *kvm,
1903                                 unsigned long start,
1904                                 unsigned long end,
1905                                 unsigned long data,
1906                                 int (*handler)(struct kvm *kvm,
1907                                                struct kvm_rmap_head *rmap_head,
1908                                                struct kvm_memory_slot *slot,
1909                                                gfn_t gfn,
1910                                                int level,
1911                                                unsigned long data))
1912 {
1913         struct kvm_memslots *slots;
1914         struct kvm_memory_slot *memslot;
1915         struct slot_rmap_walk_iterator iterator;
1916         int ret = 0;
1917         int i;
1918
1919         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1920                 slots = __kvm_memslots(kvm, i);
1921                 kvm_for_each_memslot(memslot, slots) {
1922                         unsigned long hva_start, hva_end;
1923                         gfn_t gfn_start, gfn_end;
1924
1925                         hva_start = max(start, memslot->userspace_addr);
1926                         hva_end = min(end, memslot->userspace_addr +
1927                                       (memslot->npages << PAGE_SHIFT));
1928                         if (hva_start >= hva_end)
1929                                 continue;
1930                         /*
1931                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
1932                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1933                          */
1934                         gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1935                         gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1936
1937                         for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
1938                                                  KVM_MAX_HUGEPAGE_LEVEL,
1939                                                  gfn_start, gfn_end - 1,
1940                                                  &iterator)
1941                                 ret |= handler(kvm, iterator.rmap, memslot,
1942                                                iterator.gfn, iterator.level, data);
1943                 }
1944         }
1945
1946         return ret;
1947 }
1948
1949 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
1950                           unsigned long data,
1951                           int (*handler)(struct kvm *kvm,
1952                                          struct kvm_rmap_head *rmap_head,
1953                                          struct kvm_memory_slot *slot,
1954                                          gfn_t gfn, int level,
1955                                          unsigned long data))
1956 {
1957         return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
1958 }
1959
1960 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1961 {
1962         return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
1963 }
1964
1965 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1966 {
1967         return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
1968 }
1969
1970 static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1971                          struct kvm_memory_slot *slot, gfn_t gfn, int level,
1972                          unsigned long data)
1973 {
1974         u64 *sptep;
1975         struct rmap_iterator uninitialized_var(iter);
1976         int young = 0;
1977
1978         for_each_rmap_spte(rmap_head, &iter, sptep)
1979                 young |= mmu_spte_age(sptep);
1980
1981         trace_kvm_age_page(gfn, level, slot, young);
1982         return young;
1983 }
1984
1985 static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1986                               struct kvm_memory_slot *slot, gfn_t gfn,
1987                               int level, unsigned long data)
1988 {
1989         u64 *sptep;
1990         struct rmap_iterator iter;
1991
1992         for_each_rmap_spte(rmap_head, &iter, sptep)
1993                 if (is_accessed_spte(*sptep))
1994                         return 1;
1995         return 0;
1996 }
1997
1998 #define RMAP_RECYCLE_THRESHOLD 1000
1999
2000 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
2001 {
2002         struct kvm_rmap_head *rmap_head;
2003         struct kvm_mmu_page *sp;
2004
2005         sp = sptep_to_sp(spte);
2006
2007         rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
2008
2009         kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
2010         kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
2011                         KVM_PAGES_PER_HPAGE(sp->role.level));
2012 }
2013
2014 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2015 {
2016         return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
2017 }
2018
2019 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2020 {
2021         return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
2022 }
2023
2024 #ifdef MMU_DEBUG
2025 static int is_empty_shadow_page(u64 *spt)
2026 {
2027         u64 *pos;
2028         u64 *end;
2029
2030         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
2031                 if (is_shadow_present_pte(*pos)) {
2032                         printk(KERN_ERR "%s: %p %llx\n", __func__,
2033                                pos, *pos);
2034                         return 0;
2035                 }
2036         return 1;
2037 }
2038 #endif
2039
2040 /*
2041  * This value is the sum of all of the kvm instances's
2042  * kvm->arch.n_used_mmu_pages values.  We need a global,
2043  * aggregate version in order to make the slab shrinker
2044  * faster
2045  */
2046 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
2047 {
2048         kvm->arch.n_used_mmu_pages += nr;
2049         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
2050 }
2051
2052 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
2053 {
2054         MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
2055         hlist_del(&sp->hash_link);
2056         list_del(&sp->link);
2057         free_page((unsigned long)sp->spt);
2058         if (!sp->role.direct)
2059                 free_page((unsigned long)sp->gfns);
2060         kmem_cache_free(mmu_page_header_cache, sp);
2061 }
2062
2063 static unsigned kvm_page_table_hashfn(gfn_t gfn)
2064 {
2065         return hash_64(gfn, KVM_MMU_HASH_SHIFT);
2066 }
2067
2068 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
2069                                     struct kvm_mmu_page *sp, u64 *parent_pte)
2070 {
2071         if (!parent_pte)
2072                 return;
2073
2074         pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
2075 }
2076
2077 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
2078                                        u64 *parent_pte)
2079 {
2080         __pte_list_remove(parent_pte, &sp->parent_ptes);
2081 }
2082
2083 static void drop_parent_pte(struct kvm_mmu_page *sp,
2084                             u64 *parent_pte)
2085 {
2086         mmu_page_remove_parent_pte(sp, parent_pte);
2087         mmu_spte_clear_no_track(parent_pte);
2088 }
2089
2090 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
2091 {
2092         struct kvm_mmu_page *sp;
2093
2094         sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
2095         sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2096         if (!direct)
2097                 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2098         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2099
2100         /*
2101          * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
2102          * depends on valid pages being added to the head of the list.  See
2103          * comments in kvm_zap_obsolete_pages().
2104          */
2105         sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
2106         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
2107         kvm_mod_used_mmu_pages(vcpu->kvm, +1);
2108         return sp;
2109 }
2110
2111 static void mark_unsync(u64 *spte);
2112 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
2113 {
2114         u64 *sptep;
2115         struct rmap_iterator iter;
2116
2117         for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
2118                 mark_unsync(sptep);
2119         }
2120 }
2121
2122 static void mark_unsync(u64 *spte)
2123 {
2124         struct kvm_mmu_page *sp;
2125         unsigned int index;
2126
2127         sp = sptep_to_sp(spte);
2128         index = spte - sp->spt;
2129         if (__test_and_set_bit(index, sp->unsync_child_bitmap))
2130                 return;
2131         if (sp->unsync_children++)
2132                 return;
2133         kvm_mmu_mark_parents_unsync(sp);
2134 }
2135
2136 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
2137                                struct kvm_mmu_page *sp)
2138 {
2139         return 0;
2140 }
2141
2142 static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
2143                                  struct kvm_mmu_page *sp, u64 *spte,
2144                                  const void *pte)
2145 {
2146         WARN_ON(1);
2147 }
2148
2149 #define KVM_PAGE_ARRAY_NR 16
2150
2151 struct kvm_mmu_pages {
2152         struct mmu_page_and_offset {
2153                 struct kvm_mmu_page *sp;
2154                 unsigned int idx;
2155         } page[KVM_PAGE_ARRAY_NR];
2156         unsigned int nr;
2157 };
2158
2159 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
2160                          int idx)
2161 {
2162         int i;
2163
2164         if (sp->unsync)
2165                 for (i=0; i < pvec->nr; i++)
2166                         if (pvec->page[i].sp == sp)
2167                                 return 0;
2168
2169         pvec->page[pvec->nr].sp = sp;
2170         pvec->page[pvec->nr].idx = idx;
2171         pvec->nr++;
2172         return (pvec->nr == KVM_PAGE_ARRAY_NR);
2173 }
2174
2175 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
2176 {
2177         --sp->unsync_children;
2178         WARN_ON((int)sp->unsync_children < 0);
2179         __clear_bit(idx, sp->unsync_child_bitmap);
2180 }
2181
2182 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
2183                            struct kvm_mmu_pages *pvec)
2184 {
2185         int i, ret, nr_unsync_leaf = 0;
2186
2187         for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
2188                 struct kvm_mmu_page *child;
2189                 u64 ent = sp->spt[i];
2190
2191                 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
2192                         clear_unsync_child_bit(sp, i);
2193                         continue;
2194                 }
2195
2196                 child = page_header(ent & PT64_BASE_ADDR_MASK);
2197
2198                 if (child->unsync_children) {
2199                         if (mmu_pages_add(pvec, child, i))
2200                                 return -ENOSPC;
2201
2202                         ret = __mmu_unsync_walk(child, pvec);
2203                         if (!ret) {
2204                                 clear_unsync_child_bit(sp, i);
2205                                 continue;
2206                         } else if (ret > 0) {
2207                                 nr_unsync_leaf += ret;
2208                         } else
2209                                 return ret;
2210                 } else if (child->unsync) {
2211                         nr_unsync_leaf++;
2212                         if (mmu_pages_add(pvec, child, i))
2213                                 return -ENOSPC;
2214                 } else
2215                         clear_unsync_child_bit(sp, i);
2216         }
2217
2218         return nr_unsync_leaf;
2219 }
2220
2221 #define INVALID_INDEX (-1)
2222
2223 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
2224                            struct kvm_mmu_pages *pvec)
2225 {
2226         pvec->nr = 0;
2227         if (!sp->unsync_children)
2228                 return 0;
2229
2230         mmu_pages_add(pvec, sp, INVALID_INDEX);
2231         return __mmu_unsync_walk(sp, pvec);
2232 }
2233
2234 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2235 {
2236         WARN_ON(!sp->unsync);
2237         trace_kvm_mmu_sync_page(sp);
2238         sp->unsync = 0;
2239         --kvm->stat.mmu_unsync;
2240 }
2241
2242 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2243                                      struct list_head *invalid_list);
2244 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2245                                     struct list_head *invalid_list);
2246
2247 #define for_each_valid_sp(_kvm, _sp, _list)                             \
2248         hlist_for_each_entry(_sp, _list, hash_link)                     \
2249                 if (is_obsolete_sp((_kvm), (_sp))) {                    \
2250                 } else
2251
2252 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                 \
2253         for_each_valid_sp(_kvm, _sp,                                    \
2254           &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])     \
2255                 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
2256
2257 static inline bool is_ept_sp(struct kvm_mmu_page *sp)
2258 {
2259         return sp->role.cr0_wp && sp->role.smap_andnot_wp;
2260 }
2261
2262 /* @sp->gfn should be write-protected at the call site */
2263 static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2264                             struct list_head *invalid_list)
2265 {
2266         if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
2267             vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
2268                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
2269                 return false;
2270         }
2271
2272         return true;
2273 }
2274
2275 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
2276                                         struct list_head *invalid_list,
2277                                         bool remote_flush)
2278 {
2279         if (!remote_flush && list_empty(invalid_list))
2280                 return false;
2281
2282         if (!list_empty(invalid_list))
2283                 kvm_mmu_commit_zap_page(kvm, invalid_list);
2284         else
2285                 kvm_flush_remote_tlbs(kvm);
2286         return true;
2287 }
2288
2289 static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
2290                                  struct list_head *invalid_list,
2291                                  bool remote_flush, bool local_flush)
2292 {
2293         if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
2294                 return;
2295
2296         if (local_flush)
2297                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
2298 }
2299
2300 #ifdef CONFIG_KVM_MMU_AUDIT
2301 #include "mmu_audit.c"
2302 #else
2303 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
2304 static void mmu_audit_disable(void) { }
2305 #endif
2306
2307 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2308 {
2309         return sp->role.invalid ||
2310                unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2311 }
2312
2313 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2314                          struct list_head *invalid_list)
2315 {
2316         kvm_unlink_unsync_page(vcpu->kvm, sp);
2317         return __kvm_sync_page(vcpu, sp, invalid_list);
2318 }
2319
2320 /* @gfn should be write-protected at the call site */
2321 static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2322                            struct list_head *invalid_list)
2323 {
2324         struct kvm_mmu_page *s;
2325         bool ret = false;
2326
2327         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
2328                 if (!s->unsync)
2329                         continue;
2330
2331                 WARN_ON(s->role.level != PG_LEVEL_4K);
2332                 ret |= kvm_sync_page(vcpu, s, invalid_list);
2333         }
2334
2335         return ret;
2336 }
2337
2338 struct mmu_page_path {
2339         struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2340         unsigned int idx[PT64_ROOT_MAX_LEVEL];
2341 };
2342
2343 #define for_each_sp(pvec, sp, parents, i)                       \
2344                 for (i = mmu_pages_first(&pvec, &parents);      \
2345                         i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
2346                         i = mmu_pages_next(&pvec, &parents, i))
2347
2348 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2349                           struct mmu_page_path *parents,
2350                           int i)
2351 {
2352         int n;
2353
2354         for (n = i+1; n < pvec->nr; n++) {
2355                 struct kvm_mmu_page *sp = pvec->page[n].sp;
2356                 unsigned idx = pvec->page[n].idx;
2357                 int level = sp->role.level;
2358
2359                 parents->idx[level-1] = idx;
2360                 if (level == PG_LEVEL_4K)
2361                         break;
2362
2363                 parents->parent[level-2] = sp;
2364         }
2365
2366         return n;
2367 }
2368
2369 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2370                            struct mmu_page_path *parents)
2371 {
2372         struct kvm_mmu_page *sp;
2373         int level;
2374
2375         if (pvec->nr == 0)
2376                 return 0;
2377
2378         WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2379
2380         sp = pvec->page[0].sp;
2381         level = sp->role.level;
2382         WARN_ON(level == PG_LEVEL_4K);
2383
2384         parents->parent[level-2] = sp;
2385
2386         /* Also set up a sentinel.  Further entries in pvec are all
2387          * children of sp, so this element is never overwritten.
2388          */
2389         parents->parent[level-1] = NULL;
2390         return mmu_pages_next(pvec, parents, 0);
2391 }
2392
2393 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2394 {
2395         struct kvm_mmu_page *sp;
2396         unsigned int level = 0;
2397
2398         do {
2399                 unsigned int idx = parents->idx[level];
2400                 sp = parents->parent[level];
2401                 if (!sp)
2402                         return;
2403
2404                 WARN_ON(idx == INVALID_INDEX);
2405                 clear_unsync_child_bit(sp, idx);
2406                 level++;
2407         } while (!sp->unsync_children);
2408 }
2409
2410 static void mmu_sync_children(struct kvm_vcpu *vcpu,
2411                               struct kvm_mmu_page *parent)
2412 {
2413         int i;
2414         struct kvm_mmu_page *sp;
2415         struct mmu_page_path parents;
2416         struct kvm_mmu_pages pages;
2417         LIST_HEAD(invalid_list);
2418         bool flush = false;
2419
2420         while (mmu_unsync_walk(parent, &pages)) {
2421                 bool protected = false;
2422
2423                 for_each_sp(pages, sp, parents, i)
2424                         protected |= rmap_write_protect(vcpu, sp->gfn);
2425
2426                 if (protected) {
2427                         kvm_flush_remote_tlbs(vcpu->kvm);
2428                         flush = false;
2429                 }
2430
2431                 for_each_sp(pages, sp, parents, i) {
2432                         flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2433                         mmu_pages_clear_parents(&parents);
2434                 }
2435                 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
2436                         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2437                         cond_resched_lock(&vcpu->kvm->mmu_lock);
2438                         flush = false;
2439                 }
2440         }
2441
2442         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2443 }
2444
2445 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2446 {
2447         atomic_set(&sp->write_flooding_count,  0);
2448 }
2449
2450 static void clear_sp_write_flooding_count(u64 *spte)
2451 {
2452         __clear_sp_write_flooding_count(sptep_to_sp(spte));
2453 }
2454
2455 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2456                                              gfn_t gfn,
2457                                              gva_t gaddr,
2458                                              unsigned level,
2459                                              int direct,
2460                                              unsigned int access)
2461 {
2462         bool direct_mmu = vcpu->arch.mmu->direct_map;
2463         union kvm_mmu_page_role role;
2464         struct hlist_head *sp_list;
2465         unsigned quadrant;
2466         struct kvm_mmu_page *sp;
2467         bool need_sync = false;
2468         bool flush = false;
2469         int collisions = 0;
2470         LIST_HEAD(invalid_list);
2471
2472         role = vcpu->arch.mmu->mmu_role.base;
2473         role.level = level;
2474         role.direct = direct;
2475         if (role.direct)
2476                 role.gpte_is_8_bytes = true;
2477         role.access = access;
2478         if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
2479                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2480                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2481                 role.quadrant = quadrant;
2482         }
2483
2484         sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
2485         for_each_valid_sp(vcpu->kvm, sp, sp_list) {
2486                 if (sp->gfn != gfn) {
2487                         collisions++;
2488                         continue;
2489                 }
2490
2491                 if (!need_sync && sp->unsync)
2492                         need_sync = true;
2493
2494                 if (sp->role.word != role.word)
2495                         continue;
2496
2497                 if (direct_mmu)
2498                         goto trace_get_page;
2499
2500                 if (sp->unsync) {
2501                         /* The page is good, but __kvm_sync_page might still end
2502                          * up zapping it.  If so, break in order to rebuild it.
2503                          */
2504                         if (!__kvm_sync_page(vcpu, sp, &invalid_list))
2505                                 break;
2506
2507                         WARN_ON(!list_empty(&invalid_list));
2508                         kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
2509                 }
2510
2511                 if (sp->unsync_children)
2512                         kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
2513
2514                 __clear_sp_write_flooding_count(sp);
2515
2516 trace_get_page:
2517                 trace_kvm_mmu_get_page(sp, false);
2518                 goto out;
2519         }
2520
2521         ++vcpu->kvm->stat.mmu_cache_miss;
2522
2523         sp = kvm_mmu_alloc_page(vcpu, direct);
2524
2525         sp->gfn = gfn;
2526         sp->role = role;
2527         hlist_add_head(&sp->hash_link, sp_list);
2528         if (!direct) {
2529                 /*
2530                  * we should do write protection before syncing pages
2531                  * otherwise the content of the synced shadow page may
2532                  * be inconsistent with guest page table.
2533                  */
2534                 account_shadowed(vcpu->kvm, sp);
2535                 if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
2536                         kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2537
2538                 if (level > PG_LEVEL_4K && need_sync)
2539                         flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2540         }
2541         clear_page(sp->spt);
2542         trace_kvm_mmu_get_page(sp, true);
2543
2544         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2545 out:
2546         if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2547                 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2548         return sp;
2549 }
2550
2551 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2552                                         struct kvm_vcpu *vcpu, hpa_t root,
2553                                         u64 addr)
2554 {
2555         iterator->addr = addr;
2556         iterator->shadow_addr = root;
2557         iterator->level = vcpu->arch.mmu->shadow_root_level;
2558
2559         if (iterator->level == PT64_ROOT_4LEVEL &&
2560             vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2561             !vcpu->arch.mmu->direct_map)
2562                 --iterator->level;
2563
2564         if (iterator->level == PT32E_ROOT_LEVEL) {
2565                 /*
2566                  * prev_root is currently only used for 64-bit hosts. So only
2567                  * the active root_hpa is valid here.
2568                  */
2569                 BUG_ON(root != vcpu->arch.mmu->root_hpa);
2570
2571                 iterator->shadow_addr
2572                         = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2573                 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2574                 --iterator->level;
2575                 if (!iterator->shadow_addr)
2576                         iterator->level = 0;
2577         }
2578 }
2579
2580 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2581                              struct kvm_vcpu *vcpu, u64 addr)
2582 {
2583         shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2584                                     addr);
2585 }
2586
2587 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2588 {
2589         if (iterator->level < PG_LEVEL_4K)
2590                 return false;
2591
2592         iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2593         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2594         return true;
2595 }
2596
2597 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2598                                u64 spte)
2599 {
2600         if (is_last_spte(spte, iterator->level)) {
2601                 iterator->level = 0;
2602                 return;
2603         }
2604
2605         iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2606         --iterator->level;
2607 }
2608
2609 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2610 {
2611         __shadow_walk_next(iterator, *iterator->sptep);
2612 }
2613
2614 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2615                              struct kvm_mmu_page *sp)
2616 {
2617         u64 spte;
2618
2619         BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2620
2621         spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2622                shadow_user_mask | shadow_x_mask | shadow_me_mask;
2623
2624         if (sp_ad_disabled(sp))
2625                 spte |= SPTE_AD_DISABLED_MASK;
2626         else
2627                 spte |= shadow_accessed_mask;
2628
2629         mmu_spte_set(sptep, spte);
2630
2631         mmu_page_add_parent_pte(vcpu, sp, sptep);
2632
2633         if (sp->unsync_children || sp->unsync)
2634                 mark_unsync(sptep);
2635 }
2636
2637 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2638                                    unsigned direct_access)
2639 {
2640         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2641                 struct kvm_mmu_page *child;
2642
2643                 /*
2644                  * For the direct sp, if the guest pte's dirty bit
2645                  * changed form clean to dirty, it will corrupt the
2646                  * sp's access: allow writable in the read-only sp,
2647                  * so we should update the spte at this point to get
2648                  * a new sp with the correct access.
2649                  */
2650                 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2651                 if (child->role.access == direct_access)
2652                         return;
2653
2654                 drop_parent_pte(child, sptep);
2655                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2656         }
2657 }
2658
2659 static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2660                              u64 *spte)
2661 {
2662         u64 pte;
2663         struct kvm_mmu_page *child;
2664
2665         pte = *spte;
2666         if (is_shadow_present_pte(pte)) {
2667                 if (is_last_spte(pte, sp->role.level)) {
2668                         drop_spte(kvm, spte);
2669                         if (is_large_pte(pte))
2670                                 --kvm->stat.lpages;
2671                 } else {
2672                         child = page_header(pte & PT64_BASE_ADDR_MASK);
2673                         drop_parent_pte(child, spte);
2674                 }
2675                 return true;
2676         }
2677
2678         if (is_mmio_spte(pte))
2679                 mmu_spte_clear_no_track(spte);
2680
2681         return false;
2682 }
2683
2684 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2685                                          struct kvm_mmu_page *sp)
2686 {
2687         unsigned i;
2688
2689         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2690                 mmu_page_zap_pte(kvm, sp, sp->spt + i);
2691 }
2692
2693 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2694 {
2695         u64 *sptep;
2696         struct rmap_iterator iter;
2697
2698         while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2699                 drop_parent_pte(sp, sptep);
2700 }
2701
2702 static int mmu_zap_unsync_children(struct kvm *kvm,
2703                                    struct kvm_mmu_page *parent,
2704                                    struct list_head *invalid_list)
2705 {
2706         int i, zapped = 0;
2707         struct mmu_page_path parents;
2708         struct kvm_mmu_pages pages;
2709
2710         if (parent->role.level == PG_LEVEL_4K)
2711                 return 0;
2712
2713         while (mmu_unsync_walk(parent, &pages)) {
2714                 struct kvm_mmu_page *sp;
2715
2716                 for_each_sp(pages, sp, parents, i) {
2717                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2718                         mmu_pages_clear_parents(&parents);
2719                         zapped++;
2720                 }
2721         }
2722
2723         return zapped;
2724 }
2725
2726 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2727                                        struct kvm_mmu_page *sp,
2728                                        struct list_head *invalid_list,
2729                                        int *nr_zapped)
2730 {
2731         bool list_unstable;
2732
2733         trace_kvm_mmu_prepare_zap_page(sp);
2734         ++kvm->stat.mmu_shadow_zapped;
2735         *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2736         kvm_mmu_page_unlink_children(kvm, sp);
2737         kvm_mmu_unlink_parents(kvm, sp);
2738
2739         /* Zapping children means active_mmu_pages has become unstable. */
2740         list_unstable = *nr_zapped;
2741
2742         if (!sp->role.invalid && !sp->role.direct)
2743                 unaccount_shadowed(kvm, sp);
2744
2745         if (sp->unsync)
2746                 kvm_unlink_unsync_page(kvm, sp);
2747         if (!sp->root_count) {
2748                 /* Count self */
2749                 (*nr_zapped)++;
2750
2751                 /*
2752                  * Already invalid pages (previously active roots) are not on
2753                  * the active page list.  See list_del() in the "else" case of
2754                  * !sp->root_count.
2755                  */
2756                 if (sp->role.invalid)
2757                         list_add(&sp->link, invalid_list);
2758                 else
2759                         list_move(&sp->link, invalid_list);
2760                 kvm_mod_used_mmu_pages(kvm, -1);
2761         } else {
2762                 /*
2763                  * Remove the active root from the active page list, the root
2764                  * will be explicitly freed when the root_count hits zero.
2765                  */
2766                 list_del(&sp->link);
2767
2768                 /*
2769                  * Obsolete pages cannot be used on any vCPUs, see the comment
2770                  * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
2771                  * treats invalid shadow pages as being obsolete.
2772                  */
2773                 if (!is_obsolete_sp(kvm, sp))
2774                         kvm_reload_remote_mmus(kvm);
2775         }
2776
2777         if (sp->lpage_disallowed)
2778                 unaccount_huge_nx_page(kvm, sp);
2779
2780         sp->role.invalid = 1;
2781         return list_unstable;
2782 }
2783
2784 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2785                                      struct list_head *invalid_list)
2786 {
2787         int nr_zapped;
2788
2789         __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2790         return nr_zapped;
2791 }
2792
2793 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2794                                     struct list_head *invalid_list)
2795 {
2796         struct kvm_mmu_page *sp, *nsp;
2797
2798         if (list_empty(invalid_list))
2799                 return;
2800
2801         /*
2802          * We need to make sure everyone sees our modifications to
2803          * the page tables and see changes to vcpu->mode here. The barrier
2804          * in the kvm_flush_remote_tlbs() achieves this. This pairs
2805          * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2806          *
2807          * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2808          * guest mode and/or lockless shadow page table walks.
2809          */
2810         kvm_flush_remote_tlbs(kvm);
2811
2812         list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2813                 WARN_ON(!sp->role.invalid || sp->root_count);
2814                 kvm_mmu_free_page(sp);
2815         }
2816 }
2817
2818 static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
2819                                                   unsigned long nr_to_zap)
2820 {
2821         unsigned long total_zapped = 0;
2822         struct kvm_mmu_page *sp, *tmp;
2823         LIST_HEAD(invalid_list);
2824         bool unstable;
2825         int nr_zapped;
2826
2827         if (list_empty(&kvm->arch.active_mmu_pages))
2828                 return 0;
2829
2830 restart:
2831         list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) {
2832                 /*
2833                  * Don't zap active root pages, the page itself can't be freed
2834                  * and zapping it will just force vCPUs to realloc and reload.
2835                  */
2836                 if (sp->root_count)
2837                         continue;
2838
2839                 unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
2840                                                       &nr_zapped);
2841                 total_zapped += nr_zapped;
2842                 if (total_zapped >= nr_to_zap)
2843                         break;
2844
2845                 if (unstable)
2846                         goto restart;
2847         }
2848
2849         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2850
2851         kvm->stat.mmu_recycled += total_zapped;
2852         return total_zapped;
2853 }
2854
2855 static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
2856 {
2857         if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
2858                 return kvm->arch.n_max_mmu_pages -
2859                         kvm->arch.n_used_mmu_pages;
2860
2861         return 0;
2862 }
2863
2864 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2865 {
2866         unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
2867
2868         if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
2869                 return 0;
2870
2871         kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
2872
2873         if (!kvm_mmu_available_pages(vcpu->kvm))
2874                 return -ENOSPC;
2875         return 0;
2876 }
2877
2878 /*
2879  * Changing the number of mmu pages allocated to the vm
2880  * Note: if goal_nr_mmu_pages is too small, you will get dead lock
2881  */
2882 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2883 {
2884         spin_lock(&kvm->mmu_lock);
2885
2886         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2887                 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
2888                                                   goal_nr_mmu_pages);
2889
2890                 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2891         }
2892
2893         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2894
2895         spin_unlock(&kvm->mmu_lock);
2896 }
2897
2898 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2899 {
2900         struct kvm_mmu_page *sp;
2901         LIST_HEAD(invalid_list);
2902         int r;
2903
2904         pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2905         r = 0;
2906         spin_lock(&kvm->mmu_lock);
2907         for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2908                 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2909                          sp->role.word);
2910                 r = 1;
2911                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2912         }
2913         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2914         spin_unlock(&kvm->mmu_lock);
2915
2916         return r;
2917 }
2918 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2919
2920 static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2921 {
2922         trace_kvm_mmu_unsync_page(sp);
2923         ++vcpu->kvm->stat.mmu_unsync;
2924         sp->unsync = 1;
2925
2926         kvm_mmu_mark_parents_unsync(sp);
2927 }
2928
2929 static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2930                                    bool can_unsync)
2931 {
2932         struct kvm_mmu_page *sp;
2933
2934         if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2935                 return true;
2936
2937         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
2938                 if (!can_unsync)
2939                         return true;
2940
2941                 if (sp->unsync)
2942                         continue;
2943
2944                 WARN_ON(sp->role.level != PG_LEVEL_4K);
2945                 kvm_unsync_page(vcpu, sp);
2946         }
2947
2948         /*
2949          * We need to ensure that the marking of unsync pages is visible
2950          * before the SPTE is updated to allow writes because
2951          * kvm_mmu_sync_roots() checks the unsync flags without holding
2952          * the MMU lock and so can race with this. If the SPTE was updated
2953          * before the page had been marked as unsync-ed, something like the
2954          * following could happen:
2955          *
2956          * CPU 1                    CPU 2
2957          * ---------------------------------------------------------------------
2958          * 1.2 Host updates SPTE
2959          *     to be writable
2960          *                      2.1 Guest writes a GPTE for GVA X.
2961          *                          (GPTE being in the guest page table shadowed
2962          *                           by the SP from CPU 1.)
2963          *                          This reads SPTE during the page table walk.
2964          *                          Since SPTE.W is read as 1, there is no
2965          *                          fault.
2966          *
2967          *                      2.2 Guest issues TLB flush.
2968          *                          That causes a VM Exit.
2969          *
2970          *                      2.3 kvm_mmu_sync_pages() reads sp->unsync.
2971          *                          Since it is false, so it just returns.
2972          *
2973          *                      2.4 Guest accesses GVA X.
2974          *                          Since the mapping in the SP was not updated,
2975          *                          so the old mapping for GVA X incorrectly
2976          *                          gets used.
2977          * 1.1 Host marks SP
2978          *     as unsync
2979          *     (sp->unsync = true)
2980          *
2981          * The write barrier below ensures that 1.1 happens before 1.2 and thus
2982          * the situation in 2.4 does not arise. The implicit barrier in 2.2
2983          * pairs with this write barrier.
2984          */
2985         smp_wmb();
2986
2987         return false;
2988 }
2989
2990 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
2991 {
2992         if (pfn_valid(pfn))
2993                 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
2994                         /*
2995                          * Some reserved pages, such as those from NVDIMM
2996                          * DAX devices, are not for MMIO, and can be mapped
2997                          * with cached memory type for better performance.
2998                          * However, the above check misconceives those pages
2999                          * as MMIO, and results in KVM mapping them with UC
3000                          * memory type, which would hurt the performance.
3001                          * Therefore, we check the host memory type in addition
3002                          * and only treat UC/UC-/WC pages as MMIO.
3003                          */
3004                         (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
3005
3006         return !e820__mapped_raw_any(pfn_to_hpa(pfn),
3007                                      pfn_to_hpa(pfn + 1) - 1,
3008                                      E820_TYPE_RAM);
3009 }
3010
3011 /* Bits which may be returned by set_spte() */
3012 #define SET_SPTE_WRITE_PROTECTED_PT     BIT(0)
3013 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH  BIT(1)
3014
3015 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
3016                     unsigned int pte_access, int level,
3017                     gfn_t gfn, kvm_pfn_t pfn, bool speculative,
3018                     bool can_unsync, bool host_writable)
3019 {
3020         u64 spte = 0;
3021         int ret = 0;
3022         struct kvm_mmu_page *sp;
3023
3024         if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
3025                 return 0;
3026
3027         sp = sptep_to_sp(sptep);
3028         if (sp_ad_disabled(sp))
3029                 spte |= SPTE_AD_DISABLED_MASK;
3030         else if (kvm_vcpu_ad_need_write_protect(vcpu))
3031                 spte |= SPTE_AD_WRPROT_ONLY_MASK;
3032
3033         /*
3034          * For the EPT case, shadow_present_mask is 0 if hardware
3035          * supports exec-only page table entries.  In that case,
3036          * ACC_USER_MASK and shadow_user_mask are used to represent
3037          * read access.  See FNAME(gpte_access) in paging_tmpl.h.
3038          */
3039         spte |= shadow_present_mask;
3040         if (!speculative)
3041                 spte |= spte_shadow_accessed_mask(spte);
3042
3043         if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
3044             is_nx_huge_page_enabled()) {
3045                 pte_access &= ~ACC_EXEC_MASK;
3046         }
3047
3048         if (pte_access & ACC_EXEC_MASK)
3049                 spte |= shadow_x_mask;
3050         else
3051                 spte |= shadow_nx_mask;
3052
3053         if (pte_access & ACC_USER_MASK)
3054                 spte |= shadow_user_mask;
3055
3056         if (level > PG_LEVEL_4K)
3057                 spte |= PT_PAGE_SIZE_MASK;
3058         if (tdp_enabled)
3059                 spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
3060                         kvm_is_mmio_pfn(pfn));
3061
3062         if (host_writable)
3063                 spte |= SPTE_HOST_WRITEABLE;
3064         else
3065                 pte_access &= ~ACC_WRITE_MASK;
3066
3067         if (!kvm_is_mmio_pfn(pfn))
3068                 spte |= shadow_me_mask;
3069
3070         spte |= (u64)pfn << PAGE_SHIFT;
3071
3072         if (pte_access & ACC_WRITE_MASK) {
3073                 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
3074
3075                 /*
3076                  * Optimization: for pte sync, if spte was writable the hash
3077                  * lookup is unnecessary (and expensive). Write protection
3078                  * is responsibility of mmu_get_page / kvm_sync_page.
3079                  * Same reasoning can be applied to dirty page accounting.
3080                  */
3081                 if (!can_unsync && is_writable_pte(*sptep))
3082                         goto set_pte;
3083
3084                 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
3085                         pgprintk("%s: found shadow page for %llx, marking ro\n",
3086                                  __func__, gfn);
3087                         ret |= SET_SPTE_WRITE_PROTECTED_PT;
3088                         pte_access &= ~ACC_WRITE_MASK;
3089                         spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
3090                 }
3091         }
3092
3093         if (pte_access & ACC_WRITE_MASK) {
3094                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3095                 spte |= spte_shadow_dirty_mask(spte);
3096         }
3097
3098         if (speculative)
3099                 spte = mark_spte_for_access_track(spte);
3100
3101 set_pte:
3102         if (mmu_spte_update(sptep, spte))
3103                 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
3104         return ret;
3105 }
3106
3107 static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
3108                         unsigned int pte_access, int write_fault, int level,
3109                         gfn_t gfn, kvm_pfn_t pfn, bool speculative,
3110                         bool host_writable)
3111 {
3112         int was_rmapped = 0;
3113         int rmap_count;
3114         int set_spte_ret;
3115         int ret = RET_PF_RETRY;
3116         bool flush = false;
3117
3118         pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
3119                  *sptep, write_fault, gfn);
3120
3121         if (is_shadow_present_pte(*sptep)) {
3122                 /*
3123                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
3124                  * the parent of the now unreachable PTE.
3125                  */
3126                 if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
3127                         struct kvm_mmu_page *child;
3128                         u64 pte = *sptep;
3129
3130                         child = page_header(pte & PT64_BASE_ADDR_MASK);
3131                         drop_parent_pte(child, sptep);
3132                         flush = true;
3133                 } else if (pfn != spte_to_pfn(*sptep)) {
3134                         pgprintk("hfn old %llx new %llx\n",
3135                                  spte_to_pfn(*sptep), pfn);
3136                         drop_spte(vcpu->kvm, sptep);
3137                         flush = true;
3138                 } else
3139                         was_rmapped = 1;
3140         }
3141
3142         set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
3143                                 speculative, true, host_writable);
3144         if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
3145                 if (write_fault)
3146                         ret = RET_PF_EMULATE;
3147                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
3148         }
3149
3150         if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
3151                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3152                                 KVM_PAGES_PER_HPAGE(level));
3153
3154         if (unlikely(is_mmio_spte(*sptep)))
3155                 ret = RET_PF_EMULATE;
3156
3157         pgprintk("%s: setting spte %llx\n", __func__, *sptep);
3158         trace_kvm_mmu_set_spte(level, gfn, sptep);
3159         if (!was_rmapped && is_large_pte(*sptep))
3160                 ++vcpu->kvm->stat.lpages;
3161
3162         if (is_shadow_present_pte(*sptep)) {
3163                 if (!was_rmapped) {
3164                         rmap_count = rmap_add(vcpu, sptep, gfn);
3165                         if (rmap_count > RMAP_RECYCLE_THRESHOLD)
3166                                 rmap_recycle(vcpu, sptep, gfn);
3167                 }
3168         }
3169
3170         return ret;
3171 }
3172
3173 static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
3174                                      bool no_dirty_log)
3175 {
3176         struct kvm_memory_slot *slot;
3177
3178         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
3179         if (!slot)
3180                 return KVM_PFN_ERR_FAULT;
3181
3182         return gfn_to_pfn_memslot_atomic(slot, gfn);
3183 }
3184
3185 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3186                                     struct kvm_mmu_page *sp,
3187                                     u64 *start, u64 *end)
3188 {
3189         struct page *pages[PTE_PREFETCH_NUM];
3190         struct kvm_memory_slot *slot;
3191         unsigned int access = sp->role.access;
3192         int i, ret;
3193         gfn_t gfn;
3194
3195         gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
3196         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
3197         if (!slot)
3198                 return -1;
3199
3200         ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3201         if (ret <= 0)
3202                 return -1;
3203
3204         for (i = 0; i < ret; i++, gfn++, start++) {
3205                 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3206                              page_to_pfn(pages[i]), true, true);
3207                 put_page(pages[i]);
3208         }
3209
3210         return 0;
3211 }
3212
3213 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3214                                   struct kvm_mmu_page *sp, u64 *sptep)
3215 {
3216         u64 *spte, *start = NULL;
3217         int i;
3218
3219         WARN_ON(!sp->role.direct);
3220
3221         i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
3222         spte = sp->spt + i;
3223
3224         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3225                 if (is_shadow_present_pte(*spte) || spte == sptep) {
3226                         if (!start)
3227                                 continue;
3228                         if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3229                                 break;
3230                         start = NULL;
3231                 } else if (!start)
3232                         start = spte;
3233         }
3234 }
3235
3236 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3237 {
3238         struct kvm_mmu_page *sp;
3239
3240         sp = sptep_to_sp(sptep);
3241
3242         /*
3243          * Without accessed bits, there's no way to distinguish between
3244          * actually accessed translations and prefetched, so disable pte
3245          * prefetch if accessed bits aren't available.
3246          */
3247         if (sp_ad_disabled(sp))
3248                 return;
3249
3250         if (sp->role.level > PG_LEVEL_4K)
3251                 return;
3252
3253         __direct_pte_prefetch(vcpu, sp, sptep);
3254 }
3255
3256 static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
3257                                   kvm_pfn_t pfn, struct kvm_memory_slot *slot)
3258 {
3259         unsigned long hva;
3260         pte_t *pte;
3261         int level;
3262
3263         if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
3264                 return PG_LEVEL_4K;
3265
3266         /*
3267          * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
3268          * is not solely for performance, it's also necessary to avoid the
3269          * "writable" check in __gfn_to_hva_many(), which will always fail on
3270          * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
3271          * page fault steps have already verified the guest isn't writing a
3272          * read-only memslot.
3273          */
3274         hva = __gfn_to_hva_memslot(slot, gfn);
3275
3276         pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
3277         if (unlikely(!pte))
3278                 return PG_LEVEL_4K;
3279
3280         return level;
3281 }
3282
3283 static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
3284                                    int max_level, kvm_pfn_t *pfnp)
3285 {
3286         struct kvm_memory_slot *slot;
3287         struct kvm_lpage_info *linfo;
3288         kvm_pfn_t pfn = *pfnp;
3289         kvm_pfn_t mask;
3290         int level;
3291
3292         if (unlikely(max_level == PG_LEVEL_4K))
3293                 return PG_LEVEL_4K;
3294
3295         if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
3296                 return PG_LEVEL_4K;
3297
3298         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true);
3299         if (!slot)
3300                 return PG_LEVEL_4K;
3301
3302         max_level = min(max_level, max_page_level);
3303         for ( ; max_level > PG_LEVEL_4K; max_level--) {
3304                 linfo = lpage_info_slot(gfn, slot, max_level);
3305                 if (!linfo->disallow_lpage)
3306                         break;
3307         }
3308
3309         if (max_level == PG_LEVEL_4K)
3310                 return PG_LEVEL_4K;
3311
3312         level = host_pfn_mapping_level(vcpu, gfn, pfn, slot);
3313         if (level == PG_LEVEL_4K)
3314                 return level;
3315
3316         level = min(level, max_level);
3317
3318         /*
3319          * mmu_notifier_retry() was successful and mmu_lock is held, so
3320          * the pmd can't be split from under us.
3321          */
3322         mask = KVM_PAGES_PER_HPAGE(level) - 1;
3323         VM_BUG_ON((gfn & mask) != (pfn & mask));
3324         *pfnp = pfn & ~mask;
3325
3326         return level;
3327 }
3328
3329 static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
3330                                        gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
3331 {
3332         int level = *levelp;
3333         u64 spte = *it.sptep;
3334
3335         if (it.level == level && level > PG_LEVEL_4K &&
3336             is_nx_huge_page_enabled() &&
3337             is_shadow_present_pte(spte) &&
3338             !is_large_pte(spte)) {
3339                 /*
3340                  * A small SPTE exists for this pfn, but FNAME(fetch)
3341                  * and __direct_map would like to create a large PTE
3342                  * instead: just force them to go down another level,
3343                  * patching back for them into pfn the next 9 bits of
3344                  * the address.
3345                  */
3346                 u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
3347                 *pfnp |= gfn & page_mask;
3348                 (*levelp)--;
3349         }
3350 }
3351
3352 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
3353                         int map_writable, int max_level, kvm_pfn_t pfn,
3354                         bool prefault, bool account_disallowed_nx_lpage)
3355 {
3356         struct kvm_shadow_walk_iterator it;
3357         struct kvm_mmu_page *sp;
3358         int level, ret;
3359         gfn_t gfn = gpa >> PAGE_SHIFT;
3360         gfn_t base_gfn = gfn;
3361
3362         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
3363                 return RET_PF_RETRY;
3364
3365         level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn);
3366
3367         trace_kvm_mmu_spte_requested(gpa, level, pfn);
3368         for_each_shadow_entry(vcpu, gpa, it) {
3369                 /*
3370                  * We cannot overwrite existing page tables with an NX
3371                  * large page, as the leaf could be executable.
3372                  */
3373                 disallowed_hugepage_adjust(it, gfn, &pfn, &level);
3374
3375                 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
3376                 if (it.level == level)
3377                         break;
3378
3379                 drop_large_spte(vcpu, it.sptep);
3380                 if (!is_shadow_present_pte(*it.sptep)) {
3381                         sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
3382                                               it.level - 1, true, ACC_ALL);
3383
3384                         link_shadow_page(vcpu, it.sptep, sp);
3385                         if (account_disallowed_nx_lpage)
3386                                 account_huge_nx_page(vcpu->kvm, sp);
3387                 }
3388         }
3389
3390         ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
3391                            write, level, base_gfn, pfn, prefault,
3392                            map_writable);
3393         direct_pte_prefetch(vcpu, it.sptep);
3394         ++vcpu->stat.pf_fixed;
3395         return ret;
3396 }
3397
3398 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3399 {
3400         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3401 }
3402
3403 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3404 {
3405         /*
3406          * Do not cache the mmio info caused by writing the readonly gfn
3407          * into the spte otherwise read access on readonly gfn also can
3408          * caused mmio page fault and treat it as mmio access.
3409          */
3410         if (pfn == KVM_PFN_ERR_RO_FAULT)
3411                 return RET_PF_EMULATE;
3412
3413         if (pfn == KVM_PFN_ERR_HWPOISON) {
3414                 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3415                 return RET_PF_RETRY;
3416         }
3417
3418         return -EFAULT;
3419 }
3420
3421 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3422                                 kvm_pfn_t pfn, unsigned int access,
3423                                 int *ret_val)
3424 {
3425         /* The pfn is invalid, report the error! */
3426         if (unlikely(is_error_pfn(pfn))) {
3427                 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3428                 return true;
3429         }
3430
3431         if (unlikely(is_noslot_pfn(pfn)))
3432                 vcpu_cache_mmio_info(vcpu, gva, gfn,
3433                                      access & shadow_mmio_access_mask);
3434
3435         return false;
3436 }
3437
3438 static bool page_fault_can_be_fast(u32 error_code)
3439 {
3440         /*
3441          * Do not fix the mmio spte with invalid generation number which
3442          * need to be updated by slow page fault path.
3443          */
3444         if (unlikely(error_code & PFERR_RSVD_MASK))
3445                 return false;
3446
3447         /* See if the page fault is due to an NX violation */
3448         if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3449                       == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3450                 return false;
3451
3452         /*
3453          * #PF can be fast if:
3454          * 1. The shadow page table entry is not present, which could mean that
3455          *    the fault is potentially caused by access tracking (if enabled).
3456          * 2. The shadow page table entry is present and the fault
3457          *    is caused by write-protect, that means we just need change the W
3458          *    bit of the spte which can be done out of mmu-lock.
3459          *
3460          * However, if access tracking is disabled we know that a non-present
3461          * page must be a genuine page fault where we have to create a new SPTE.
3462          * So, if access tracking is disabled, we return true only for write
3463          * accesses to a present page.
3464          */
3465
3466         return shadow_acc_track_mask != 0 ||
3467                ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3468                 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
3469 }
3470
3471 /*
3472  * Returns true if the SPTE was fixed successfully. Otherwise,
3473  * someone else modified the SPTE from its original value.
3474  */
3475 static bool
3476 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
3477                         u64 *sptep, u64 old_spte, u64 new_spte)
3478 {
3479         gfn_t gfn;
3480
3481         WARN_ON(!sp->role.direct);
3482
3483         /*
3484          * Theoretically we could also set dirty bit (and flush TLB) here in
3485          * order to eliminate unnecessary PML logging. See comments in
3486          * set_spte. But fast_page_fault is very unlikely to happen with PML
3487          * enabled, so we do not do this. This might result in the same GPA
3488          * to be logged in PML buffer again when the write really happens, and
3489          * eventually to be called by mark_page_dirty twice. But it's also no
3490          * harm. This also avoids the TLB flush needed after setting dirty bit
3491          * so non-PML cases won't be impacted.
3492          *
3493          * Compare with set_spte where instead shadow_dirty_mask is set.
3494          */
3495         if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3496                 return false;
3497
3498         if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
3499                 /*
3500                  * The gfn of direct spte is stable since it is
3501                  * calculated by sp->gfn.
3502                  */
3503                 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3504                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3505         }
3506
3507         return true;
3508 }
3509
3510 static bool is_access_allowed(u32 fault_err_code, u64 spte)
3511 {
3512         if (fault_err_code & PFERR_FETCH_MASK)
3513                 return is_executable_pte(spte);
3514
3515         if (fault_err_code & PFERR_WRITE_MASK)
3516                 return is_writable_pte(spte);
3517
3518         /* Fault was on Read access */
3519         return spte & PT_PRESENT_MASK;
3520 }
3521
3522 /*
3523  * Return value:
3524  * - true: let the vcpu to access on the same address again.
3525  * - false: let the real page fault path to fix it.
3526  */
3527 static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
3528                             u32 error_code)
3529 {
3530         struct kvm_shadow_walk_iterator iterator;
3531         struct kvm_mmu_page *sp;
3532         bool fault_handled = false;
3533         u64 spte = 0ull;
3534         uint retry_count = 0;
3535
3536         if (!page_fault_can_be_fast(error_code))
3537                 return false;
3538
3539         walk_shadow_page_lockless_begin(vcpu);
3540
3541         do {
3542                 u64 new_spte;
3543
3544                 for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
3545                         if (!is_shadow_present_pte(spte))
3546                                 break;
3547
3548                 sp = sptep_to_sp(iterator.sptep);
3549                 if (!is_last_spte(spte, sp->role.level))
3550                         break;
3551
3552                 /*
3553                  * Check whether the memory access that caused the fault would
3554                  * still cause it if it were to be performed right now. If not,
3555                  * then this is a spurious fault caused by TLB lazily flushed,
3556                  * or some other CPU has already fixed the PTE after the
3557                  * current CPU took the fault.
3558                  *
3559                  * Need not check the access of upper level table entries since
3560                  * they are always ACC_ALL.
3561                  */
3562                 if (is_access_allowed(error_code, spte)) {
3563                         fault_handled = true;
3564                         break;
3565                 }
3566
3567                 new_spte = spte;
3568
3569                 if (is_access_track_spte(spte))
3570                         new_spte = restore_acc_track_spte(new_spte);
3571
3572                 /*
3573                  * Currently, to simplify the code, write-protection can
3574                  * be removed in the fast path only if the SPTE was
3575                  * write-protected for dirty-logging or access tracking.
3576                  */
3577                 if ((error_code & PFERR_WRITE_MASK) &&
3578                     spte_can_locklessly_be_made_writable(spte)) {
3579                         new_spte |= PT_WRITABLE_MASK;
3580
3581                         /*
3582                          * Do not fix write-permission on the large spte.  Since
3583                          * we only dirty the first page into the dirty-bitmap in
3584                          * fast_pf_fix_direct_spte(), other pages are missed
3585                          * if its slot has dirty logging enabled.
3586                          *
3587                          * Instead, we let the slow page fault path create a
3588                          * normal spte to fix the access.
3589                          *
3590                          * See the comments in kvm_arch_commit_memory_region().
3591                          */
3592                         if (sp->role.level > PG_LEVEL_4K)
3593                                 break;
3594                 }
3595
3596                 /* Verify that the fault can be handled in the fast path */
3597                 if (new_spte == spte ||
3598                     !is_access_allowed(error_code, new_spte))
3599                         break;
3600
3601                 /*
3602                  * Currently, fast page fault only works for direct mapping
3603                  * since the gfn is not stable for indirect shadow page. See
3604                  * Documentation/virt/kvm/locking.rst to get more detail.
3605                  */
3606                 fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
3607                                                         iterator.sptep, spte,
3608                                                         new_spte);
3609                 if (fault_handled)
3610                         break;
3611
3612                 if (++retry_count > 4) {
3613                         printk_once(KERN_WARNING
3614                                 "kvm: Fast #PF retrying more than 4 times.\n");
3615                         break;
3616                 }
3617
3618         } while (true);
3619
3620         trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
3621                               spte, fault_handled);
3622         walk_shadow_page_lockless_end(vcpu);
3623
3624         return fault_handled;
3625 }
3626
3627 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3628                                struct list_head *invalid_list)
3629 {
3630         struct kvm_mmu_page *sp;
3631
3632         if (!VALID_PAGE(*root_hpa))
3633                 return;
3634
3635         sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3636         --sp->root_count;
3637         if (!sp->root_count && sp->role.invalid)
3638                 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3639
3640         *root_hpa = INVALID_PAGE;
3641 }
3642
3643 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3644 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3645                         ulong roots_to_free)
3646 {
3647         int i;
3648         LIST_HEAD(invalid_list);
3649         bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3650
3651         BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3652
3653         /* Before acquiring the MMU lock, see if we need to do any real work. */
3654         if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
3655                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3656                         if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3657                             VALID_PAGE(mmu->prev_roots[i].hpa))
3658                                 break;
3659
3660                 if (i == KVM_MMU_NUM_PREV_ROOTS)
3661                         return;
3662         }
3663
3664         spin_lock(&vcpu->kvm->mmu_lock);
3665
3666         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3667                 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3668                         mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
3669                                            &invalid_list);
3670
3671         if (free_active_root) {
3672                 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3673                     (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3674                         mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
3675                                            &invalid_list);
3676                 } else {
3677                         for (i = 0; i < 4; ++i)
3678                                 if (mmu->pae_root[i] != 0)
3679                                         mmu_free_root_page(vcpu->kvm,
3680                                                            &mmu->pae_root[i],
3681                                                            &invalid_list);
3682                         mmu->root_hpa = INVALID_PAGE;
3683                 }
3684                 mmu->root_pgd = 0;
3685         }
3686
3687         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3688         spin_unlock(&vcpu->kvm->mmu_lock);
3689 }
3690 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3691
3692 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3693 {
3694         int ret = 0;
3695
3696         if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
3697                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3698                 ret = 1;
3699         }
3700
3701         return ret;
3702 }
3703
3704 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
3705                             u8 level, bool direct)
3706 {
3707         struct kvm_mmu_page *sp;
3708
3709         spin_lock(&vcpu->kvm->mmu_lock);
3710
3711         if (make_mmu_pages_available(vcpu)) {
3712                 spin_unlock(&vcpu->kvm->mmu_lock);
3713                 return INVALID_PAGE;
3714         }
3715         sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
3716         ++sp->root_count;
3717
3718         spin_unlock(&vcpu->kvm->mmu_lock);
3719         return __pa(sp->spt);
3720 }
3721
3722 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3723 {
3724         u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
3725         hpa_t root;
3726         unsigned i;
3727
3728         if (shadow_root_level >= PT64_ROOT_4LEVEL) {
3729                 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
3730                 if (!VALID_PAGE(root))
3731                         return -ENOSPC;
3732                 vcpu->arch.mmu->root_hpa = root;
3733         } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
3734                 for (i = 0; i < 4; ++i) {
3735                         MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
3736
3737                         root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
3738                                               i << 30, PT32_ROOT_LEVEL, true);
3739                         if (!VALID_PAGE(root))
3740                                 return -ENOSPC;
3741                         vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
3742                 }
3743                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3744         } else
3745                 BUG();
3746
3747         /* root_pgd is ignored for direct MMUs. */
3748         vcpu->arch.mmu->root_pgd = 0;
3749
3750         return 0;
3751 }
3752
3753 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3754 {
3755         u64 pdptr, pm_mask;
3756         gfn_t root_gfn, root_pgd;
3757         hpa_t root;
3758         int i;
3759
3760         root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
3761         root_gfn = root_pgd >> PAGE_SHIFT;
3762
3763         if (mmu_check_root(vcpu, root_gfn))
3764                 return 1;
3765
3766         /*
3767          * Do we shadow a long mode page table? If so we need to
3768          * write-protect the guests page table root.
3769          */
3770         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3771                 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
3772
3773                 root = mmu_alloc_root(vcpu, root_gfn, 0,
3774                                       vcpu->arch.mmu->shadow_root_level, false);
3775                 if (!VALID_PAGE(root))
3776                         return -ENOSPC;
3777                 vcpu->arch.mmu->root_hpa = root;
3778                 goto set_root_pgd;
3779         }
3780
3781         /*
3782          * We shadow a 32 bit page table. This may be a legacy 2-level
3783          * or a PAE 3-level page table. In either case we need to be aware that
3784          * the shadow page table may be a PAE or a long mode page table.
3785          */
3786         pm_mask = PT_PRESENT_MASK;
3787         if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3788                 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3789
3790         for (i = 0; i < 4; ++i) {
3791                 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
3792                 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
3793                         pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
3794                         if (!(pdptr & PT_PRESENT_MASK)) {
3795                                 vcpu->arch.mmu->pae_root[i] = 0;
3796                                 continue;
3797                         }
3798                         root_gfn = pdptr >> PAGE_SHIFT;
3799                         if (mmu_check_root(vcpu, root_gfn))
3800                                 return 1;
3801                 }
3802
3803                 root = mmu_alloc_root(vcpu, root_gfn, i << 30,
3804                                       PT32_ROOT_LEVEL, false);
3805                 if (!VALID_PAGE(root))
3806                         return -ENOSPC;
3807                 vcpu->arch.mmu->pae_root[i] = root | pm_mask;
3808         }
3809         vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3810
3811         /*
3812          * If we shadow a 32 bit page table with a long mode page
3813          * table we enter this path.
3814          */
3815         if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
3816                 if (vcpu->arch.mmu->lm_root == NULL) {
3817                         /*
3818                          * The additional page necessary for this is only
3819                          * allocated on demand.
3820                          */
3821
3822                         u64 *lm_root;
3823
3824                         lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3825                         if (lm_root == NULL)
3826                                 return 1;
3827
3828                         lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
3829
3830                         vcpu->arch.mmu->lm_root = lm_root;
3831                 }
3832
3833                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3834         }
3835
3836 set_root_pgd:
3837         vcpu->arch.mmu->root_pgd = root_pgd;
3838
3839         return 0;
3840 }
3841
3842 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3843 {
3844         if (vcpu->arch.mmu->direct_map)
3845                 return mmu_alloc_direct_roots(vcpu);
3846         else
3847                 return mmu_alloc_shadow_roots(vcpu);
3848 }
3849
3850 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3851 {
3852         int i;
3853         struct kvm_mmu_page *sp;
3854
3855         if (vcpu->arch.mmu->direct_map)
3856                 return;
3857
3858         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3859                 return;
3860
3861         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3862
3863         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3864                 hpa_t root = vcpu->arch.mmu->root_hpa;
3865                 sp = page_header(root);
3866
3867                 /*
3868                  * Even if another CPU was marking the SP as unsync-ed
3869                  * simultaneously, any guest page table changes are not
3870                  * guaranteed to be visible anyway until this VCPU issues a TLB
3871                  * flush strictly after those changes are made. We only need to
3872                  * ensure that the other CPU sets these flags before any actual
3873                  * changes to the page tables are made. The comments in
3874                  * mmu_need_write_protect() describe what could go wrong if this
3875                  * requirement isn't satisfied.
3876                  */
3877                 if (!smp_load_acquire(&sp->unsync) &&
3878                     !smp_load_acquire(&sp->unsync_children))
3879                         return;
3880
3881                 spin_lock(&vcpu->kvm->mmu_lock);
3882                 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3883
3884                 mmu_sync_children(vcpu, sp);
3885
3886                 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3887                 spin_unlock(&vcpu->kvm->mmu_lock);
3888                 return;
3889         }
3890
3891         spin_lock(&vcpu->kvm->mmu_lock);
3892         kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3893
3894         for (i = 0; i < 4; ++i) {
3895                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3896
3897                 if (root && VALID_PAGE(root)) {
3898                         root &= PT64_BASE_ADDR_MASK;
3899                         sp = page_header(root);
3900                         mmu_sync_children(vcpu, sp);
3901                 }
3902         }
3903
3904         kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3905         spin_unlock(&vcpu->kvm->mmu_lock);
3906 }
3907 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3908
3909 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
3910                                   u32 access, struct x86_exception *exception)
3911 {
3912         if (exception)
3913                 exception->error_code = 0;
3914         return vaddr;
3915 }
3916
3917 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
3918                                          u32 access,
3919                                          struct x86_exception *exception)
3920 {
3921         if (exception)
3922                 exception->error_code = 0;
3923         return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
3924 }
3925
3926 static bool
3927 __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
3928 {
3929         int bit7 = (pte >> 7) & 1;
3930
3931         return pte & rsvd_check->rsvd_bits_mask[bit7][level-1];
3932 }
3933
3934 static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte)
3935 {
3936         return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
3937 }
3938
3939 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3940 {
3941         /*
3942          * A nested guest cannot use the MMIO cache if it is using nested
3943          * page tables, because cr2 is a nGPA while the cache stores GPAs.
3944          */
3945         if (mmu_is_nested(vcpu))
3946                 return false;
3947
3948         if (direct)
3949                 return vcpu_match_mmio_gpa(vcpu, addr);
3950
3951         return vcpu_match_mmio_gva(vcpu, addr);
3952 }
3953
3954 /* return true if reserved bit is detected on spte. */
3955 static bool
3956 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3957 {
3958         struct kvm_shadow_walk_iterator iterator;
3959         u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
3960         struct rsvd_bits_validate *rsvd_check;
3961         int root, leaf;
3962         bool reserved = false;
3963
3964         rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
3965
3966         walk_shadow_page_lockless_begin(vcpu);
3967
3968         for (shadow_walk_init(&iterator, vcpu, addr),
3969                  leaf = root = iterator.level;
3970              shadow_walk_okay(&iterator);
3971              __shadow_walk_next(&iterator, spte)) {
3972                 spte = mmu_spte_get_lockless(iterator.sptep);
3973
3974                 sptes[leaf - 1] = spte;
3975                 leaf--;
3976
3977                 if (!is_shadow_present_pte(spte))
3978                         break;
3979
3980                 /*
3981                  * Use a bitwise-OR instead of a logical-OR to aggregate the
3982                  * reserved bit and EPT's invalid memtype/XWR checks to avoid
3983                  * adding a Jcc in the loop.
3984                  */
3985                 reserved |= __is_bad_mt_xwr(rsvd_check, spte) |
3986                             __is_rsvd_bits_set(rsvd_check, spte, iterator.level);
3987         }
3988
3989         walk_shadow_page_lockless_end(vcpu);
3990
3991         if (reserved) {
3992                 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
3993                        __func__, addr);
3994                 while (root > leaf) {
3995                         pr_err("------ spte 0x%llx level %d.\n",
3996                                sptes[root - 1], root);
3997                         root--;
3998                 }
3999         }
4000
4001         *sptep = spte;
4002         return reserved;
4003 }
4004
4005 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4006 {
4007         u64 spte;
4008         bool reserved;
4009
4010         if (mmio_info_in_cache(vcpu, addr, direct))
4011                 return RET_PF_EMULATE;
4012
4013         reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
4014         if (WARN_ON(reserved))
4015                 return -EINVAL;
4016
4017         if (is_mmio_spte(spte)) {
4018                 gfn_t gfn = get_mmio_spte_gfn(spte);
4019                 unsigned int access = get_mmio_spte_access(spte);
4020
4021                 if (!check_mmio_spte(vcpu, spte))
4022                         return RET_PF_INVALID;
4023
4024                 if (direct)
4025                         addr = 0;
4026
4027                 trace_handle_mmio_page_fault(addr, gfn, access);
4028                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
4029                 return RET_PF_EMULATE;
4030         }
4031
4032         /*
4033          * If the page table is zapped by other cpus, let CPU fault again on
4034          * the address.
4035          */
4036         return RET_PF_RETRY;
4037 }
4038
4039 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
4040                                          u32 error_code, gfn_t gfn)
4041 {
4042         if (unlikely(error_code & PFERR_RSVD_MASK))
4043                 return false;
4044
4045         if (!(error_code & PFERR_PRESENT_MASK) ||
4046               !(error_code & PFERR_WRITE_MASK))
4047                 return false;
4048
4049         /*
4050          * guest is writing the page which is write tracked which can
4051          * not be fixed by page fault handler.
4052          */
4053         if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
4054                 return true;
4055
4056         return false;
4057 }
4058
4059 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4060 {
4061         struct kvm_shadow_walk_iterator iterator;
4062         u64 spte;
4063
4064         walk_shadow_page_lockless_begin(vcpu);
4065         for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
4066                 clear_sp_write_flooding_count(iterator.sptep);
4067                 if (!is_shadow_present_pte(spte))
4068                         break;
4069         }
4070         walk_shadow_page_lockless_end(vcpu);
4071 }
4072
4073 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
4074                                     gfn_t gfn)
4075 {
4076         struct kvm_arch_async_pf arch;
4077
4078         arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4079         arch.gfn = gfn;
4080         arch.direct_map = vcpu->arch.mmu->direct_map;
4081         arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
4082
4083         return kvm_setup_async_pf(vcpu, cr2_or_gpa,
4084                                   kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4085 }
4086
4087 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
4088                          gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
4089                          bool *writable)
4090 {
4091         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
4092         bool async;
4093
4094         /* Don't expose private memslots to L2. */
4095         if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
4096                 *pfn = KVM_PFN_NOSLOT;
4097                 *writable = false;
4098                 return false;
4099         }
4100
4101         async = false;
4102         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
4103         if (!async)
4104                 return false; /* *pfn has correct page already */
4105
4106         if (!prefault && kvm_can_do_async_pf(vcpu)) {
4107                 trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
4108                 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
4109                         trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
4110                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4111                         return true;
4112                 } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
4113                         return true;
4114         }
4115
4116         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
4117         return false;
4118 }
4119
4120 static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
4121                              bool prefault, int max_level, bool is_tdp)
4122 {
4123         bool write = error_code & PFERR_WRITE_MASK;
4124         bool exec = error_code & PFERR_FETCH_MASK;
4125         bool lpage_disallowed = exec && is_nx_huge_page_enabled();
4126         bool map_writable;
4127
4128         gfn_t gfn = gpa >> PAGE_SHIFT;
4129         unsigned long mmu_seq;
4130         kvm_pfn_t pfn;
4131         int r;
4132
4133         if (page_fault_handle_page_track(vcpu, error_code, gfn))
4134                 return RET_PF_EMULATE;
4135
4136         r = mmu_topup_memory_caches(vcpu);
4137         if (r)
4138                 return r;
4139
4140         if (lpage_disallowed)
4141                 max_level = PG_LEVEL_4K;
4142
4143         if (fast_page_fault(vcpu, gpa, error_code))
4144                 return RET_PF_RETRY;
4145
4146         mmu_seq = vcpu->kvm->mmu_notifier_seq;
4147         smp_rmb();
4148
4149         if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
4150                 return RET_PF_RETRY;
4151
4152         if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
4153                 return r;
4154
4155         r = RET_PF_RETRY;
4156         spin_lock(&vcpu->kvm->mmu_lock);
4157         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4158                 goto out_unlock;
4159         r = make_mmu_pages_available(vcpu);
4160         if (r)
4161                 goto out_unlock;
4162         r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn,
4163                          prefault, is_tdp && lpage_disallowed);
4164
4165 out_unlock:
4166         spin_unlock(&vcpu->kvm->mmu_lock);
4167         kvm_release_pfn_clean(pfn);
4168         return r;
4169 }
4170
4171 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
4172                                 u32 error_code, bool prefault)
4173 {
4174         pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
4175
4176         /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
4177         return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
4178                                  PG_LEVEL_2M, false);
4179 }
4180
4181 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4182                                 u64 fault_address, char *insn, int insn_len)
4183 {
4184         int r = 1;
4185         u32 flags = vcpu->arch.apf.host_apf_flags;
4186
4187 #ifndef CONFIG_X86_64
4188         /* A 64-bit CR2 should be impossible on 32-bit KVM. */
4189         if (WARN_ON_ONCE(fault_address >> 32))
4190                 return -EFAULT;
4191 #endif
4192
4193         vcpu->arch.l1tf_flush_l1d = true;
4194         if (!flags) {
4195                 trace_kvm_page_fault(fault_address, error_code);
4196
4197                 if (kvm_event_needs_reinjection(vcpu))
4198                         kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4199                 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4200                                 insn_len);
4201         } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
4202                 vcpu->arch.apf.host_apf_flags = 0;
4203                 local_irq_disable();
4204                 kvm_async_pf_task_wait_schedule(fault_address);
4205                 local_irq_enable();
4206         } else {
4207                 WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
4208         }
4209
4210         return r;
4211 }
4212 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4213
4214 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
4215                        bool prefault)
4216 {
4217         int max_level;
4218
4219         for (max_level = KVM_MAX_HUGEPAGE_LEVEL;
4220              max_level > PG_LEVEL_4K;
4221              max_level--) {
4222                 int page_num = KVM_PAGES_PER_HPAGE(max_level);
4223                 gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
4224
4225                 if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
4226                         break;
4227         }
4228
4229         return direct_page_fault(vcpu, gpa, error_code, prefault,
4230                                  max_level, true);
4231 }
4232
4233 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4234                                    struct kvm_mmu *context)
4235 {
4236         context->page_fault = nonpaging_page_fault;
4237         context->gva_to_gpa = nonpaging_gva_to_gpa;
4238         context->sync_page = nonpaging_sync_page;
4239         context->invlpg = NULL;
4240         context->update_pte = nonpaging_update_pte;
4241         context->root_level = 0;
4242         context->shadow_root_level = PT32E_ROOT_LEVEL;
4243         context->direct_map = true;
4244         context->nx = false;
4245 }
4246
4247 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
4248                                   union kvm_mmu_page_role role)
4249 {
4250         return (role.direct || pgd == root->pgd) &&
4251                VALID_PAGE(root->hpa) && page_header(root->hpa) &&
4252                role.word == page_header(root->hpa)->role.word;
4253 }
4254
4255 /*
4256  * Find out if a previously cached root matching the new pgd/role is available.
4257  * The current root is also inserted into the cache.
4258  * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
4259  * returned.
4260  * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
4261  * false is returned. This root should now be freed by the caller.
4262  */
4263 static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd,
4264                                   union kvm_mmu_page_role new_role)
4265 {
4266         uint i;
4267         struct kvm_mmu_root_info root;
4268         struct kvm_mmu *mmu = vcpu->arch.mmu;
4269
4270         root.pgd = mmu->root_pgd;
4271         root.hpa = mmu->root_hpa;
4272
4273         if (is_root_usable(&root, new_pgd, new_role))
4274                 return true;
4275
4276         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4277                 swap(root, mmu->prev_roots[i]);
4278
4279                 if (is_root_usable(&root, new_pgd, new_role))
4280                         break;
4281         }
4282
4283         mmu->root_hpa = root.hpa;
4284         mmu->root_pgd = root.pgd;
4285
4286         return i < KVM_MMU_NUM_PREV_ROOTS;
4287 }
4288
4289 static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
4290                             union kvm_mmu_page_role new_role)
4291 {
4292         struct kvm_mmu *mmu = vcpu->arch.mmu;
4293
4294         /*
4295          * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
4296          * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4297          * later if necessary.
4298          */
4299         if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4300             mmu->root_level >= PT64_ROOT_4LEVEL)
4301                 return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) &&
4302                        cached_root_available(vcpu, new_pgd, new_role);
4303
4304         return false;
4305 }
4306
4307 static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
4308                               union kvm_mmu_page_role new_role,
4309                               bool skip_tlb_flush, bool skip_mmu_sync)
4310 {
4311         if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
4312                 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
4313                 return;
4314         }
4315
4316         /*
4317          * It's possible that the cached previous root page is obsolete because
4318          * of a change in the MMU generation number. However, changing the
4319          * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will
4320          * free the root set here and allocate a new one.
4321          */
4322         kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
4323
4324         if (!skip_mmu_sync || force_flush_and_sync_on_reuse)
4325                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4326         if (!skip_tlb_flush || force_flush_and_sync_on_reuse)
4327                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
4328
4329         /*
4330          * The last MMIO access's GVA and GPA are cached in the VCPU. When
4331          * switching to a new CR3, that GVA->GPA mapping may no longer be
4332          * valid. So clear any cached MMIO info even when we don't need to sync
4333          * the shadow page tables.
4334          */
4335         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4336
4337         __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa));
4338 }
4339
4340 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
4341                      bool skip_mmu_sync)
4342 {
4343         __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu),
4344                           skip_tlb_flush, skip_mmu_sync);
4345 }
4346 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
4347
4348 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4349 {
4350         return kvm_read_cr3(vcpu);
4351 }
4352
4353 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4354                            unsigned int access, int *nr_present)
4355 {
4356         if (unlikely(is_mmio_spte(*sptep))) {
4357                 if (gfn != get_mmio_spte_gfn(*sptep)) {
4358                         mmu_spte_clear_no_track(sptep);
4359                         return true;
4360                 }
4361
4362                 (*nr_present)++;
4363                 mark_mmio_spte(vcpu, sptep, gfn, access);
4364                 return true;
4365         }
4366
4367         return false;
4368 }
4369
4370 static inline bool is_last_gpte(struct kvm_mmu *mmu,
4371                                 unsigned level, unsigned gpte)
4372 {
4373         /*
4374          * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
4375          * If it is clear, there are no large pages at this level, so clear
4376          * PT_PAGE_SIZE_MASK in gpte if that is the case.
4377          */
4378         gpte &= level - mmu->last_nonleaf_level;
4379
4380         /*
4381          * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
4382          * iff level <= PG_LEVEL_4K, which for our purpose means
4383          * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
4384          */
4385         gpte |= level - PG_LEVEL_4K - 1;
4386
4387         return gpte & PT_PAGE_SIZE_MASK;
4388 }
4389
4390 #define PTTYPE_EPT 18 /* arbitrary */
4391 #define PTTYPE PTTYPE_EPT
4392 #include "paging_tmpl.h"
4393 #undef PTTYPE
4394
4395 #define PTTYPE 64
4396 #include "paging_tmpl.h"
4397 #undef PTTYPE
4398
4399 #define PTTYPE 32
4400 #include "paging_tmpl.h"
4401 #undef PTTYPE
4402
4403 static void
4404 __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4405                         struct rsvd_bits_validate *rsvd_check,
4406                         int maxphyaddr, int level, bool nx, bool gbpages,
4407                         bool pse, bool amd)
4408 {
4409         u64 exb_bit_rsvd = 0;
4410         u64 gbpages_bit_rsvd = 0;
4411         u64 nonleaf_bit8_rsvd = 0;
4412
4413         rsvd_check->bad_mt_xwr = 0;
4414
4415         if (!nx)
4416                 exb_bit_rsvd = rsvd_bits(63, 63);
4417         if (!gbpages)
4418                 gbpages_bit_rsvd = rsvd_bits(7, 7);
4419
4420         /*
4421          * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4422          * leaf entries) on AMD CPUs only.
4423          */
4424         if (amd)
4425                 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4426
4427         switch (level) {
4428         case PT32_ROOT_LEVEL:
4429                 /* no rsvd bits for 2 level 4K page table entries */
4430                 rsvd_check->rsvd_bits_mask[0][1] = 0;
4431                 rsvd_check->rsvd_bits_mask[0][0] = 0;
4432                 rsvd_check->rsvd_bits_mask[1][0] =
4433                         rsvd_check->rsvd_bits_mask[0][0];
4434
4435                 if (!pse) {
4436                         rsvd_check->rsvd_bits_mask[1][1] = 0;
4437                         break;
4438                 }
4439
4440                 if (is_cpuid_PSE36())
4441                         /* 36bits PSE 4MB page */
4442                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4443                 else
4444                         /* 32 bits PSE 4MB page */
4445                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4446                 break;
4447         case PT32E_ROOT_LEVEL:
4448                 rsvd_check->rsvd_bits_mask[0][2] =
4449                         rsvd_bits(maxphyaddr, 63) |
4450                         rsvd_bits(5, 8) | rsvd_bits(1, 2);      /* PDPTE */
4451                 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4452                         rsvd_bits(maxphyaddr, 62);      /* PDE */
4453                 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4454                         rsvd_bits(maxphyaddr, 62);      /* PTE */
4455                 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4456                         rsvd_bits(maxphyaddr, 62) |
4457                         rsvd_bits(13, 20);              /* large page */
4458                 rsvd_check->rsvd_bits_mask[1][0] =
4459                         rsvd_check->rsvd_bits_mask[0][0];
4460                 break;
4461         case PT64_ROOT_5LEVEL:
4462                 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4463                         nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4464                         rsvd_bits(maxphyaddr, 51);
4465                 rsvd_check->rsvd_bits_mask[1][4] =
4466                         rsvd_check->rsvd_bits_mask[0][4];
4467                 /* fall through */
4468         case PT64_ROOT_4LEVEL:
4469                 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4470                         nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4471                         rsvd_bits(maxphyaddr, 51);
4472                 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4473                         gbpages_bit_rsvd |
4474                         rsvd_bits(maxphyaddr, 51);
4475                 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4476                         rsvd_bits(maxphyaddr, 51);
4477                 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4478                         rsvd_bits(maxphyaddr, 51);
4479                 rsvd_check->rsvd_bits_mask[1][3] =
4480                         rsvd_check->rsvd_bits_mask[0][3];
4481                 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
4482                         gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
4483                         rsvd_bits(13, 29);
4484                 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4485                         rsvd_bits(maxphyaddr, 51) |
4486                         rsvd_bits(13, 20);              /* large page */
4487                 rsvd_check->rsvd_bits_mask[1][0] =
4488                         rsvd_check->rsvd_bits_mask[0][0];
4489                 break;
4490         }
4491 }
4492
4493 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4494                                   struct kvm_mmu *context)
4495 {
4496         __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4497                                 cpuid_maxphyaddr(vcpu), context->root_level,
4498                                 context->nx,
4499                                 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4500                                 is_pse(vcpu),
4501                                 guest_cpuid_is_amd_or_hygon(vcpu));
4502 }
4503
4504 static void
4505 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4506                             int maxphyaddr, bool execonly)
4507 {
4508         u64 bad_mt_xwr;
4509
4510         rsvd_check->rsvd_bits_mask[0][4] =
4511                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4512         rsvd_check->rsvd_bits_mask[0][3] =
4513                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4514         rsvd_check->rsvd_bits_mask[0][2] =
4515                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4516         rsvd_check->rsvd_bits_mask[0][1] =
4517                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4518         rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4519
4520         /* large page */
4521         rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4522         rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4523         rsvd_check->rsvd_bits_mask[1][2] =
4524                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
4525         rsvd_check->rsvd_bits_mask[1][1] =
4526                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
4527         rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4528
4529         bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
4530         bad_mt_xwr |= 0xFFull << (3 * 8);       /* bits 3..5 must not be 3 */
4531         bad_mt_xwr |= 0xFFull << (7 * 8);       /* bits 3..5 must not be 7 */
4532         bad_mt_xwr |= REPEAT_BYTE(1ull << 2);   /* bits 0..2 must not be 010 */
4533         bad_mt_xwr |= REPEAT_BYTE(1ull << 6);   /* bits 0..2 must not be 110 */
4534         if (!execonly) {
4535                 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4536                 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4537         }
4538         rsvd_check->bad_mt_xwr = bad_mt_xwr;
4539 }
4540
4541 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4542                 struct kvm_mmu *context, bool execonly)
4543 {
4544         __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4545                                     cpuid_maxphyaddr(vcpu), execonly);
4546 }
4547
4548 /*
4549  * the page table on host is the shadow page table for the page
4550  * table in guest or amd nested guest, its mmu features completely
4551  * follow the features in guest.
4552  */
4553 void
4554 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4555 {
4556         bool uses_nx = context->nx ||
4557                 context->mmu_role.base.smep_andnot_wp;
4558         struct rsvd_bits_validate *shadow_zero_check;
4559         int i;
4560
4561         /*
4562          * Passing "true" to the last argument is okay; it adds a check
4563          * on bit 8 of the SPTEs which KVM doesn't use anyway.
4564          */
4565         shadow_zero_check = &context->shadow_zero_check;
4566         __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4567                                 shadow_phys_bits,
4568                                 context->shadow_root_level, uses_nx,
4569                                 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4570                                 is_pse(vcpu), true);
4571
4572         if (!shadow_me_mask)
4573                 return;
4574
4575         for (i = context->shadow_root_level; --i >= 0;) {
4576                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4577                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4578         }
4579
4580 }
4581 EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4582
4583 static inline bool boot_cpu_is_amd(void)
4584 {
4585         WARN_ON_ONCE(!tdp_enabled);
4586         return shadow_x_mask == 0;
4587 }
4588
4589 /*
4590  * the direct page table on host, use as much mmu features as
4591  * possible, however, kvm currently does not do execution-protection.
4592  */
4593 static void
4594 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4595                                 struct kvm_mmu *context)
4596 {
4597         struct rsvd_bits_validate *shadow_zero_check;
4598         int i;
4599
4600         shadow_zero_check = &context->shadow_zero_check;
4601
4602         if (boot_cpu_is_amd())
4603                 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4604                                         shadow_phys_bits,
4605                                         context->shadow_root_level, false,
4606                                         boot_cpu_has(X86_FEATURE_GBPAGES),
4607                                         true, true);
4608         else
4609                 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4610                                             shadow_phys_bits,
4611                                             false);
4612
4613         if (!shadow_me_mask)
4614                 return;
4615
4616         for (i = context->shadow_root_level; --i >= 0;) {
4617                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4618                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4619         }
4620 }
4621
4622 /*
4623  * as the comments in reset_shadow_zero_bits_mask() except it
4624  * is the shadow page table for intel nested guest.
4625  */
4626 static void
4627 reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4628                                 struct kvm_mmu *context, bool execonly)
4629 {
4630         __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4631                                     shadow_phys_bits, execonly);
4632 }
4633
4634 #define BYTE_MASK(access) \
4635         ((1 & (access) ? 2 : 0) | \
4636          (2 & (access) ? 4 : 0) | \
4637          (3 & (access) ? 8 : 0) | \
4638          (4 & (access) ? 16 : 0) | \
4639          (5 & (access) ? 32 : 0) | \
4640          (6 & (access) ? 64 : 0) | \
4641          (7 & (access) ? 128 : 0))
4642
4643
4644 static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4645                                       struct kvm_mmu *mmu, bool ept)
4646 {
4647         unsigned byte;
4648
4649         const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4650         const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4651         const u8 u = BYTE_MASK(ACC_USER_MASK);
4652
4653         bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4654         bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4655         bool cr0_wp = is_write_protection(vcpu);
4656
4657         for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4658                 unsigned pfec = byte << 1;
4659
4660                 /*
4661                  * Each "*f" variable has a 1 bit for each UWX value
4662                  * that causes a fault with the given PFEC.
4663                  */
4664
4665                 /* Faults from writes to non-writable pages */
4666                 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4667                 /* Faults from user mode accesses to supervisor pages */
4668                 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4669                 /* Faults from fetches of non-executable pages*/
4670                 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4671                 /* Faults from kernel mode fetches of user pages */
4672                 u8 smepf = 0;
4673                 /* Faults from kernel mode accesses of user pages */
4674                 u8 smapf = 0;
4675
4676                 if (!ept) {
4677                         /* Faults from kernel mode accesses to user pages */
4678                         u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4679
4680                         /* Not really needed: !nx will cause pte.nx to fault */
4681                         if (!mmu->nx)
4682                                 ff = 0;
4683
4684                         /* Allow supervisor writes if !cr0.wp */
4685                         if (!cr0_wp)
4686                                 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4687
4688                         /* Disallow supervisor fetches of user code if cr4.smep */
4689                         if (cr4_smep)
4690                                 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4691
4692                         /*
4693                          * SMAP:kernel-mode data accesses from user-mode
4694                          * mappings should fault. A fault is considered
4695                          * as a SMAP violation if all of the following
4696                          * conditions are true:
4697                          *   - X86_CR4_SMAP is set in CR4
4698                          *   - A user page is accessed
4699                          *   - The access is not a fetch
4700                          *   - Page fault in kernel mode
4701                          *   - if CPL = 3 or X86_EFLAGS_AC is clear
4702                          *
4703                          * Here, we cover the first three conditions.
4704                          * The fourth is computed dynamically in permission_fault();
4705                          * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4706                          * *not* subject to SMAP restrictions.
4707                          */
4708                         if (cr4_smap)
4709                                 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4710                 }
4711
4712                 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4713         }
4714 }
4715
4716 /*
4717 * PKU is an additional mechanism by which the paging controls access to
4718 * user-mode addresses based on the value in the PKRU register.  Protection
4719 * key violations are reported through a bit in the page fault error code.
4720 * Unlike other bits of the error code, the PK bit is not known at the
4721 * call site of e.g. gva_to_gpa; it must be computed directly in
4722 * permission_fault based on two bits of PKRU, on some machine state (CR4,
4723 * CR0, EFER, CPL), and on other bits of the error code and the page tables.
4724 *
4725 * In particular the following conditions come from the error code, the
4726 * page tables and the machine state:
4727 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4728 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4729 * - PK is always zero if U=0 in the page tables
4730 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4731 *
4732 * The PKRU bitmask caches the result of these four conditions.  The error
4733 * code (minus the P bit) and the page table's U bit form an index into the
4734 * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
4735 * with the two bits of the PKRU register corresponding to the protection key.
4736 * For the first three conditions above the bits will be 00, thus masking
4737 * away both AD and WD.  For all reads or if the last condition holds, WD
4738 * only will be masked away.
4739 */
4740 static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4741                                 bool ept)
4742 {
4743         unsigned bit;
4744         bool wp;
4745
4746         if (ept) {
4747                 mmu->pkru_mask = 0;
4748                 return;
4749         }
4750
4751         /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
4752         if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
4753                 mmu->pkru_mask = 0;
4754                 return;
4755         }
4756
4757         wp = is_write_protection(vcpu);
4758
4759         for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4760                 unsigned pfec, pkey_bits;
4761                 bool check_pkey, check_write, ff, uf, wf, pte_user;
4762
4763                 pfec = bit << 1;
4764                 ff = pfec & PFERR_FETCH_MASK;
4765                 uf = pfec & PFERR_USER_MASK;
4766                 wf = pfec & PFERR_WRITE_MASK;
4767
4768                 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4769                 pte_user = pfec & PFERR_RSVD_MASK;
4770
4771                 /*
4772                  * Only need to check the access which is not an
4773                  * instruction fetch and is to a user page.
4774                  */
4775                 check_pkey = (!ff && pte_user);
4776                 /*
4777                  * write access is controlled by PKRU if it is a
4778                  * user access or CR0.WP = 1.
4779                  */
4780                 check_write = check_pkey && wf && (uf || wp);
4781
4782                 /* PKRU.AD stops both read and write access. */
4783                 pkey_bits = !!check_pkey;
4784                 /* PKRU.WD stops write access. */
4785                 pkey_bits |= (!!check_write) << 1;
4786
4787                 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4788         }
4789 }
4790
4791 static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
4792 {
4793         unsigned root_level = mmu->root_level;
4794
4795         mmu->last_nonleaf_level = root_level;
4796         if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
4797                 mmu->last_nonleaf_level++;
4798 }
4799
4800 static void paging64_init_context_common(struct kvm_vcpu *vcpu,
4801                                          struct kvm_mmu *context,
4802                                          int level)
4803 {
4804         context->nx = is_nx(vcpu);
4805         context->root_level = level;
4806
4807         reset_rsvds_bits_mask(vcpu, context);
4808         update_permission_bitmask(vcpu, context, false);
4809         update_pkru_bitmask(vcpu, context, false);
4810         update_last_nonleaf_level(vcpu, context);
4811
4812         MMU_WARN_ON(!is_pae(vcpu));
4813         context->page_fault = paging64_page_fault;
4814         context->gva_to_gpa = paging64_gva_to_gpa;
4815         context->sync_page = paging64_sync_page;
4816         context->invlpg = paging64_invlpg;
4817         context->update_pte = paging64_update_pte;
4818         context->shadow_root_level = level;
4819         context->direct_map = false;
4820 }
4821
4822 static void paging64_init_context(struct kvm_vcpu *vcpu,
4823                                   struct kvm_mmu *context)
4824 {
4825         int root_level = is_la57_mode(vcpu) ?
4826                          PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4827
4828         paging64_init_context_common(vcpu, context, root_level);
4829 }
4830
4831 static void paging32_init_context(struct kvm_vcpu *vcpu,
4832                                   struct kvm_mmu *context)
4833 {
4834         context->nx = false;
4835         context->root_level = PT32_ROOT_LEVEL;
4836
4837         reset_rsvds_bits_mask(vcpu, context);
4838         update_permission_bitmask(vcpu, context, false);
4839         update_pkru_bitmask(vcpu, context, false);
4840         update_last_nonleaf_level(vcpu, context);
4841
4842         context->page_fault = paging32_page_fault;
4843         context->gva_to_gpa = paging32_gva_to_gpa;
4844         context->sync_page = paging32_sync_page;
4845         context->invlpg = paging32_invlpg;
4846         context->update_pte = paging32_update_pte;
4847         context->shadow_root_level = PT32E_ROOT_LEVEL;
4848         context->direct_map = false;
4849 }
4850
4851 static void paging32E_init_context(struct kvm_vcpu *vcpu,
4852                                    struct kvm_mmu *context)
4853 {
4854         paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
4855 }
4856
4857 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
4858 {
4859         union kvm_mmu_extended_role ext = {0};
4860
4861         ext.cr0_pg = !!is_paging(vcpu);
4862         ext.cr4_pae = !!is_pae(vcpu);
4863         ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
4864         ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
4865         ext.cr4_pse = !!is_pse(vcpu);
4866         ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
4867         ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
4868
4869         ext.valid = 1;
4870
4871         return ext;
4872 }
4873
4874 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4875                                                    bool base_only)
4876 {
4877         union kvm_mmu_role role = {0};
4878
4879         role.base.access = ACC_ALL;
4880         role.base.nxe = !!is_nx(vcpu);
4881         role.base.cr0_wp = is_write_protection(vcpu);
4882         role.base.smm = is_smm(vcpu);
4883         role.base.guest_mode = is_guest_mode(vcpu);
4884
4885         if (base_only)
4886                 return role;
4887
4888         role.ext = kvm_calc_mmu_role_ext(vcpu);
4889
4890         return role;
4891 }
4892
4893 static union kvm_mmu_role
4894 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4895 {
4896         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4897
4898         role.base.ad_disabled = (shadow_accessed_mask == 0);
4899         role.base.level = vcpu->arch.tdp_level;
4900         role.base.direct = true;
4901         role.base.gpte_is_8_bytes = true;
4902
4903         return role;
4904 }
4905
4906 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
4907 {
4908         struct kvm_mmu *context = vcpu->arch.mmu;
4909         union kvm_mmu_role new_role =
4910                 kvm_calc_tdp_mmu_root_page_role(vcpu, false);
4911
4912         if (new_role.as_u64 == context->mmu_role.as_u64)
4913                 return;
4914
4915         context->mmu_role.as_u64 = new_role.as_u64;
4916         context->page_fault = kvm_tdp_page_fault;
4917         context->sync_page = nonpaging_sync_page;
4918         context->invlpg = NULL;
4919         context->update_pte = nonpaging_update_pte;
4920         context->shadow_root_level = vcpu->arch.tdp_level;
4921         context->direct_map = true;
4922         context->get_guest_pgd = get_cr3;
4923         context->get_pdptr = kvm_pdptr_read;
4924         context->inject_page_fault = kvm_inject_page_fault;
4925
4926         if (!is_paging(vcpu)) {
4927                 context->nx = false;
4928                 context->gva_to_gpa = nonpaging_gva_to_gpa;
4929                 context->root_level = 0;
4930         } else if (is_long_mode(vcpu)) {
4931                 context->nx = is_nx(vcpu);
4932                 context->root_level = is_la57_mode(vcpu) ?
4933                                 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4934                 reset_rsvds_bits_mask(vcpu, context);
4935                 context->gva_to_gpa = paging64_gva_to_gpa;
4936         } else if (is_pae(vcpu)) {
4937                 context->nx = is_nx(vcpu);
4938                 context->root_level = PT32E_ROOT_LEVEL;
4939                 reset_rsvds_bits_mask(vcpu, context);
4940                 context->gva_to_gpa = paging64_gva_to_gpa;
4941         } else {
4942                 context->nx = false;
4943                 context->root_level = PT32_ROOT_LEVEL;
4944                 reset_rsvds_bits_mask(vcpu, context);
4945                 context->gva_to_gpa = paging32_gva_to_gpa;
4946         }
4947
4948         update_permission_bitmask(vcpu, context, false);
4949         update_pkru_bitmask(vcpu, context, false);
4950         update_last_nonleaf_level(vcpu, context);
4951         reset_tdp_shadow_zero_bits_mask(vcpu, context);
4952 }
4953
4954 static union kvm_mmu_role
4955 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4956 {
4957         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4958
4959         role.base.smep_andnot_wp = role.ext.cr4_smep &&
4960                 !is_write_protection(vcpu);
4961         role.base.smap_andnot_wp = role.ext.cr4_smap &&
4962                 !is_write_protection(vcpu);
4963         role.base.direct = !is_paging(vcpu);
4964         role.base.gpte_is_8_bytes = !!is_pae(vcpu);
4965
4966         if (!is_long_mode(vcpu))
4967                 role.base.level = PT32E_ROOT_LEVEL;
4968         else if (is_la57_mode(vcpu))
4969                 role.base.level = PT64_ROOT_5LEVEL;
4970         else
4971                 role.base.level = PT64_ROOT_4LEVEL;
4972
4973         return role;
4974 }
4975
4976 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
4977 {
4978         struct kvm_mmu *context = vcpu->arch.mmu;
4979         union kvm_mmu_role new_role =
4980                 kvm_calc_shadow_mmu_root_page_role(vcpu, false);
4981
4982         if (new_role.as_u64 == context->mmu_role.as_u64)
4983                 return;
4984
4985         if (!(cr0 & X86_CR0_PG))
4986                 nonpaging_init_context(vcpu, context);
4987         else if (efer & EFER_LMA)
4988                 paging64_init_context(vcpu, context);
4989         else if (cr4 & X86_CR4_PAE)
4990                 paging32E_init_context(vcpu, context);
4991         else
4992                 paging32_init_context(vcpu, context);
4993
4994         context->mmu_role.as_u64 = new_role.as_u64;
4995         reset_shadow_zero_bits_mask(vcpu, context);
4996 }
4997 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
4998
4999 static union kvm_mmu_role
5000 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
5001                                    bool execonly, u8 level)
5002 {
5003         union kvm_mmu_role role = {0};
5004
5005         /* SMM flag is inherited from root_mmu */
5006         role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
5007
5008         role.base.level = level;
5009         role.base.gpte_is_8_bytes = true;
5010         role.base.direct = false;
5011         role.base.ad_disabled = !accessed_dirty;
5012         role.base.guest_mode = true;
5013         role.base.access = ACC_ALL;
5014
5015         /*
5016          * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
5017          * SMAP variation to denote shadow EPT entries.
5018          */
5019         role.base.cr0_wp = true;
5020         role.base.smap_andnot_wp = true;
5021
5022         role.ext = kvm_calc_mmu_role_ext(vcpu);
5023         role.ext.execonly = execonly;
5024
5025         return role;
5026 }
5027
5028 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
5029                              bool accessed_dirty, gpa_t new_eptp)
5030 {
5031         struct kvm_mmu *context = vcpu->arch.mmu;
5032         u8 level = vmx_eptp_page_walk_level(new_eptp);
5033         union kvm_mmu_role new_role =
5034                 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5035                                                    execonly, level);
5036
5037         __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true);
5038
5039         if (new_role.as_u64 == context->mmu_role.as_u64)
5040                 return;
5041
5042         context->shadow_root_level = level;
5043
5044         context->nx = true;
5045         context->ept_ad = accessed_dirty;
5046         context->page_fault = ept_page_fault;
5047         context->gva_to_gpa = ept_gva_to_gpa;
5048         context->sync_page = ept_sync_page;
5049         context->invlpg = ept_invlpg;
5050         context->update_pte = ept_update_pte;
5051         context->root_level = level;
5052         context->direct_map = false;
5053         context->mmu_role.as_u64 = new_role.as_u64;
5054
5055         update_permission_bitmask(vcpu, context, true);
5056         update_pkru_bitmask(vcpu, context, true);
5057         update_last_nonleaf_level(vcpu, context);
5058         reset_rsvds_bits_mask_ept(vcpu, context, execonly);
5059         reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
5060 }
5061 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5062
5063 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
5064 {
5065         struct kvm_mmu *context = vcpu->arch.mmu;
5066
5067         kvm_init_shadow_mmu(vcpu,
5068                             kvm_read_cr0_bits(vcpu, X86_CR0_PG),
5069                             kvm_read_cr4_bits(vcpu, X86_CR4_PAE),
5070                             vcpu->arch.efer);
5071
5072         context->get_guest_pgd     = get_cr3;
5073         context->get_pdptr         = kvm_pdptr_read;
5074         context->inject_page_fault = kvm_inject_page_fault;
5075 }
5076
5077 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
5078 {
5079         union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
5080         struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5081
5082         if (new_role.as_u64 == g_context->mmu_role.as_u64)
5083                 return;
5084
5085         g_context->mmu_role.as_u64 = new_role.as_u64;
5086         g_context->get_guest_pgd     = get_cr3;
5087         g_context->get_pdptr         = kvm_pdptr_read;
5088         g_context->inject_page_fault = kvm_inject_page_fault;
5089
5090         /*
5091          * L2 page tables are never shadowed, so there is no need to sync
5092          * SPTEs.
5093          */
5094         g_context->invlpg            = NULL;
5095
5096         /*
5097          * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5098          * L1's nested page tables (e.g. EPT12). The nested translation
5099          * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5100          * L2's page tables as the first level of translation and L1's
5101          * nested page tables as the second level of translation. Basically
5102          * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5103          */
5104         if (!is_paging(vcpu)) {
5105                 g_context->nx = false;
5106                 g_context->root_level = 0;
5107                 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
5108         } else if (is_long_mode(vcpu)) {
5109                 g_context->nx = is_nx(vcpu);
5110                 g_context->root_level = is_la57_mode(vcpu) ?
5111                                         PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5112                 reset_rsvds_bits_mask(vcpu, g_context);
5113                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5114         } else if (is_pae(vcpu)) {
5115                 g_context->nx = is_nx(vcpu);
5116                 g_context->root_level = PT32E_ROOT_LEVEL;
5117                 reset_rsvds_bits_mask(vcpu, g_context);
5118                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5119         } else {
5120                 g_context->nx = false;
5121                 g_context->root_level = PT32_ROOT_LEVEL;
5122                 reset_rsvds_bits_mask(vcpu, g_context);
5123                 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
5124         }
5125
5126         update_permission_bitmask(vcpu, g_context, false);
5127         update_pkru_bitmask(vcpu, g_context, false);
5128         update_last_nonleaf_level(vcpu, g_context);
5129 }
5130
5131 void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
5132 {
5133         if (reset_roots) {
5134                 uint i;
5135
5136                 vcpu->arch.mmu->root_hpa = INVALID_PAGE;
5137
5138                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5139                         vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5140         }
5141
5142         if (mmu_is_nested(vcpu))
5143                 init_kvm_nested_mmu(vcpu);
5144         else if (tdp_enabled)
5145                 init_kvm_tdp_mmu(vcpu);
5146         else
5147                 init_kvm_softmmu(vcpu);
5148 }
5149 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5150
5151 static union kvm_mmu_page_role
5152 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
5153 {
5154         union kvm_mmu_role role;
5155
5156         if (tdp_enabled)
5157                 role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
5158         else
5159                 role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
5160
5161         return role.base;
5162 }
5163
5164 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5165 {
5166         kvm_mmu_unload(vcpu);
5167         kvm_init_mmu(vcpu, true);
5168 }
5169 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5170
5171 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5172 {
5173         int r;
5174
5175         r = mmu_topup_memory_caches(vcpu);
5176         if (r)
5177                 goto out;
5178         r = mmu_alloc_roots(vcpu);
5179         kvm_mmu_sync_roots(vcpu);
5180         if (r)
5181                 goto out;
5182         kvm_mmu_load_pgd(vcpu);
5183         kvm_x86_ops.tlb_flush_current(vcpu);
5184 out:
5185         return r;
5186 }
5187 EXPORT_SYMBOL_GPL(kvm_mmu_load);
5188
5189 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5190 {
5191         kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5192         WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
5193         kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5194         WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
5195 }
5196 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
5197
5198 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
5199                                   struct kvm_mmu_page *sp, u64 *spte,
5200                                   const void *new)
5201 {
5202         if (sp->role.level != PG_LEVEL_4K) {
5203                 ++vcpu->kvm->stat.mmu_pde_zapped;
5204                 return;
5205         }
5206
5207         ++vcpu->kvm->stat.mmu_pte_updated;
5208         vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
5209 }
5210
5211 static bool need_remote_flush(u64 old, u64 new)
5212 {
5213         if (!is_shadow_present_pte(old))
5214                 return false;
5215         if (!is_shadow_present_pte(new))
5216                 return true;
5217         if ((old ^ new) & PT64_BASE_ADDR_MASK)
5218                 return true;
5219         old ^= shadow_nx_mask;
5220         new ^= shadow_nx_mask;
5221         return (old & ~new & PT64_PERM_MASK) != 0;
5222 }
5223
5224 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5225                                     int *bytes)
5226 {
5227         u64 gentry = 0;
5228         int r;
5229
5230         /*
5231          * Assume that the pte write on a page table of the same type
5232          * as the current vcpu paging mode since we update the sptes only
5233          * when they have the same mode.
5234          */
5235         if (is_pae(vcpu) && *bytes == 4) {
5236                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5237                 *gpa &= ~(gpa_t)7;
5238                 *bytes = 8;
5239         }
5240
5241         if (*bytes == 4 || *bytes == 8) {
5242                 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5243                 if (r)
5244                         gentry = 0;
5245         }
5246
5247         return gentry;
5248 }
5249
5250 /*
5251  * If we're seeing too many writes to a page, it may no longer be a page table,
5252  * or we may be forking, in which case it is better to unmap the page.
5253  */
5254 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5255 {
5256         /*
5257          * Skip write-flooding detected for the sp whose level is 1, because
5258          * it can become unsync, then the guest page is not write-protected.
5259          */
5260         if (sp->role.level == PG_LEVEL_4K)
5261                 return false;
5262
5263         atomic_inc(&sp->write_flooding_count);
5264         return atomic_read(&sp->write_flooding_count) >= 3;
5265 }
5266
5267 /*
5268  * Misaligned accesses are too much trouble to fix up; also, they usually
5269  * indicate a page is not used as a page table.
5270  */
5271 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5272                                     int bytes)
5273 {
5274         unsigned offset, pte_size, misaligned;
5275
5276         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5277                  gpa, bytes, sp->role.word);
5278
5279         offset = offset_in_page(gpa);
5280         pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
5281
5282         /*
5283          * Sometimes, the OS only writes the last one bytes to update status
5284          * bits, for example, in linux, andb instruction is used in clear_bit().
5285          */
5286         if (!(offset & (pte_size - 1)) && bytes == 1)
5287                 return false;
5288
5289         misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5290         misaligned |= bytes < 4;
5291
5292         return misaligned;
5293 }
5294
5295 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5296 {
5297         unsigned page_offset, quadrant;
5298         u64 *spte;
5299         int level;
5300
5301         page_offset = offset_in_page(gpa);
5302         level = sp->role.level;
5303         *nspte = 1;
5304         if (!sp->role.gpte_is_8_bytes) {
5305                 page_offset <<= 1;      /* 32->64 */
5306                 /*
5307                  * A 32-bit pde maps 4MB while the shadow pdes map
5308                  * only 2MB.  So we need to double the offset again
5309                  * and zap two pdes instead of one.
5310                  */
5311                 if (level == PT32_ROOT_LEVEL) {
5312                         page_offset &= ~7; /* kill rounding error */
5313                         page_offset <<= 1;
5314                         *nspte = 2;
5315                 }
5316                 quadrant = page_offset >> PAGE_SHIFT;
5317                 page_offset &= ~PAGE_MASK;
5318                 if (quadrant != sp->role.quadrant)
5319                         return NULL;
5320         }
5321
5322         spte = &sp->spt[page_offset / sizeof(*spte)];
5323         return spte;
5324 }
5325
5326 /*
5327  * Ignore various flags when determining if a SPTE can be immediately
5328  * overwritten for the current MMU.
5329  *  - level: explicitly checked in mmu_pte_write_new_pte(), and will never
5330  *    match the current MMU role, as MMU's level tracks the root level.
5331  *  - access: updated based on the new guest PTE
5332  *  - quadrant: handled by get_written_sptes()
5333  *  - invalid: always false (loop only walks valid shadow pages)
5334  */
5335 static const union kvm_mmu_page_role role_ign = {
5336         .level = 0xf,
5337         .access = 0x7,
5338         .quadrant = 0x3,
5339         .invalid = 0x1,
5340 };
5341
5342 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5343                               const u8 *new, int bytes,
5344                               struct kvm_page_track_notifier_node *node)
5345 {
5346         gfn_t gfn = gpa >> PAGE_SHIFT;
5347         struct kvm_mmu_page *sp;
5348         LIST_HEAD(invalid_list);
5349         u64 entry, gentry, *spte;
5350         int npte;
5351         bool remote_flush, local_flush;
5352
5353         /*
5354          * If we don't have indirect shadow pages, it means no page is
5355          * write-protected, so we can exit simply.
5356          */
5357         if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5358                 return;
5359
5360         remote_flush = local_flush = false;
5361
5362         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5363
5364         /*
5365          * No need to care whether allocation memory is successful
5366          * or not since pte prefetch is skiped if it does not have
5367          * enough objects in the cache.
5368          */
5369         mmu_topup_memory_caches(vcpu);
5370
5371         spin_lock(&vcpu->kvm->mmu_lock);
5372
5373         gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5374
5375         ++vcpu->kvm->stat.mmu_pte_write;
5376         kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
5377
5378         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5379                 if (detect_write_misaligned(sp, gpa, bytes) ||
5380                       detect_write_flooding(sp)) {
5381                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5382                         ++vcpu->kvm->stat.mmu_flooded;
5383                         continue;
5384                 }
5385
5386                 spte = get_written_sptes(sp, gpa, &npte);
5387                 if (!spte)
5388                         continue;
5389
5390                 local_flush = true;
5391                 while (npte--) {
5392                         u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
5393
5394                         entry = *spte;
5395                         mmu_page_zap_pte(vcpu->kvm, sp, spte);
5396                         if (gentry &&
5397                             !((sp->role.word ^ base_role) & ~role_ign.word) &&
5398                             rmap_can_add(vcpu))
5399                                 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
5400                         if (need_remote_flush(entry, *spte))
5401                                 remote_flush = true;
5402                         ++spte;
5403                 }
5404         }
5405         kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
5406         kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
5407         spin_unlock(&vcpu->kvm->mmu_lock);
5408 }
5409
5410 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
5411 {
5412         gpa_t gpa;
5413         int r;
5414
5415         if (vcpu->arch.mmu->direct_map)
5416                 return 0;
5417
5418         gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
5419
5420         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
5421
5422         return r;
5423 }
5424 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
5425
5426 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5427                        void *insn, int insn_len)
5428 {
5429         int r, emulation_type = EMULTYPE_PF;
5430         bool direct = vcpu->arch.mmu->direct_map;
5431
5432         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
5433                 return RET_PF_RETRY;
5434
5435         r = RET_PF_INVALID;
5436         if (unlikely(error_code & PFERR_RSVD_MASK)) {
5437                 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5438                 if (r == RET_PF_EMULATE)
5439                         goto emulate;
5440         }
5441
5442         if (r == RET_PF_INVALID) {
5443                 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5444                                           lower_32_bits(error_code), false);
5445                 WARN_ON(r == RET_PF_INVALID);
5446         }
5447
5448         if (r == RET_PF_RETRY)
5449                 return 1;
5450         if (r < 0)
5451                 return r;
5452
5453         /*
5454          * Before emulating the instruction, check if the error code
5455          * was due to a RO violation while translating the guest page.
5456          * This can occur when using nested virtualization with nested
5457          * paging in both guests. If true, we simply unprotect the page
5458          * and resume the guest.
5459          */
5460         if (vcpu->arch.mmu->direct_map &&
5461             (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5462                 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5463                 return 1;
5464         }
5465
5466         /*
5467          * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5468          * optimistically try to just unprotect the page and let the processor
5469          * re-execute the instruction that caused the page fault.  Do not allow
5470          * retrying MMIO emulation, as it's not only pointless but could also
5471          * cause us to enter an infinite loop because the processor will keep
5472          * faulting on the non-existent MMIO address.  Retrying an instruction
5473          * from a nested guest is also pointless and dangerous as we are only
5474          * explicitly shadowing L1's page tables, i.e. unprotecting something
5475          * for L1 isn't going to magically fix whatever issue cause L2 to fail.
5476          */
5477         if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5478                 emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
5479 emulate:
5480         /*
5481          * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
5482          * This can happen if a guest gets a page-fault on data access but the HW
5483          * table walker is not able to read the instruction page (e.g instruction
5484          * page is not present in memory). In those cases we simply restart the
5485          * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
5486          */
5487         if (unlikely(insn && !insn_len)) {
5488                 if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu))
5489                         return 1;
5490         }
5491
5492         return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5493                                        insn_len);
5494 }
5495 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5496
5497 void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5498                             gva_t gva, hpa_t root_hpa)
5499 {
5500         int i;
5501
5502         /* It's actually a GPA for vcpu->arch.guest_mmu.  */
5503         if (mmu != &vcpu->arch.guest_mmu) {
5504                 /* INVLPG on a non-canonical address is a NOP according to the SDM.  */
5505                 if (is_noncanonical_address(gva, vcpu))
5506                         return;
5507
5508                 kvm_x86_ops.tlb_flush_gva(vcpu, gva);
5509         }
5510
5511         if (!mmu->invlpg)
5512                 return;
5513
5514         if (root_hpa == INVALID_PAGE) {
5515                 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5516
5517                 /*
5518                  * INVLPG is required to invalidate any global mappings for the VA,
5519                  * irrespective of PCID. Since it would take us roughly similar amount
5520                  * of work to determine whether any of the prev_root mappings of the VA
5521                  * is marked global, or to just sync it blindly, so we might as well
5522                  * just always sync it.
5523                  *
5524                  * Mappings not reachable via the current cr3 or the prev_roots will be
5525                  * synced when switching to that cr3, so nothing needs to be done here
5526                  * for them.
5527                  */
5528                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5529                         if (VALID_PAGE(mmu->prev_roots[i].hpa))
5530                                 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5531         } else {
5532                 mmu->invlpg(vcpu, gva, root_hpa);
5533         }
5534 }
5535 EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva);
5536
5537 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5538 {
5539         kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);
5540         ++vcpu->stat.invlpg;
5541 }
5542 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5543
5544
5545 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5546 {
5547         struct kvm_mmu *mmu = vcpu->arch.mmu;
5548         bool tlb_flush = false;
5549         uint i;
5550
5551         if (pcid == kvm_get_active_pcid(vcpu)) {
5552                 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5553                 tlb_flush = true;
5554         }
5555
5556         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5557                 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5558                     pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
5559                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5560                         tlb_flush = true;
5561                 }
5562         }
5563
5564         if (tlb_flush)
5565                 kvm_x86_ops.tlb_flush_gva(vcpu, gva);
5566
5567         ++vcpu->stat.invlpg;
5568
5569         /*
5570          * Mappings not reachable via the current cr3 or the prev_roots will be
5571          * synced when switching to that cr3, so nothing needs to be done here
5572          * for them.
5573          */
5574 }
5575 EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
5576
5577 void kvm_configure_mmu(bool enable_tdp, int tdp_page_level)
5578 {
5579         tdp_enabled = enable_tdp;
5580
5581         /*
5582          * max_page_level reflects the capabilities of KVM's MMU irrespective
5583          * of kernel support, e.g. KVM may be capable of using 1GB pages when
5584          * the kernel is not.  But, KVM never creates a page size greater than
5585          * what is used by the kernel for any given HVA, i.e. the kernel's
5586          * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
5587          */
5588         if (tdp_enabled)
5589                 max_page_level = tdp_page_level;
5590         else if (boot_cpu_has(X86_FEATURE_GBPAGES))
5591                 max_page_level = PG_LEVEL_1G;
5592         else
5593                 max_page_level = PG_LEVEL_2M;
5594 }
5595 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
5596
5597 /* The return value indicates if tlb flush on all vcpus is needed. */
5598 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
5599
5600 /* The caller should hold mmu-lock before calling this function. */
5601 static __always_inline bool
5602 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5603                         slot_level_handler fn, int start_level, int end_level,
5604                         gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5605 {
5606         struct slot_rmap_walk_iterator iterator;
5607         bool flush = false;
5608
5609         for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5610                         end_gfn, &iterator) {
5611                 if (iterator.rmap)
5612                         flush |= fn(kvm, iterator.rmap);
5613
5614                 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5615                         if (flush && lock_flush_tlb) {
5616                                 kvm_flush_remote_tlbs_with_address(kvm,
5617                                                 start_gfn,
5618                                                 iterator.gfn - start_gfn + 1);
5619                                 flush = false;
5620                         }
5621                         cond_resched_lock(&kvm->mmu_lock);
5622                 }
5623         }
5624
5625         if (flush && lock_flush_tlb) {
5626                 kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
5627                                                    end_gfn - start_gfn + 1);
5628                 flush = false;
5629         }
5630
5631         return flush;
5632 }
5633
5634 static __always_inline bool
5635 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5636                   slot_level_handler fn, int start_level, int end_level,
5637                   bool lock_flush_tlb)
5638 {
5639         return slot_handle_level_range(kvm, memslot, fn, start_level,
5640                         end_level, memslot->base_gfn,
5641                         memslot->base_gfn + memslot->npages - 1,
5642                         lock_flush_tlb);
5643 }
5644
5645 static __always_inline bool
5646 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5647                       slot_level_handler fn, bool lock_flush_tlb)
5648 {
5649         return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
5650                                  KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5651 }
5652
5653 static __always_inline bool
5654 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5655                         slot_level_handler fn, bool lock_flush_tlb)
5656 {
5657         return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1,
5658                                  KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5659 }
5660
5661 static __always_inline bool
5662 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5663                  slot_level_handler fn, bool lock_flush_tlb)
5664 {
5665         return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
5666                                  PG_LEVEL_4K, lock_flush_tlb);
5667 }
5668
5669 static void free_mmu_pages(struct kvm_mmu *mmu)
5670 {
5671         free_page((unsigned long)mmu->pae_root);
5672         free_page((unsigned long)mmu->lm_root);
5673 }
5674
5675 static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5676 {
5677         struct page *page;
5678         int i;
5679
5680         /*
5681          * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5682          * while the PDP table is a per-vCPU construct that's allocated at MMU
5683          * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
5684          * x86_64.  Therefore we need to allocate the PDP table in the first
5685          * 4GB of memory, which happens to fit the DMA32 zone.  Except for
5686          * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
5687          * skip allocating the PDP table.
5688          */
5689         if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL)
5690                 return 0;
5691
5692         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5693         if (!page)
5694                 return -ENOMEM;
5695
5696         mmu->pae_root = page_address(page);
5697         for (i = 0; i < 4; ++i)
5698                 mmu->pae_root[i] = INVALID_PAGE;
5699
5700         return 0;
5701 }
5702
5703 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5704 {
5705         uint i;
5706         int ret;
5707
5708         vcpu->arch.mmu = &vcpu->arch.root_mmu;
5709         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5710
5711         vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
5712         vcpu->arch.root_mmu.root_pgd = 0;
5713         vcpu->arch.root_mmu.translate_gpa = translate_gpa;
5714         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5715                 vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5716
5717         vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
5718         vcpu->arch.guest_mmu.root_pgd = 0;
5719         vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
5720         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5721                 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5722
5723         vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5724
5725         ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu);
5726         if (ret)
5727                 return ret;
5728
5729         ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu);
5730         if (ret)
5731                 goto fail_allocate_root;
5732
5733         return ret;
5734  fail_allocate_root:
5735         free_mmu_pages(&vcpu->arch.guest_mmu);
5736         return ret;
5737 }
5738
5739 #define BATCH_ZAP_PAGES 10
5740 static void kvm_zap_obsolete_pages(struct kvm *kvm)
5741 {
5742         struct kvm_mmu_page *sp, *node;
5743         int nr_zapped, batch = 0;
5744
5745 restart:
5746         list_for_each_entry_safe_reverse(sp, node,
5747               &kvm->arch.active_mmu_pages, link) {
5748                 /*
5749                  * No obsolete valid page exists before a newly created page
5750                  * since active_mmu_pages is a FIFO list.
5751                  */
5752                 if (!is_obsolete_sp(kvm, sp))
5753                         break;
5754
5755                 /*
5756                  * Invalid pages should never land back on the list of active
5757                  * pages.  Skip the bogus page, otherwise we'll get stuck in an
5758                  * infinite loop if the page gets put back on the list (again).
5759                  */
5760                 if (WARN_ON(sp->role.invalid))
5761                         continue;
5762
5763                 /*
5764                  * No need to flush the TLB since we're only zapping shadow
5765                  * pages with an obsolete generation number and all vCPUS have
5766                  * loaded a new root, i.e. the shadow pages being zapped cannot
5767                  * be in active use by the guest.
5768                  */
5769                 if (batch >= BATCH_ZAP_PAGES &&
5770                     cond_resched_lock(&kvm->mmu_lock)) {
5771                         batch = 0;
5772                         goto restart;
5773                 }
5774
5775                 if (__kvm_mmu_prepare_zap_page(kvm, sp,
5776                                 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
5777                         batch += nr_zapped;
5778                         goto restart;
5779                 }
5780         }
5781
5782         /*
5783          * Trigger a remote TLB flush before freeing the page tables to ensure
5784          * KVM is not in the middle of a lockless shadow page table walk, which
5785          * may reference the pages.
5786          */
5787         kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5788 }
5789
5790 /*
5791  * Fast invalidate all shadow pages and use lock-break technique
5792  * to zap obsolete pages.
5793  *
5794  * It's required when memslot is being deleted or VM is being
5795  * destroyed, in these cases, we should ensure that KVM MMU does
5796  * not use any resource of the being-deleted slot or all slots
5797  * after calling the function.
5798  */
5799 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5800 {
5801         lockdep_assert_held(&kvm->slots_lock);
5802
5803         spin_lock(&kvm->mmu_lock);
5804         trace_kvm_mmu_zap_all_fast(kvm);
5805
5806         /*
5807          * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
5808          * held for the entire duration of zapping obsolete pages, it's
5809          * impossible for there to be multiple invalid generations associated
5810          * with *valid* shadow pages at any given time, i.e. there is exactly
5811          * one valid generation and (at most) one invalid generation.
5812          */
5813         kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5814
5815         /*
5816          * Notify all vcpus to reload its shadow page table and flush TLB.
5817          * Then all vcpus will switch to new shadow page table with the new
5818          * mmu_valid_gen.
5819          *
5820          * Note: we need to do this under the protection of mmu_lock,
5821          * otherwise, vcpu would purge shadow page but miss tlb flush.
5822          */
5823         kvm_reload_remote_mmus(kvm);
5824
5825         kvm_zap_obsolete_pages(kvm);
5826         spin_unlock(&kvm->mmu_lock);
5827 }
5828
5829 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5830 {
5831         return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5832 }
5833
5834 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5835                         struct kvm_memory_slot *slot,
5836                         struct kvm_page_track_notifier_node *node)
5837 {
5838         kvm_mmu_zap_all_fast(kvm);
5839 }
5840
5841 void kvm_mmu_init_vm(struct kvm *kvm)
5842 {
5843         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5844
5845         node->track_write = kvm_mmu_pte_write;
5846         node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5847         kvm_page_track_register_notifier(kvm, node);
5848 }
5849
5850 void kvm_mmu_uninit_vm(struct kvm *kvm)
5851 {
5852         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5853
5854         kvm_page_track_unregister_notifier(kvm, node);
5855 }
5856
5857 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5858 {
5859         struct kvm_memslots *slots;
5860         struct kvm_memory_slot *memslot;
5861         int i;
5862
5863         spin_lock(&kvm->mmu_lock);
5864         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5865                 slots = __kvm_memslots(kvm, i);
5866                 kvm_for_each_memslot(memslot, slots) {
5867                         gfn_t start, end;
5868
5869                         start = max(gfn_start, memslot->base_gfn);
5870                         end = min(gfn_end, memslot->base_gfn + memslot->npages);
5871                         if (start >= end)
5872                                 continue;
5873
5874                         slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5875                                                 PG_LEVEL_4K,
5876                                                 KVM_MAX_HUGEPAGE_LEVEL,
5877                                                 start, end - 1, true);
5878                 }
5879         }
5880
5881         spin_unlock(&kvm->mmu_lock);
5882 }
5883
5884 static bool slot_rmap_write_protect(struct kvm *kvm,
5885                                     struct kvm_rmap_head *rmap_head)
5886 {
5887         return __rmap_write_protect(kvm, rmap_head, false);
5888 }
5889
5890 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5891                                       struct kvm_memory_slot *memslot,
5892                                       int start_level)
5893 {
5894         bool flush;
5895
5896         spin_lock(&kvm->mmu_lock);
5897         flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
5898                                 start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
5899         spin_unlock(&kvm->mmu_lock);
5900
5901         /*
5902          * We can flush all the TLBs out of the mmu lock without TLB
5903          * corruption since we just change the spte from writable to
5904          * readonly so that we only need to care the case of changing
5905          * spte from present to present (changing the spte from present
5906          * to nonpresent will flush all the TLBs immediately), in other
5907          * words, the only case we care is mmu_spte_update() where we
5908          * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
5909          * instead of PT_WRITABLE_MASK, that means it does not depend
5910          * on PT_WRITABLE_MASK anymore.
5911          */
5912         if (flush)
5913                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
5914 }
5915
5916 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
5917                                          struct kvm_rmap_head *rmap_head)
5918 {
5919         u64 *sptep;
5920         struct rmap_iterator iter;
5921         int need_tlb_flush = 0;
5922         kvm_pfn_t pfn;
5923         struct kvm_mmu_page *sp;
5924
5925 restart:
5926         for_each_rmap_spte(rmap_head, &iter, sptep) {
5927                 sp = sptep_to_sp(sptep);
5928                 pfn = spte_to_pfn(*sptep);
5929
5930                 /*
5931                  * We cannot do huge page mapping for indirect shadow pages,
5932                  * which are found on the last rmap (level = 1) when not using
5933                  * tdp; such shadow pages are synced with the page table in
5934                  * the guest, and the guest page table is using 4K page size
5935                  * mapping if the indirect sp has level = 1.
5936                  */
5937                 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
5938                     (kvm_is_zone_device_pfn(pfn) ||
5939                      PageCompound(pfn_to_page(pfn)))) {
5940                         pte_list_remove(rmap_head, sptep);
5941
5942                         if (kvm_available_flush_tlb_with_range())
5943                                 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
5944                                         KVM_PAGES_PER_HPAGE(sp->role.level));
5945                         else
5946                                 need_tlb_flush = 1;
5947
5948                         goto restart;
5949                 }
5950         }
5951
5952         return need_tlb_flush;
5953 }
5954
5955 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
5956                                    const struct kvm_memory_slot *memslot)
5957 {
5958         /* FIXME: const-ify all uses of struct kvm_memory_slot.  */
5959         spin_lock(&kvm->mmu_lock);
5960         slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
5961                          kvm_mmu_zap_collapsible_spte, true);
5962         spin_unlock(&kvm->mmu_lock);
5963 }
5964
5965 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
5966                                         struct kvm_memory_slot *memslot)
5967 {
5968         /*
5969          * All current use cases for flushing the TLBs for a specific memslot
5970          * are related to dirty logging, and do the TLB flush out of mmu_lock.
5971          * The interaction between the various operations on memslot must be
5972          * serialized by slots_locks to ensure the TLB flush from one operation
5973          * is observed by any other operation on the same memslot.
5974          */
5975         lockdep_assert_held(&kvm->slots_lock);
5976         kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5977                                            memslot->npages);
5978 }
5979
5980 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
5981                                    struct kvm_memory_slot *memslot)
5982 {
5983         bool flush;
5984
5985         spin_lock(&kvm->mmu_lock);
5986         flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
5987         spin_unlock(&kvm->mmu_lock);
5988
5989         /*
5990          * It's also safe to flush TLBs out of mmu lock here as currently this
5991          * function is only used for dirty logging, in which case flushing TLB
5992          * out of mmu lock also guarantees no dirty pages will be lost in
5993          * dirty_bitmap.
5994          */
5995         if (flush)
5996                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
5997 }
5998 EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
5999
6000 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
6001                                         struct kvm_memory_slot *memslot)
6002 {
6003         bool flush;
6004
6005         spin_lock(&kvm->mmu_lock);
6006         flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
6007                                         false);
6008         spin_unlock(&kvm->mmu_lock);
6009
6010         if (flush)
6011                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
6012 }
6013 EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
6014
6015 void kvm_mmu_slot_set_dirty(struct kvm *kvm,
6016                             struct kvm_memory_slot *memslot)
6017 {
6018         bool flush;
6019
6020         spin_lock(&kvm->mmu_lock);
6021         flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
6022         spin_unlock(&kvm->mmu_lock);
6023
6024         if (flush)
6025                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
6026 }
6027 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
6028
6029 void kvm_mmu_zap_all(struct kvm *kvm)
6030 {
6031         struct kvm_mmu_page *sp, *node;
6032         LIST_HEAD(invalid_list);
6033         int ign;
6034
6035         spin_lock(&kvm->mmu_lock);
6036 restart:
6037         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6038                 if (WARN_ON(sp->role.invalid))
6039                         continue;
6040                 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6041                         goto restart;
6042                 if (cond_resched_lock(&kvm->mmu_lock))
6043                         goto restart;
6044         }
6045
6046         kvm_mmu_commit_zap_page(kvm, &invalid_list);
6047         spin_unlock(&kvm->mmu_lock);
6048 }
6049
6050 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6051 {
6052         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6053
6054         gen &= MMIO_SPTE_GEN_MASK;
6055
6056         /*
6057          * Generation numbers are incremented in multiples of the number of
6058          * address spaces in order to provide unique generations across all
6059          * address spaces.  Strip what is effectively the address space
6060          * modifier prior to checking for a wrap of the MMIO generation so
6061          * that a wrap in any address space is detected.
6062          */
6063         gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6064
6065         /*
6066          * The very rare case: if the MMIO generation number has wrapped,
6067          * zap all shadow pages.
6068          */
6069         if (unlikely(gen == 0)) {
6070                 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
6071                 kvm_mmu_zap_all_fast(kvm);
6072         }
6073 }
6074
6075 static unsigned long
6076 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
6077 {
6078         struct kvm *kvm;
6079         int nr_to_scan = sc->nr_to_scan;
6080         unsigned long freed = 0;
6081
6082         mutex_lock(&kvm_lock);
6083
6084         list_for_each_entry(kvm, &vm_list, vm_list) {
6085                 int idx;
6086                 LIST_HEAD(invalid_list);
6087
6088                 /*
6089                  * Never scan more than sc->nr_to_scan VM instances.
6090                  * Will not hit this condition practically since we do not try
6091                  * to shrink more than one VM and it is very unlikely to see
6092                  * !n_used_mmu_pages so many times.
6093                  */
6094                 if (!nr_to_scan--)
6095                         break;
6096                 /*
6097                  * n_used_mmu_pages is accessed without holding kvm->mmu_lock
6098                  * here. We may skip a VM instance errorneosly, but we do not
6099                  * want to shrink a VM that only started to populate its MMU
6100                  * anyway.
6101                  */
6102                 if (!kvm->arch.n_used_mmu_pages &&
6103                     !kvm_has_zapped_obsolete_pages(kvm))
6104                         continue;
6105
6106                 idx = srcu_read_lock(&kvm->srcu);
6107                 spin_lock(&kvm->mmu_lock);
6108
6109                 if (kvm_has_zapped_obsolete_pages(kvm)) {
6110                         kvm_mmu_commit_zap_page(kvm,
6111                               &kvm->arch.zapped_obsolete_pages);
6112                         goto unlock;
6113                 }
6114
6115                 freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
6116
6117 unlock:
6118                 spin_unlock(&kvm->mmu_lock);
6119                 srcu_read_unlock(&kvm->srcu, idx);
6120
6121                 /*
6122                  * unfair on small ones
6123                  * per-vm shrinkers cry out
6124                  * sadness comes quickly
6125                  */
6126                 list_move_tail(&kvm->vm_list, &vm_list);
6127                 break;
6128         }
6129
6130         mutex_unlock(&kvm_lock);
6131         return freed;
6132 }
6133
6134 static unsigned long
6135 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6136 {
6137         return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6138 }
6139
6140 static struct shrinker mmu_shrinker = {
6141         .count_objects = mmu_shrink_count,
6142         .scan_objects = mmu_shrink_scan,
6143         .seeks = DEFAULT_SEEKS * 10,
6144 };
6145
6146 static void mmu_destroy_caches(void)
6147 {
6148         kmem_cache_destroy(pte_list_desc_cache);
6149         kmem_cache_destroy(mmu_page_header_cache);
6150 }
6151
6152 static void kvm_set_mmio_spte_mask(void)
6153 {
6154         u64 mask;
6155
6156         /*
6157          * Set a reserved PA bit in MMIO SPTEs to generate page faults with
6158          * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
6159          * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
6160          * 52-bit physical addresses then there are no reserved PA bits in the
6161          * PTEs and so the reserved PA approach must be disabled.
6162          */
6163         if (shadow_phys_bits < 52)
6164                 mask = BIT_ULL(51) | PT_PRESENT_MASK;
6165         else
6166                 mask = 0;
6167
6168         kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
6169 }
6170
6171 static bool get_nx_auto_mode(void)
6172 {
6173         /* Return true when CPU has the bug, and mitigations are ON */
6174         return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6175 }
6176
6177 static void __set_nx_huge_pages(bool val)
6178 {
6179         nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6180 }
6181
6182 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6183 {
6184         bool old_val = nx_huge_pages;
6185         bool new_val;
6186
6187         /* In "auto" mode deploy workaround only if CPU has the bug. */
6188         if (sysfs_streq(val, "off"))
6189                 new_val = 0;
6190         else if (sysfs_streq(val, "force"))
6191                 new_val = 1;
6192         else if (sysfs_streq(val, "auto"))
6193                 new_val = get_nx_auto_mode();
6194         else if (strtobool(val, &new_val) < 0)
6195                 return -EINVAL;
6196
6197         __set_nx_huge_pages(new_val);
6198
6199         if (new_val != old_val) {
6200                 struct kvm *kvm;
6201
6202                 mutex_lock(&kvm_lock);
6203
6204                 list_for_each_entry(kvm, &vm_list, vm_list) {
6205                         mutex_lock(&kvm->slots_lock);
6206                         kvm_mmu_zap_all_fast(kvm);
6207                         mutex_unlock(&kvm->slots_lock);
6208
6209                         wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6210                 }
6211                 mutex_unlock(&kvm_lock);
6212         }
6213
6214         return 0;
6215 }
6216
6217 int kvm_mmu_module_init(void)
6218 {
6219         int ret = -ENOMEM;
6220
6221         if (nx_huge_pages == -1)
6222                 __set_nx_huge_pages(get_nx_auto_mode());
6223
6224         /*
6225          * MMU roles use union aliasing which is, generally speaking, an
6226          * undefined behavior. However, we supposedly know how compilers behave
6227          * and the current status quo is unlikely to change. Guardians below are
6228          * supposed to let us know if the assumption becomes false.
6229          */
6230         BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6231         BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6232         BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
6233
6234         kvm_mmu_reset_all_pte_masks();
6235
6236         kvm_set_mmio_spte_mask();
6237
6238         pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6239                                             sizeof(struct pte_list_desc),
6240                                             0, SLAB_ACCOUNT, NULL);
6241         if (!pte_list_desc_cache)
6242                 goto out;
6243
6244         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6245                                                   sizeof(struct kvm_mmu_page),
6246                                                   0, SLAB_ACCOUNT, NULL);
6247         if (!mmu_page_header_cache)
6248                 goto out;
6249
6250         if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6251                 goto out;
6252
6253         ret = register_shrinker(&mmu_shrinker);
6254         if (ret)
6255                 goto out;
6256
6257         return 0;
6258
6259 out:
6260         mmu_destroy_caches();
6261         return ret;
6262 }
6263
6264 /*
6265  * Calculate mmu pages needed for kvm.
6266  */
6267 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
6268 {
6269         unsigned long nr_mmu_pages;
6270         unsigned long nr_pages = 0;
6271         struct kvm_memslots *slots;
6272         struct kvm_memory_slot *memslot;
6273         int i;
6274
6275         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6276                 slots = __kvm_memslots(kvm, i);
6277
6278                 kvm_for_each_memslot(memslot, slots)
6279                         nr_pages += memslot->npages;
6280         }
6281
6282         nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
6283         nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
6284
6285         return nr_mmu_pages;
6286 }
6287
6288 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6289 {
6290         kvm_mmu_unload(vcpu);
6291         free_mmu_pages(&vcpu->arch.root_mmu);
6292         free_mmu_pages(&vcpu->arch.guest_mmu);
6293         mmu_free_memory_caches(vcpu);
6294 }
6295
6296 void kvm_mmu_module_exit(void)
6297 {
6298         mmu_destroy_caches();
6299         percpu_counter_destroy(&kvm_total_used_mmu_pages);
6300         unregister_shrinker(&mmu_shrinker);
6301         mmu_audit_disable();
6302 }
6303
6304 static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
6305 {
6306         unsigned int old_val;
6307         int err;
6308
6309         old_val = nx_huge_pages_recovery_ratio;
6310         err = param_set_uint(val, kp);
6311         if (err)
6312                 return err;
6313
6314         if (READ_ONCE(nx_huge_pages) &&
6315             !old_val && nx_huge_pages_recovery_ratio) {
6316                 struct kvm *kvm;
6317
6318                 mutex_lock(&kvm_lock);
6319
6320                 list_for_each_entry(kvm, &vm_list, vm_list)
6321                         wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6322
6323                 mutex_unlock(&kvm_lock);
6324         }
6325
6326         return err;
6327 }
6328
6329 static void kvm_recover_nx_lpages(struct kvm *kvm)
6330 {
6331         int rcu_idx;
6332         struct kvm_mmu_page *sp;
6333         unsigned int ratio;
6334         LIST_HEAD(invalid_list);
6335         ulong to_zap;
6336
6337         rcu_idx = srcu_read_lock(&kvm->srcu);
6338         spin_lock(&kvm->mmu_lock);
6339
6340         ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6341         to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
6342         while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
6343                 /*
6344                  * We use a separate list instead of just using active_mmu_pages
6345                  * because the number of lpage_disallowed pages is expected to
6346                  * be relatively small compared to the total.
6347                  */
6348                 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6349                                       struct kvm_mmu_page,
6350                                       lpage_disallowed_link);
6351                 WARN_ON_ONCE(!sp->lpage_disallowed);
6352                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6353                 WARN_ON_ONCE(sp->lpage_disallowed);
6354
6355                 if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
6356                         kvm_mmu_commit_zap_page(kvm, &invalid_list);
6357                         if (to_zap)
6358                                 cond_resched_lock(&kvm->mmu_lock);
6359                 }
6360         }
6361
6362         spin_unlock(&kvm->mmu_lock);
6363         srcu_read_unlock(&kvm->srcu, rcu_idx);
6364 }
6365
6366 static long get_nx_lpage_recovery_timeout(u64 start_time)
6367 {
6368         return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
6369                 ? start_time + 60 * HZ - get_jiffies_64()
6370                 : MAX_SCHEDULE_TIMEOUT;
6371 }
6372
6373 static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6374 {
6375         u64 start_time;
6376         long remaining_time;
6377
6378         while (true) {
6379                 start_time = get_jiffies_64();
6380                 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6381
6382                 set_current_state(TASK_INTERRUPTIBLE);
6383                 while (!kthread_should_stop() && remaining_time > 0) {
6384                         schedule_timeout(remaining_time);
6385                         remaining_time = get_nx_lpage_recovery_timeout(start_time);
6386                         set_current_state(TASK_INTERRUPTIBLE);
6387                 }
6388
6389                 set_current_state(TASK_RUNNING);
6390
6391                 if (kthread_should_stop())
6392                         return 0;
6393
6394                 kvm_recover_nx_lpages(kvm);
6395         }
6396 }
6397
6398 int kvm_mmu_post_init_vm(struct kvm *kvm)
6399 {
6400         int err;
6401
6402         err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6403                                           "kvm-nx-lpage-recovery",
6404                                           &kvm->arch.nx_lpage_recovery_thread);
6405         if (!err)
6406                 kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6407
6408         return err;
6409 }
6410
6411 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6412 {
6413         if (kvm->arch.nx_lpage_recovery_thread)
6414                 kthread_stop(kvm->arch.nx_lpage_recovery_thread);
6415 }