1 // SPDX-License-Identifier: GPL-2.0-only
3 * Kernel-based Virtual Machine driver for Linux
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
10 * Copyright (C) 2006 Qumranet, Inc.
11 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
14 * Yaniv Kamay <yaniv@qumranet.com>
15 * Avi Kivity <avi@qumranet.com>
21 #include "mmu_internal.h"
24 #include "kvm_cache_regs.h"
25 #include "kvm_emulate.h"
29 #include <linux/kvm_host.h>
30 #include <linux/types.h>
31 #include <linux/string.h>
33 #include <linux/highmem.h>
34 #include <linux/moduleparam.h>
35 #include <linux/export.h>
36 #include <linux/swap.h>
37 #include <linux/hugetlb.h>
38 #include <linux/compiler.h>
39 #include <linux/srcu.h>
40 #include <linux/slab.h>
41 #include <linux/sched/signal.h>
42 #include <linux/uaccess.h>
43 #include <linux/hash.h>
44 #include <linux/kern_levels.h>
45 #include <linux/kthread.h>
48 #include <asm/memtype.h>
49 #include <asm/cmpxchg.h>
51 #include <asm/set_memory.h>
53 #include <asm/kvm_page_track.h>
58 extern bool itlb_multihit_kvm_mitigation;
60 int __read_mostly nx_huge_pages = -1;
61 static uint __read_mostly nx_huge_pages_recovery_period_ms;
62 #ifdef CONFIG_PREEMPT_RT
63 /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
64 static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
66 static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
69 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
70 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
72 static const struct kernel_param_ops nx_huge_pages_ops = {
73 .set = set_nx_huge_pages,
74 .get = param_get_bool,
77 static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
78 .set = set_nx_huge_pages_recovery_param,
79 .get = param_get_uint,
82 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
83 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
84 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
85 &nx_huge_pages_recovery_ratio, 0644);
86 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
87 module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
88 &nx_huge_pages_recovery_period_ms, 0644);
89 __MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
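/*
 * Illustrative note (not part of the original file): with the callbacks
 * registered above, the mitigation knobs are normally exercised via the
 * kernel command line or sysfs, e.g. (paths and values assumed from the
 * standard module-parameter layout):
 *
 *	kvm.nx_huge_pages=force kvm.nx_huge_pages_recovery_ratio=60
 *	echo 120000 > /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms
 *
 * set_nx_huge_pages() and set_nx_huge_pages_recovery_param() validate and
 * apply the new values at runtime.
 */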
91 static bool __read_mostly force_flush_and_sync_on_reuse;
92 module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
95 * When this variable is set to true it enables Two-Dimensional Paging,
96 * where the hardware walks 2 sets of page tables:
97 * 1. the guest-virtual to guest-physical translation
98 * 2. while doing 1. it also walks the guest-physical to host-physical translation
99 * If the hardware supports that, we don't need to do shadow paging.
101 bool tdp_enabled = false;
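/*
 * Note (illustrative, not in the original file): tdp_enabled is expected to
 * be flipped by the vendor module via kvm_configure_mmu(), e.g. VMX passes
 * its EPT setting and SVM its NPT setting; when it stays false, KVM falls
 * back to shadow paging and walks the guest page tables in software.
 */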
103 static int max_huge_page_level __read_mostly;
104 static int tdp_root_level __read_mostly;
105 static int max_tdp_level __read_mostly;
109 module_param(dbg, bool, 0644);
112 #define PTE_PREFETCH_NUM 8
114 #define PT32_LEVEL_BITS 10
116 #define PT32_LEVEL_SHIFT(level) \
117 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
119 #define PT32_LVL_OFFSET_MASK(level) \
120 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
121 * PT32_LEVEL_BITS))) - 1))
123 #define PT32_INDEX(address, level)\
124 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
127 #define PT32_BASE_ADDR_MASK PAGE_MASK
128 #define PT32_DIR_BASE_ADDR_MASK \
129 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
130 #define PT32_LVL_ADDR_MASK(level) \
131 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
132 * PT32_LEVEL_BITS))) - 1))
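/*
 * Worked example (illustrative only): with PAGE_SHIFT == 12 and
 * PT32_LEVEL_BITS == 10,
 *
 *	PT32_LEVEL_SHIFT(1) == 12  ->  PT32_INDEX(addr, 1) is bits 21:12 of addr
 *	PT32_LEVEL_SHIFT(2) == 22  ->  PT32_INDEX(addr, 2) is bits 31:22 of addr
 *
 * i.e. the classic two-level 32-bit layout with 1024 entries per table.
 */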
134 #include <trace/events/kvm.h>
136 /* make pte_list_desc fit well in cache lines */
137 #define PTE_LIST_EXT 14
140 * Slight optimization of cacheline layout, by putting `more' and `spte_count'
141 * at the start; then accessing the desc only touches a single cacheline for
142 * either the full (entries == PTE_LIST_EXT) case or the entries <= 6 case.
144 struct pte_list_desc {
145 struct pte_list_desc *more;
147 * Stores the number of entries in this pte_list_desc. It doesn't need to
148 * be u64, but u64 gives easier alignment. When it equals PTE_LIST_EXT, the desc is full.
151 u64 *sptes[PTE_LIST_EXT];
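/*
 * Back-of-the-envelope check (illustrative; assumes 64-byte cache lines and
 * an 8-byte spte_count, as described above):
 *
 *	sizeof(struct pte_list_desc) ~= 8 (more) + 8 (spte_count) + 14 * 8 (sptes)
 *	                              = 128 bytes, i.e. exactly two cache lines,
 *
 * and a desc holding <= 6 sptes touches only the first cache line.
 */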
154 struct kvm_shadow_walk_iterator {
162 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
163 for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
165 shadow_walk_okay(&(_walker)); \
166 shadow_walk_next(&(_walker)))
168 #define for_each_shadow_entry(_vcpu, _addr, _walker) \
169 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
170 shadow_walk_okay(&(_walker)); \
171 shadow_walk_next(&(_walker)))
173 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
174 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
175 shadow_walk_okay(&(_walker)) && \
176 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
177 __shadow_walk_next(&(_walker), spte))
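/*
 * Usage sketch (illustrative only): walking the shadow page tables for a
 * guest address without taking mmu_lock, the way the page-fault fast path
 * does it:
 *
 *	struct kvm_shadow_walk_iterator it;
 *	u64 spte;
 *
 *	walk_shadow_page_lockless_begin(vcpu);
 *	for_each_shadow_entry_lockless(vcpu, addr, it, spte) {
 *		// inspect spte and it.level here
 *	}
 *	walk_shadow_page_lockless_end(vcpu);
 */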
179 static struct kmem_cache *pte_list_desc_cache;
180 struct kmem_cache *mmu_page_header_cache;
181 static struct percpu_counter kvm_total_used_mmu_pages;
183 static void mmu_spte_set(u64 *sptep, u64 spte);
185 struct kvm_mmu_role_regs {
186 const unsigned long cr0;
187 const unsigned long cr4;
191 #define CREATE_TRACE_POINTS
192 #include "mmutrace.h"
195 * Yes, lots of underscores. They're a hint that you probably shouldn't be
196 * reading from the role_regs. Once the mmu_role is constructed, it becomes
197 * the single source of truth for the MMU's state.
199 #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
200 static inline bool __maybe_unused \
201 ____is_##reg##_##name(const struct kvm_mmu_role_regs *regs) \
203 return !!(regs->reg & flag); \
205 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
206 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
207 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
208 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
209 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
210 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
211 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
212 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
213 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
214 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
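/*
 * Purely illustrative helper (not in the original file): shows how the
 * generated ____is_<reg>_<name>() predicates are combined, e.g. "is this
 * vCPU running with 64-bit paging?".
 */
static inline bool __maybe_unused ____is_long_mode_paging(const struct kvm_mmu_role_regs *regs)
{
	return ____is_cr0_pg(regs) && ____is_efer_lma(regs);
}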
217 * The MMU itself (with a valid role) is the single source of truth for the
218 * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The
219 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
220 * and the vCPU may be incorrect/irrelevant.
222 #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
223 static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
225 return !!(mmu->cpu_role. base_or_ext . reg##_##name); \
227 BUILD_MMU_ROLE_ACCESSOR(ext, cr0, pg);
228 BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
229 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
230 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pae);
231 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
232 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
233 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
234 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
235 BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
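/*
 * Example (illustrative): once the role has been constructed, callers query
 * the MMU itself, e.g. is_cr0_wp(mmu) or is_cr4_smep(mmu), rather than
 * re-reading CR0/CR4/EFER from the vCPU.
 */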
237 static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
239 struct kvm_mmu_role_regs regs = {
240 .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
241 .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
242 .efer = vcpu->arch.efer,
248 static inline bool kvm_available_flush_tlb_with_range(void)
250 return kvm_x86_ops.tlb_remote_flush_with_range;
253 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
254 struct kvm_tlb_range *range)
258 if (range && kvm_x86_ops.tlb_remote_flush_with_range)
259 ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
262 kvm_flush_remote_tlbs(kvm);
265 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
266 u64 start_gfn, u64 pages)
268 struct kvm_tlb_range range;
270 range.start_gfn = start_gfn;
273 kvm_flush_remote_tlbs_with_range(kvm, &range);
276 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
279 u64 spte = make_mmio_spte(vcpu, gfn, access);
281 trace_mark_mmio_spte(sptep, gfn, spte);
282 mmu_spte_set(sptep, spte);
285 static gfn_t get_mmio_spte_gfn(u64 spte)
287 u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
289 gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
290 & shadow_nonpresent_or_rsvd_mask;
292 return gpa >> PAGE_SHIFT;
295 static unsigned get_mmio_spte_access(u64 spte)
297 return spte & shadow_mmio_access_mask;
300 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
302 u64 kvm_gen, spte_gen, gen;
304 gen = kvm_vcpu_memslots(vcpu)->generation;
305 if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
308 kvm_gen = gen & MMIO_SPTE_GEN_MASK;
309 spte_gen = get_mmio_spte_generation(spte);
311 trace_check_mmio_spte(spte, kvm_gen, spte_gen);
312 return likely(kvm_gen == spte_gen);
315 static int is_cpuid_PSE36(void)
320 static gfn_t pse36_gfn_delta(u32 gpte)
322 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
324 return (gpte & PT32_DIR_PSE36_MASK) << shift;
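/*
 * Worked example (illustrative; assumes PT32_DIR_PSE36_SHIFT == 13 and
 * PAGE_SHIFT == 12): shift == 32 - 13 - 12 == 7, so PDE bits 16:13, which
 * hold physical-address bits 35:32 under PSE-36, land on gfn bits 23:20,
 * exactly where bits 35:32 of the frame number belong.
 */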
328 static void __set_spte(u64 *sptep, u64 spte)
330 WRITE_ONCE(*sptep, spte);
333 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
335 WRITE_ONCE(*sptep, spte);
338 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
340 return xchg(sptep, spte);
343 static u64 __get_spte_lockless(u64 *sptep)
345 return READ_ONCE(*sptep);
356 static void count_spte_clear(u64 *sptep, u64 spte)
358 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
360 if (is_shadow_present_pte(spte))
363 /* Ensure the spte is completely set before we increase the count */
365 sp->clear_spte_count++;
368 static void __set_spte(u64 *sptep, u64 spte)
370 union split_spte *ssptep, sspte;
372 ssptep = (union split_spte *)sptep;
373 sspte = (union split_spte)spte;
375 ssptep->spte_high = sspte.spte_high;
378 * If we map the spte from nonpresent to present, we should store
379 * the high bits first, then set the present bit, so the CPU cannot
380 * fetch this spte while we are setting it.
384 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
387 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
389 union split_spte *ssptep, sspte;
391 ssptep = (union split_spte *)sptep;
392 sspte = (union split_spte)spte;
394 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
397 * If we map the spte from present to nonpresent, we should clear
398 * the present bit first to avoid the vCPU fetching the old high bits.
402 ssptep->spte_high = sspte.spte_high;
403 count_spte_clear(sptep, spte);
406 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
408 union split_spte *ssptep, sspte, orig;
410 ssptep = (union split_spte *)sptep;
411 sspte = (union split_spte)spte;
413 /* xchg acts as a barrier before the setting of the high bits */
414 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
415 orig.spte_high = ssptep->spte_high;
416 ssptep->spte_high = sspte.spte_high;
417 count_spte_clear(sptep, spte);
423 * The idea of using this lightweight way to read the spte on x86_32 is
424 * borrowed from gup_get_pte (mm/gup.c).
426 * An spte tlb flush may be pending, because kvm_set_pte_rmapp
427 * coalesces them and we are running outside of the MMU lock. Therefore
428 * we need to protect against in-progress updates of the spte.
430 * Reading the spte while an update is in progress may get the old value
431 * for the high part of the spte. The race is fine for a present->non-present
432 * change (because the high part of the spte is ignored for non-present spte),
433 * but for a present->present change we must reread the spte.
435 * All such changes are done in two steps (present->non-present and
436 * non-present->present), hence it is enough to count the number of
437 * present->non-present updates: if it changed while reading the spte,
438 * we might have hit the race. This is done using clear_spte_count.
440 static u64 __get_spte_lockless(u64 *sptep)
442 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
443 union split_spte spte, *orig = (union split_spte *)sptep;
447 count = sp->clear_spte_count;
450 spte.spte_low = orig->spte_low;
453 spte.spte_high = orig->spte_high;
456 if (unlikely(spte.spte_low != orig->spte_low ||
457 count != sp->clear_spte_count))
464 static bool spte_has_volatile_bits(u64 spte)
466 if (!is_shadow_present_pte(spte))
470 * Always atomically update the spte if it can be updated
471 * out of mmu-lock; this ensures the dirty bit is not lost
472 * and also gives us a stable is_writable_pte() so that a
473 * needed tlb flush is not missed.
475 if (spte_can_locklessly_be_made_writable(spte) ||
476 is_access_track_spte(spte))
479 if (spte_ad_enabled(spte)) {
480 if ((spte & shadow_accessed_mask) == 0 ||
481 (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
488 /* Rules for using mmu_spte_set:
489 * Set the sptep from nonpresent to present.
490 * Note: the sptep being assigned *must* be either not present
491 * or in a state where the hardware will not attempt to update
494 static void mmu_spte_set(u64 *sptep, u64 new_spte)
496 WARN_ON(is_shadow_present_pte(*sptep));
497 __set_spte(sptep, new_spte);
501 * Update the SPTE (excluding the PFN), but do not track changes in its
502 * accessed/dirty status.
504 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
506 u64 old_spte = *sptep;
508 WARN_ON(!is_shadow_present_pte(new_spte));
509 check_spte_writable_invariants(new_spte);
511 if (!is_shadow_present_pte(old_spte)) {
512 mmu_spte_set(sptep, new_spte);
516 if (!spte_has_volatile_bits(old_spte))
517 __update_clear_spte_fast(sptep, new_spte);
519 old_spte = __update_clear_spte_slow(sptep, new_spte);
521 WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
526 /* Rules for using mmu_spte_update:
527 * Update the state bits; the mapped pfn is not changed.
529 * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
530 * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
531 * spte, even though the writable spte might be cached on a CPU's TLB.
533 * Returns true if the TLB needs to be flushed
535 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
538 u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
540 if (!is_shadow_present_pte(old_spte))
544 * Updating the spte out of mmu-lock is safe, since
545 * we always update it atomically; see the comments in
546 * spte_has_volatile_bits().
548 if (spte_can_locklessly_be_made_writable(old_spte) &&
549 !is_writable_pte(new_spte))
553 * Flush TLB when accessed/dirty states are changed in the page tables,
554 * to guarantee consistency between TLB and page tables.
557 if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
559 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
562 if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
564 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
571 * Rules for using mmu_spte_clear_track_bits:
572 * It sets the sptep from present to nonpresent and tracks the
573 * state bits; it is used to clear a last-level sptep.
574 * Returns the old PTE.
576 static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
579 u64 old_spte = *sptep;
580 int level = sptep_to_sp(sptep)->role.level;
582 if (!spte_has_volatile_bits(old_spte))
583 __update_clear_spte_fast(sptep, 0ull);
585 old_spte = __update_clear_spte_slow(sptep, 0ull);
587 if (!is_shadow_present_pte(old_spte))
590 kvm_update_page_stats(kvm, level, -1);
592 pfn = spte_to_pfn(old_spte);
595 * KVM does not hold a refcount on the pages used by
596 * the kvm mmu; before such a page is reclaimed, it must
597 * be unmapped from the mmu first.
599 WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
601 if (is_accessed_spte(old_spte))
602 kvm_set_pfn_accessed(pfn);
604 if (is_dirty_spte(old_spte))
605 kvm_set_pfn_dirty(pfn);
611 * Rules for using mmu_spte_clear_no_track:
612 * Directly clear the spte without caring about its state bits;
613 * it is used to clear an upper-level spte.
615 static void mmu_spte_clear_no_track(u64 *sptep)
617 __update_clear_spte_fast(sptep, 0ull);
620 static u64 mmu_spte_get_lockless(u64 *sptep)
622 return __get_spte_lockless(sptep);
625 /* Returns the Accessed status of the PTE and resets it at the same time. */
626 static bool mmu_spte_age(u64 *sptep)
628 u64 spte = mmu_spte_get_lockless(sptep);
630 if (!is_accessed_spte(spte))
633 if (spte_ad_enabled(spte)) {
634 clear_bit((ffs(shadow_accessed_mask) - 1),
635 (unsigned long *)sptep);
638 * Capture the dirty status of the page, so that it doesn't get
639 * lost when the SPTE is marked for access tracking.
641 if (is_writable_pte(spte))
642 kvm_set_pfn_dirty(spte_to_pfn(spte));
644 spte = mark_spte_for_access_track(spte);
645 mmu_spte_update_no_track(sptep, spte);
651 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
653 if (is_tdp_mmu(vcpu->arch.mmu)) {
654 kvm_tdp_mmu_walk_lockless_begin();
657 * Prevent page table teardown by making any free-er wait during
658 * kvm_flush_remote_tlbs() IPI to all active vcpus.
663 * Make sure a following spte read is not reordered ahead of the write
666 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
670 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
672 if (is_tdp_mmu(vcpu->arch.mmu)) {
673 kvm_tdp_mmu_walk_lockless_end();
676 * Make sure the write to vcpu->mode is not reordered in front of
677 * reads to sptes. If it is, kvm_mmu_commit_zap_page() can see us
678 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
680 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
685 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
689 /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
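	/*
	 * Illustrative arithmetic (assuming PT64_ROOT_MAX_LEVEL == 5 and
	 * PTE_PREFETCH_NUM == 8): this tops the cache up to 1 + 5 + 8 == 14
	 * pte_list_desc objects, the worst case for a single page fault.
	 */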
690 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
691 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
694 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
695 PT64_ROOT_MAX_LEVEL);
698 if (maybe_indirect) {
699 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
700 PT64_ROOT_MAX_LEVEL);
704 return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
705 PT64_ROOT_MAX_LEVEL);
708 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
710 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
711 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
712 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
713 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
716 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
718 return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
721 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
723 kmem_cache_free(pte_list_desc_cache, pte_list_desc);
726 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
728 if (!sp->role.direct)
729 return sp->gfns[index];
731 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
734 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
736 if (!sp->role.direct) {
737 sp->gfns[index] = gfn;
741 if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
742 pr_err_ratelimited("gfn mismatch under direct page %llx "
743 "(expected %llx, got %llx)\n",
745 kvm_mmu_page_get_gfn(sp, index), gfn);
749 * Return the pointer to the large page information for a given gfn,
750 * handling slots that are not large page aligned.
752 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
753 const struct kvm_memory_slot *slot, int level)
757 idx = gfn_to_index(gfn, slot->base_gfn, level);
758 return &slot->arch.lpage_info[level - 2][idx];
761 static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
762 gfn_t gfn, int count)
764 struct kvm_lpage_info *linfo;
767 for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
768 linfo = lpage_info_slot(gfn, slot, i);
769 linfo->disallow_lpage += count;
770 WARN_ON(linfo->disallow_lpage < 0);
774 void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
776 update_gfn_disallow_lpage_count(slot, gfn, 1);
779 void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
781 update_gfn_disallow_lpage_count(slot, gfn, -1);
784 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
786 struct kvm_memslots *slots;
787 struct kvm_memory_slot *slot;
790 kvm->arch.indirect_shadow_pages++;
792 slots = kvm_memslots_for_spte_role(kvm, sp->role);
793 slot = __gfn_to_memslot(slots, gfn);
795 /* non-leaf shadow pages are kept read-only. */
796 if (sp->role.level > PG_LEVEL_4K)
797 return kvm_slot_page_track_add_page(kvm, slot, gfn,
798 KVM_PAGE_TRACK_WRITE);
800 kvm_mmu_gfn_disallow_lpage(slot, gfn);
803 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
805 if (sp->lpage_disallowed)
808 ++kvm->stat.nx_lpage_splits;
809 list_add_tail(&sp->lpage_disallowed_link,
810 &kvm->arch.lpage_disallowed_mmu_pages);
811 sp->lpage_disallowed = true;
814 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
816 struct kvm_memslots *slots;
817 struct kvm_memory_slot *slot;
820 kvm->arch.indirect_shadow_pages--;
822 slots = kvm_memslots_for_spte_role(kvm, sp->role);
823 slot = __gfn_to_memslot(slots, gfn);
824 if (sp->role.level > PG_LEVEL_4K)
825 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
826 KVM_PAGE_TRACK_WRITE);
828 kvm_mmu_gfn_allow_lpage(slot, gfn);
831 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
833 --kvm->stat.nx_lpage_splits;
834 sp->lpage_disallowed = false;
835 list_del(&sp->lpage_disallowed_link);
838 static struct kvm_memory_slot *
839 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
842 struct kvm_memory_slot *slot;
844 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
845 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
847 if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
854 * About rmap_head encoding:
856 * If the bit zero of rmap_head->val is clear, then it points to the only spte
857 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
858 * pte_list_desc containing more mappings.
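/*
 * Purely illustrative (not in the original file): a decoder for the
 * encoding described above.
 */
static inline bool __maybe_unused rmap_is_single_spte(struct kvm_rmap_head *rmap_head)
{
	/* bit 0 clear: val is the lone sptep; bit 0 set: val points to a desc. */
	return rmap_head->val && !(rmap_head->val & 1);
}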
862 * Returns the number of pointers in the rmap chain, not counting the new one.
864 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
865 struct kvm_rmap_head *rmap_head)
867 struct pte_list_desc *desc;
870 if (!rmap_head->val) {
871 rmap_printk("%p %llx 0->1\n", spte, *spte);
872 rmap_head->val = (unsigned long)spte;
873 } else if (!(rmap_head->val & 1)) {
874 rmap_printk("%p %llx 1->many\n", spte, *spte);
875 desc = mmu_alloc_pte_list_desc(vcpu);
876 desc->sptes[0] = (u64 *)rmap_head->val;
877 desc->sptes[1] = spte;
878 desc->spte_count = 2;
879 rmap_head->val = (unsigned long)desc | 1;
882 rmap_printk("%p %llx many->many\n", spte, *spte);
883 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
884 while (desc->spte_count == PTE_LIST_EXT) {
885 count += PTE_LIST_EXT;
887 desc->more = mmu_alloc_pte_list_desc(vcpu);
889 desc->spte_count = 0;
894 count += desc->spte_count;
895 desc->sptes[desc->spte_count++] = spte;
901 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
902 struct pte_list_desc *desc, int i,
903 struct pte_list_desc *prev_desc)
905 int j = desc->spte_count - 1;
907 desc->sptes[i] = desc->sptes[j];
908 desc->sptes[j] = NULL;
910 if (desc->spte_count)
912 if (!prev_desc && !desc->more)
916 prev_desc->more = desc->more;
918 rmap_head->val = (unsigned long)desc->more | 1;
919 mmu_free_pte_list_desc(desc);
922 static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
924 struct pte_list_desc *desc;
925 struct pte_list_desc *prev_desc;
928 if (!rmap_head->val) {
929 pr_err("%s: %p 0->BUG\n", __func__, spte);
931 } else if (!(rmap_head->val & 1)) {
932 rmap_printk("%p 1->0\n", spte);
933 if ((u64 *)rmap_head->val != spte) {
934 pr_err("%s: %p 1->BUG\n", __func__, spte);
939 rmap_printk("%p many->many\n", spte);
940 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
943 for (i = 0; i < desc->spte_count; ++i) {
944 if (desc->sptes[i] == spte) {
945 pte_list_desc_remove_entry(rmap_head,
953 pr_err("%s: %p many->many\n", __func__, spte);
958 static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
961 mmu_spte_clear_track_bits(kvm, sptep);
962 __pte_list_remove(sptep, rmap_head);
965 /* Return true if rmap existed, false otherwise */
966 static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
968 struct pte_list_desc *desc, *next;
974 if (!(rmap_head->val & 1)) {
975 mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
979 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
981 for (; desc; desc = next) {
982 for (i = 0; i < desc->spte_count; i++)
983 mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
985 mmu_free_pte_list_desc(desc);
988 /* rmap_head is meaningless now, remember to reset it */
993 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
995 struct pte_list_desc *desc;
996 unsigned int count = 0;
1000 else if (!(rmap_head->val & 1))
1003 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1006 count += desc->spte_count;
1013 static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
1014 const struct kvm_memory_slot *slot)
1018 idx = gfn_to_index(gfn, slot->base_gfn, level);
1019 return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
1022 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1024 struct kvm_mmu_memory_cache *mc;
1026 mc = &vcpu->arch.mmu_pte_list_desc_cache;
1027 return kvm_mmu_memory_cache_nr_free_objects(mc);
1030 static void rmap_remove(struct kvm *kvm, u64 *spte)
1032 struct kvm_memslots *slots;
1033 struct kvm_memory_slot *slot;
1034 struct kvm_mmu_page *sp;
1036 struct kvm_rmap_head *rmap_head;
1038 sp = sptep_to_sp(spte);
1039 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1042 * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
1043 * so we have to determine which memslots to use based on context
1044 * information in sp->role.
1046 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1048 slot = __gfn_to_memslot(slots, gfn);
1049 rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1051 __pte_list_remove(spte, rmap_head);
1055 * Used by the following functions to iterate through the sptes linked by a
1056 * rmap. All fields are private and not assumed to be used outside.
1058 struct rmap_iterator {
1059 /* private fields */
1060 struct pte_list_desc *desc; /* holds the sptep if not NULL */
1061 int pos; /* index of the sptep */
1065 * Iteration must be started by this function. This should also be used after
1066 * removing/dropping sptes from the rmap link because in such cases the
1067 * information in the iterator may not be valid.
1069 * Returns sptep if found, NULL otherwise.
1071 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1072 struct rmap_iterator *iter)
1076 if (!rmap_head->val)
1079 if (!(rmap_head->val & 1)) {
1081 sptep = (u64 *)rmap_head->val;
1085 iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1087 sptep = iter->desc->sptes[iter->pos];
1089 BUG_ON(!is_shadow_present_pte(*sptep));
1094 * Must be used with a valid iterator: e.g. after rmap_get_first().
1096 * Returns sptep if found, NULL otherwise.
1098 static u64 *rmap_get_next(struct rmap_iterator *iter)
1103 if (iter->pos < PTE_LIST_EXT - 1) {
1105 sptep = iter->desc->sptes[iter->pos];
1110 iter->desc = iter->desc->more;
1114 /* desc->sptes[0] cannot be NULL */
1115 sptep = iter->desc->sptes[iter->pos];
1122 BUG_ON(!is_shadow_present_pte(*sptep));
1126 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
1127 for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
1128 _spte_; _spte_ = rmap_get_next(_iter_))
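/*
 * Typical usage of the iterator above (illustrative sketch only):
 *
 *	struct rmap_iterator iter;
 *	u64 *sptep;
 *
 *	for_each_rmap_spte(rmap_head, &iter, sptep)
 *		flush |= spte_write_protect(sptep, pt_protect);
 *
 * which is exactly the pattern rmap_write_protect() uses further down.
 */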
1130 static void drop_spte(struct kvm *kvm, u64 *sptep)
1132 u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
1134 if (is_shadow_present_pte(old_spte))
1135 rmap_remove(kvm, sptep);
1139 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1141 if (is_large_pte(*sptep)) {
1142 WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
1143 drop_spte(kvm, sptep);
1150 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1152 if (__drop_large_spte(vcpu->kvm, sptep)) {
1153 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
1155 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1156 KVM_PAGES_PER_HPAGE(sp->role.level));
1161 * Write-protect the specified @sptep; @pt_protect indicates whether the
1162 * write-protection is caused by protecting a shadow page table.
1164 * Note: write protection differs between dirty logging and spte
1166 * - for dirty logging, the spte can be set to writable at anytime if
1167 * its dirty bitmap is properly set.
1168 * - for spte protection, the spte can be writable only after unsync-ing
1171 * Return true if the tlb needs to be flushed.
1173 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1177 if (!is_writable_pte(spte) &&
1178 !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1181 rmap_printk("spte %p %llx\n", sptep, *sptep);
1184 spte &= ~shadow_mmu_writable_mask;
1185 spte = spte & ~PT_WRITABLE_MASK;
1187 return mmu_spte_update(sptep, spte);
1190 static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
1194 struct rmap_iterator iter;
1197 for_each_rmap_spte(rmap_head, &iter, sptep)
1198 flush |= spte_write_protect(sptep, pt_protect);
1203 static bool spte_clear_dirty(u64 *sptep)
1207 rmap_printk("spte %p %llx\n", sptep, *sptep);
1209 MMU_WARN_ON(!spte_ad_enabled(spte));
1210 spte &= ~shadow_dirty_mask;
1211 return mmu_spte_update(sptep, spte);
1214 static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1216 bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1217 (unsigned long *)sptep);
1218 if (was_writable && !spte_ad_enabled(*sptep))
1219 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1221 return was_writable;
1225 * Gets the GFN ready for another round of dirty logging by clearing the
1226 * - D bit on ad-enabled SPTEs, and
1227 * - W bit on ad-disabled SPTEs.
1228 * Returns true iff any D or W bits were cleared.
1230 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1231 const struct kvm_memory_slot *slot)
1234 struct rmap_iterator iter;
1237 for_each_rmap_spte(rmap_head, &iter, sptep)
1238 if (spte_ad_need_write_protect(*sptep))
1239 flush |= spte_wrprot_for_clear_dirty(sptep);
1241 flush |= spte_clear_dirty(sptep);
1247 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1248 * @kvm: kvm instance
1249 * @slot: slot to protect
1250 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1251 * @mask: indicates which pages we should protect
1253 * Used when we do not need to care about huge page mappings.
1255 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1256 struct kvm_memory_slot *slot,
1257 gfn_t gfn_offset, unsigned long mask)
1259 struct kvm_rmap_head *rmap_head;
1261 if (is_tdp_mmu_enabled(kvm))
1262 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1263 slot->base_gfn + gfn_offset, mask, true);
1265 if (!kvm_memslots_have_rmaps(kvm))
1269 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1271 rmap_write_protect(rmap_head, false);
1273 /* clear the first set bit */
1279 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1280 * protect the page if the D-bit isn't supported.
1281 * @kvm: kvm instance
1282 * @slot: slot to clear D-bit
1283 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1284 * @mask: indicates which pages we should clear D-bit
1286 * Used for PML to re-log the dirty GPAs after userspace queries the dirty_bitmap.
1288 static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1289 struct kvm_memory_slot *slot,
1290 gfn_t gfn_offset, unsigned long mask)
1292 struct kvm_rmap_head *rmap_head;
1294 if (is_tdp_mmu_enabled(kvm))
1295 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1296 slot->base_gfn + gfn_offset, mask, false);
1298 if (!kvm_memslots_have_rmaps(kvm))
1302 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1304 __rmap_clear_dirty(kvm, rmap_head, slot);
1306 /* clear the first set bit */
1312 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1315 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1316 * enable dirty logging for them.
1318 * We need to care about huge page mappings: e.g. during dirty logging we may
1319 * have such mappings.
1321 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1322 struct kvm_memory_slot *slot,
1323 gfn_t gfn_offset, unsigned long mask)
1326 * Huge pages are NOT write protected when we start dirty logging in
1327 * initially-all-set mode; must write protect them here so that they
1328 * are split to 4K on the first write.
1330 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
1331 * of memslot has no such restriction, so the range can cross two large
1334 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
1335 gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
1336 gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
1338 if (READ_ONCE(eager_page_split))
1339 kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
1341 kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
1343 /* Cross two large pages? */
1344 if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
1345 ALIGN(end << PAGE_SHIFT, PMD_SIZE))
1346 kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
1350 /* Now handle 4K PTEs. */
1351 if (kvm_x86_ops.cpu_dirty_log_size)
1352 kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
1354 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1357 int kvm_cpu_dirty_log_size(void)
1359 return kvm_x86_ops.cpu_dirty_log_size;
1362 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1363 struct kvm_memory_slot *slot, u64 gfn,
1366 struct kvm_rmap_head *rmap_head;
1368 bool write_protected = false;
1370 if (kvm_memslots_have_rmaps(kvm)) {
1371 for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
1372 rmap_head = gfn_to_rmap(gfn, i, slot);
1373 write_protected |= rmap_write_protect(rmap_head, true);
1377 if (is_tdp_mmu_enabled(kvm))
1379 kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
1381 return write_protected;
1384 static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
1386 struct kvm_memory_slot *slot;
1388 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1389 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
1392 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1393 const struct kvm_memory_slot *slot)
1395 return pte_list_destroy(kvm, rmap_head);
1398 static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1399 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1402 return kvm_zap_rmapp(kvm, rmap_head, slot);
1405 static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1406 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1410 struct rmap_iterator iter;
1411 bool need_flush = false;
1415 WARN_ON(pte_huge(pte));
1416 new_pfn = pte_pfn(pte);
1419 for_each_rmap_spte(rmap_head, &iter, sptep) {
1420 rmap_printk("spte %p %llx gfn %llx (%d)\n",
1421 sptep, *sptep, gfn, level);
1425 if (pte_write(pte)) {
1426 pte_list_remove(kvm, rmap_head, sptep);
1429 new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1432 mmu_spte_clear_track_bits(kvm, sptep);
1433 mmu_spte_set(sptep, new_spte);
1437 if (need_flush && kvm_available_flush_tlb_with_range()) {
1438 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1445 struct slot_rmap_walk_iterator {
1447 const struct kvm_memory_slot *slot;
1453 /* output fields. */
1455 struct kvm_rmap_head *rmap;
1458 /* private field. */
1459 struct kvm_rmap_head *end_rmap;
1463 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1465 iterator->level = level;
1466 iterator->gfn = iterator->start_gfn;
1467 iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
1468 iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
1472 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1473 const struct kvm_memory_slot *slot, int start_level,
1474 int end_level, gfn_t start_gfn, gfn_t end_gfn)
1476 iterator->slot = slot;
1477 iterator->start_level = start_level;
1478 iterator->end_level = end_level;
1479 iterator->start_gfn = start_gfn;
1480 iterator->end_gfn = end_gfn;
1482 rmap_walk_init_level(iterator, iterator->start_level);
1485 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1487 return !!iterator->rmap;
1490 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1492 if (++iterator->rmap <= iterator->end_rmap) {
1493 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1497 if (++iterator->level > iterator->end_level) {
1498 iterator->rmap = NULL;
1502 rmap_walk_init_level(iterator, iterator->level);
1505 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
1506 _start_gfn, _end_gfn, _iter_) \
1507 for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
1508 _end_level_, _start_gfn, _end_gfn); \
1509 slot_rmap_walk_okay(_iter_); \
1510 slot_rmap_walk_next(_iter_))
1512 typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1513 struct kvm_memory_slot *slot, gfn_t gfn,
1514 int level, pte_t pte);
1516 static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
1517 struct kvm_gfn_range *range,
1518 rmap_handler_t handler)
1520 struct slot_rmap_walk_iterator iterator;
1523 for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
1524 range->start, range->end - 1, &iterator)
1525 ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
1526 iterator.level, range->pte);
1531 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1535 if (kvm_memslots_have_rmaps(kvm))
1536 flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
1538 if (is_tdp_mmu_enabled(kvm))
1539 flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
1544 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1548 if (kvm_memslots_have_rmaps(kvm))
1549 flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
1551 if (is_tdp_mmu_enabled(kvm))
1552 flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
1557 static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1558 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1562 struct rmap_iterator iter;
1565 for_each_rmap_spte(rmap_head, &iter, sptep)
1566 young |= mmu_spte_age(sptep);
1571 static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1572 struct kvm_memory_slot *slot, gfn_t gfn,
1573 int level, pte_t unused)
1576 struct rmap_iterator iter;
1578 for_each_rmap_spte(rmap_head, &iter, sptep)
1579 if (is_accessed_spte(*sptep))
1584 #define RMAP_RECYCLE_THRESHOLD 1000
1586 static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
1587 u64 *spte, gfn_t gfn)
1589 struct kvm_mmu_page *sp;
1590 struct kvm_rmap_head *rmap_head;
1593 sp = sptep_to_sp(spte);
1594 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1595 rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1596 rmap_count = pte_list_add(vcpu, spte, rmap_head);
1598 if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
1599 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
1600 kvm_flush_remote_tlbs_with_address(
1601 vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
1605 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1609 if (kvm_memslots_have_rmaps(kvm))
1610 young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
1612 if (is_tdp_mmu_enabled(kvm))
1613 young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
1618 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1622 if (kvm_memslots_have_rmaps(kvm))
1623 young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
1625 if (is_tdp_mmu_enabled(kvm))
1626 young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
1632 static int is_empty_shadow_page(u64 *spt)
1637 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
1638 if (is_shadow_present_pte(*pos)) {
1639 printk(KERN_ERR "%s: %p %llx\n", __func__,
1648 * This value is the sum of all of the kvm instances'
1649 * kvm->arch.n_used_mmu_pages values. We need a global,
1650 * aggregate version in order to make the slab shrinker
1653 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
1655 kvm->arch.n_used_mmu_pages += nr;
1656 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1659 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1661 MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
1662 hlist_del(&sp->hash_link);
1663 list_del(&sp->link);
1664 free_page((unsigned long)sp->spt);
1665 if (!sp->role.direct)
1666 free_page((unsigned long)sp->gfns);
1667 kmem_cache_free(mmu_page_header_cache, sp);
1670 static unsigned kvm_page_table_hashfn(gfn_t gfn)
1672 return hash_64(gfn, KVM_MMU_HASH_SHIFT);
1675 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1676 struct kvm_mmu_page *sp, u64 *parent_pte)
1681 pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
1684 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1687 __pte_list_remove(parent_pte, &sp->parent_ptes);
1690 static void drop_parent_pte(struct kvm_mmu_page *sp,
1693 mmu_page_remove_parent_pte(sp, parent_pte);
1694 mmu_spte_clear_no_track(parent_pte);
1697 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
1699 struct kvm_mmu_page *sp;
1701 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
1702 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
1704 sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
1705 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1708 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
1709 * depends on valid pages being added to the head of the list. See
1710 * comments in kvm_zap_obsolete_pages().
1712 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
1713 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1714 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1718 static void mark_unsync(u64 *spte);
1719 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1722 struct rmap_iterator iter;
1724 for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
1729 static void mark_unsync(u64 *spte)
1731 struct kvm_mmu_page *sp;
1734 sp = sptep_to_sp(spte);
1735 index = spte - sp->spt;
1736 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1738 if (sp->unsync_children++)
1740 kvm_mmu_mark_parents_unsync(sp);
1743 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1744 struct kvm_mmu_page *sp)
1749 #define KVM_PAGE_ARRAY_NR 16
1751 struct kvm_mmu_pages {
1752 struct mmu_page_and_offset {
1753 struct kvm_mmu_page *sp;
1755 } page[KVM_PAGE_ARRAY_NR];
1759 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1765 for (i = 0; i < pvec->nr; i++)
1766 if (pvec->page[i].sp == sp)
1769 pvec->page[pvec->nr].sp = sp;
1770 pvec->page[pvec->nr].idx = idx;
1772 return (pvec->nr == KVM_PAGE_ARRAY_NR);
1775 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
1777 --sp->unsync_children;
1778 WARN_ON((int)sp->unsync_children < 0);
1779 __clear_bit(idx, sp->unsync_child_bitmap);
1782 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1783 struct kvm_mmu_pages *pvec)
1785 int i, ret, nr_unsync_leaf = 0;
1787 for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
1788 struct kvm_mmu_page *child;
1789 u64 ent = sp->spt[i];
1791 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
1792 clear_unsync_child_bit(sp, i);
1796 child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
1798 if (child->unsync_children) {
1799 if (mmu_pages_add(pvec, child, i))
1802 ret = __mmu_unsync_walk(child, pvec);
1804 clear_unsync_child_bit(sp, i);
1806 } else if (ret > 0) {
1807 nr_unsync_leaf += ret;
1810 } else if (child->unsync) {
1812 if (mmu_pages_add(pvec, child, i))
1815 clear_unsync_child_bit(sp, i);
1818 return nr_unsync_leaf;
1821 #define INVALID_INDEX (-1)
1823 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1824 struct kvm_mmu_pages *pvec)
1827 if (!sp->unsync_children)
1830 mmu_pages_add(pvec, sp, INVALID_INDEX);
1831 return __mmu_unsync_walk(sp, pvec);
1834 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1836 WARN_ON(!sp->unsync);
1837 trace_kvm_mmu_sync_page(sp);
1839 --kvm->stat.mmu_unsync;
1842 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1843 struct list_head *invalid_list);
1844 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1845 struct list_head *invalid_list);
1847 #define for_each_valid_sp(_kvm, _sp, _list) \
1848 hlist_for_each_entry(_sp, _list, hash_link) \
1849 if (is_obsolete_sp((_kvm), (_sp))) { \
1852 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
1853 for_each_valid_sp(_kvm, _sp, \
1854 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
1855 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
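/*
 * Usage sketch (illustrative): walking every valid indirect shadow page for
 * a gfn, the way kvm_mmu_unprotect_page() does at the end of this section:
 *
 *	for_each_gfn_indirect_valid_sp(kvm, sp, gfn)
 *		r |= !!kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 */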
1857 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1858 struct list_head *invalid_list)
1860 int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
1863 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1867 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
1868 struct list_head *invalid_list,
1871 if (!remote_flush && list_empty(invalid_list))
1874 if (!list_empty(invalid_list))
1875 kvm_mmu_commit_zap_page(kvm, invalid_list);
1877 kvm_flush_remote_tlbs(kvm);
1881 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
1883 if (sp->role.invalid)
1886 /* TDP MMU pages do not use the MMU generation. */
1887 return !sp->tdp_mmu_page &&
1888 unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
1891 struct mmu_page_path {
1892 struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
1893 unsigned int idx[PT64_ROOT_MAX_LEVEL];
1896 #define for_each_sp(pvec, sp, parents, i) \
1897 for (i = mmu_pages_first(&pvec, &parents); \
1898 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
1899 i = mmu_pages_next(&pvec, &parents, i))
1901 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1902 struct mmu_page_path *parents,
1907 for (n = i+1; n < pvec->nr; n++) {
1908 struct kvm_mmu_page *sp = pvec->page[n].sp;
1909 unsigned idx = pvec->page[n].idx;
1910 int level = sp->role.level;
1912 parents->idx[level-1] = idx;
1913 if (level == PG_LEVEL_4K)
1916 parents->parent[level-2] = sp;
1922 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
1923 struct mmu_page_path *parents)
1925 struct kvm_mmu_page *sp;
1931 WARN_ON(pvec->page[0].idx != INVALID_INDEX);
1933 sp = pvec->page[0].sp;
1934 level = sp->role.level;
1935 WARN_ON(level == PG_LEVEL_4K);
1937 parents->parent[level-2] = sp;
1939 /* Also set up a sentinel. Further entries in pvec are all
1940 * children of sp, so this element is never overwritten.
1942 parents->parent[level-1] = NULL;
1943 return mmu_pages_next(pvec, parents, 0);
1946 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1948 struct kvm_mmu_page *sp;
1949 unsigned int level = 0;
1952 unsigned int idx = parents->idx[level];
1953 sp = parents->parent[level];
1957 WARN_ON(idx == INVALID_INDEX);
1958 clear_unsync_child_bit(sp, idx);
1960 } while (!sp->unsync_children);
1963 static int mmu_sync_children(struct kvm_vcpu *vcpu,
1964 struct kvm_mmu_page *parent, bool can_yield)
1967 struct kvm_mmu_page *sp;
1968 struct mmu_page_path parents;
1969 struct kvm_mmu_pages pages;
1970 LIST_HEAD(invalid_list);
1973 while (mmu_unsync_walk(parent, &pages)) {
1974 bool protected = false;
1976 for_each_sp(pages, sp, parents, i)
1977 protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
1980 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
1984 for_each_sp(pages, sp, parents, i) {
1985 kvm_unlink_unsync_page(vcpu->kvm, sp);
1986 flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
1987 mmu_pages_clear_parents(&parents);
1989 if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
1990 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
1992 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1996 cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
2001 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
2005 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2007 atomic_set(&sp->write_flooding_count, 0);
2010 static void clear_sp_write_flooding_count(u64 *spte)
2012 __clear_sp_write_flooding_count(sptep_to_sp(spte));
2015 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2020 unsigned int access)
2022 bool direct_mmu = vcpu->arch.mmu->direct_map;
2023 union kvm_mmu_page_role role;
2024 struct hlist_head *sp_list;
2026 struct kvm_mmu_page *sp;
2029 LIST_HEAD(invalid_list);
2031 role = vcpu->arch.mmu->mmu_role.base;
2033 role.direct = direct;
2034 role.access = access;
2035 if (role.has_4_byte_gpte) {
2036 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2037 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2038 role.quadrant = quadrant;
2041 sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
2042 for_each_valid_sp(vcpu->kvm, sp, sp_list) {
2043 if (sp->gfn != gfn) {
2048 if (sp->role.word != role.word) {
2050 * If the guest is creating an upper-level page, zap
2051 * unsync pages for the same gfn. While it's possible
2052 * the guest is using recursive page tables, in all
2053 * likelihood the guest has stopped using the unsync
2054 * page and is installing a completely unrelated page.
2055 * Unsync pages must not be left as is, because the new
2056 * upper-level page will be write-protected.
2058 if (level > PG_LEVEL_4K && sp->unsync)
2059 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2065 goto trace_get_page;
2069 * The page is good, but is stale. kvm_sync_page does
2070 * get the latest guest state, but (unlike mmu_unsync_children)
2071 * it doesn't write-protect the page or mark it synchronized!
2072 * This way the validity of the mapping is ensured, but the
2073 * overhead of write protection is not incurred until the
2074 * guest invalidates the TLB mapping. This allows multiple
2075 * SPs for a single gfn to be unsync.
2077 * If the sync fails, the page is zapped. If so, break
2078 * in order to rebuild it.
2080 ret = kvm_sync_page(vcpu, sp, &invalid_list);
2084 WARN_ON(!list_empty(&invalid_list));
2086 kvm_flush_remote_tlbs(vcpu->kvm);
2089 __clear_sp_write_flooding_count(sp);
2092 trace_kvm_mmu_get_page(sp, false);
2096 ++vcpu->kvm->stat.mmu_cache_miss;
2098 sp = kvm_mmu_alloc_page(vcpu, direct);
2102 hlist_add_head(&sp->hash_link, sp_list);
2104 account_shadowed(vcpu->kvm, sp);
2105 if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
2106 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2108 trace_kvm_mmu_get_page(sp, true);
2110 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2112 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2113 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2117 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2118 struct kvm_vcpu *vcpu, hpa_t root,
2121 iterator->addr = addr;
2122 iterator->shadow_addr = root;
2123 iterator->level = vcpu->arch.mmu->shadow_root_level;
2125 if (iterator->level >= PT64_ROOT_4LEVEL &&
2126 vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2127 !vcpu->arch.mmu->direct_map)
2128 iterator->level = PT32E_ROOT_LEVEL;
2130 if (iterator->level == PT32E_ROOT_LEVEL) {
2132 * prev_root is currently only used for 64-bit hosts. So only
2133 * the active root_hpa is valid here.
2135 BUG_ON(root != vcpu->arch.mmu->root.hpa);
2137 iterator->shadow_addr
2138 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2139 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2141 if (!iterator->shadow_addr)
2142 iterator->level = 0;
2146 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2147 struct kvm_vcpu *vcpu, u64 addr)
2149 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
2153 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2155 if (iterator->level < PG_LEVEL_4K)
2158 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2159 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2163 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2166 if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
2167 iterator->level = 0;
2171 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2175 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2177 __shadow_walk_next(iterator, *iterator->sptep);
2180 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2181 struct kvm_mmu_page *sp)
2185 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2187 spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
2189 mmu_spte_set(sptep, spte);
2191 mmu_page_add_parent_pte(vcpu, sp, sptep);
2193 if (sp->unsync_children || sp->unsync)
2197 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2198 unsigned direct_access)
2200 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2201 struct kvm_mmu_page *child;
2204 * For the direct sp, if the guest pte's dirty bit
2205 * changed from clean to dirty, it will corrupt the
2206 * sp's access: writes would be allowed through the read-only sp,
2207 * so we should update the spte at this point to get
2208 * a new sp with the correct access.
2210 child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
2211 if (child->role.access == direct_access)
2214 drop_parent_pte(child, sptep);
2215 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2219 /* Returns the number of zapped non-leaf child shadow pages. */
2220 static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2221 u64 *spte, struct list_head *invalid_list)
2224 struct kvm_mmu_page *child;
2227 if (is_shadow_present_pte(pte)) {
2228 if (is_last_spte(pte, sp->role.level)) {
2229 drop_spte(kvm, spte);
2231 child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
2232 drop_parent_pte(child, spte);
2235 * Recursively zap nested TDP SPs, parentless SPs are
2236 * unlikely to be used again in the near future. This
2237 * avoids retaining a large number of stale nested SPs.
2239 if (tdp_enabled && invalid_list &&
2240 child->role.guest_mode && !child->parent_ptes.val)
2241 return kvm_mmu_prepare_zap_page(kvm, child,
2244 } else if (is_mmio_spte(pte)) {
2245 mmu_spte_clear_no_track(spte);
2250 static int kvm_mmu_page_unlink_children(struct kvm *kvm,
2251 struct kvm_mmu_page *sp,
2252 struct list_head *invalid_list)
2257 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2258 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
2263 static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
2266 struct rmap_iterator iter;
2268 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2269 drop_parent_pte(sp, sptep);
2272 static int mmu_zap_unsync_children(struct kvm *kvm,
2273 struct kvm_mmu_page *parent,
2274 struct list_head *invalid_list)
2277 struct mmu_page_path parents;
2278 struct kvm_mmu_pages pages;
2280 if (parent->role.level == PG_LEVEL_4K)
2283 while (mmu_unsync_walk(parent, &pages)) {
2284 struct kvm_mmu_page *sp;
2286 for_each_sp(pages, sp, parents, i) {
2287 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2288 mmu_pages_clear_parents(&parents);
2296 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2297 struct kvm_mmu_page *sp,
2298 struct list_head *invalid_list,
2301 bool list_unstable, zapped_root = false;
2303 trace_kvm_mmu_prepare_zap_page(sp);
2304 ++kvm->stat.mmu_shadow_zapped;
2305 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2306 *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
2307 kvm_mmu_unlink_parents(sp);
2309 /* Zapping children means active_mmu_pages has become unstable. */
2310 list_unstable = *nr_zapped;
2312 if (!sp->role.invalid && !sp->role.direct)
2313 unaccount_shadowed(kvm, sp);
2316 kvm_unlink_unsync_page(kvm, sp);
2317 if (!sp->root_count) {
2322 * Already invalid pages (previously active roots) are not on
2323 * the active page list. See list_del() in the "else" case of
2326 if (sp->role.invalid)
2327 list_add(&sp->link, invalid_list);
2329 list_move(&sp->link, invalid_list);
2330 kvm_mod_used_mmu_pages(kvm, -1);
2333 * Remove the active root from the active page list, the root
2334 * will be explicitly freed when the root_count hits zero.
2336 list_del(&sp->link);
2339 * Obsolete pages cannot be used on any vCPUs, see the comment
2340 * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
2341 * treats invalid shadow pages as being obsolete.
2343 zapped_root = !is_obsolete_sp(kvm, sp);
2346 if (sp->lpage_disallowed)
2347 unaccount_huge_nx_page(kvm, sp);
2349 sp->role.invalid = 1;
2352 * Make the request to free obsolete roots after marking the root
2353 * invalid, otherwise other vCPUs may not see it as invalid.
2356 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
2357 return list_unstable;
2360 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2361 struct list_head *invalid_list)
2365 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2369 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2370 struct list_head *invalid_list)
2372 struct kvm_mmu_page *sp, *nsp;
2374 if (list_empty(invalid_list))
2378 * We need to make sure everyone sees our modifications to
2379 * the page tables and sees changes to vcpu->mode here. The barrier
2380 * in kvm_flush_remote_tlbs() achieves this. This pairs
2381 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2383 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2384 * guest mode and/or lockless shadow page table walks.
2386 kvm_flush_remote_tlbs(kvm);
2388 list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2389 WARN_ON(!sp->role.invalid || sp->root_count);
2390 kvm_mmu_free_page(sp);
2394 static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
2395 unsigned long nr_to_zap)
2397 unsigned long total_zapped = 0;
2398 struct kvm_mmu_page *sp, *tmp;
2399 LIST_HEAD(invalid_list);
2403 if (list_empty(&kvm->arch.active_mmu_pages))
2407 list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
2409 * Don't zap active root pages, the page itself can't be freed
2410 * and zapping it will just force vCPUs to realloc and reload.
2415 unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
2417 total_zapped += nr_zapped;
2418 if (total_zapped >= nr_to_zap)
2425 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2427 kvm->stat.mmu_recycled += total_zapped;
2428 return total_zapped;
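/*
 * Number of shadow pages the VM may still allocate before reaching the
 * n_max_mmu_pages ceiling, or 0 if the limit has already been hit.
 */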
2431 static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
2433 if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
2434 return kvm->arch.n_max_mmu_pages -
2435 kvm->arch.n_used_mmu_pages;
2440 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2442 unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
2444 if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
2447 kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
2450 * Note, this check is intentionally soft, it only guarantees that one
2451 * page is available, while the caller may end up allocating as many as
2452 * four pages, e.g. for PAE roots or for 5-level paging. Temporarily
2453 * exceeding the (arbitrary by default) limit will not harm the host,
2454 * being too aggressive may unnecessarily kill the guest, and getting an
2455 * exact count is far more trouble than it's worth, especially in the
2458 if (!kvm_mmu_available_pages(vcpu->kvm))
2464 * Changing the number of mmu pages allocated to the vm
2465 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock
2467 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2469 write_lock(&kvm->mmu_lock);
2471 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2472 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
2475 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2478 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2480 write_unlock(&kvm->mmu_lock);
2483 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2485 struct kvm_mmu_page *sp;
2486 LIST_HEAD(invalid_list);
2489 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2491 write_lock(&kvm->mmu_lock);
2492 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2493 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2496 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2498 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2499 write_unlock(&kvm->mmu_lock);
2504 static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2509 if (vcpu->arch.mmu->direct_map)
2512 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2514 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2519 static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2521 trace_kvm_mmu_unsync_page(sp);
2522 ++kvm->stat.mmu_unsync;
2525 kvm_mmu_mark_parents_unsync(sp);
2529 * Attempt to unsync any shadow pages that can be reached by the specified gfn,
2530 * as KVM is creating a writable mapping for said gfn. Returns 0 if all pages
2531 * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
2532 * be write-protected.
2534 int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
2535 gfn_t gfn, bool can_unsync, bool prefetch)
2537 struct kvm_mmu_page *sp;
2538 bool locked = false;
2541 * Force write-protection if the page is being tracked. Note, the page
2542 * track machinery is used to write-protect upper-level shadow pages,
2543 * i.e. this guards the role.level == 4K assertion below!
2545 if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
2549 * The page is not write-tracked, mark existing shadow pages unsync
2550 * unless KVM is synchronizing an unsync SP (can_unsync = false). In
2551 * that case, KVM must complete emulation of the guest TLB flush before
2552 * allowing shadow pages to become unsync (writable by the guest).
2554 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2565 * TDP MMU page faults require an additional spinlock as they
2566 * run with mmu_lock held for read, not write, and the unsync
2567 * logic is not thread safe. Take the spinlock regardless of
2568 * the MMU type to avoid extra conditionals/parameters, there's
2569 * no meaningful penalty if mmu_lock is held for write.
2573 spin_lock(&kvm->arch.mmu_unsync_pages_lock);
2576 * Recheck after taking the spinlock, a different vCPU
2577 * may have since marked the page unsync. A false
2578 * positive on the unprotected check above is not
2579 * possible as clearing sp->unsync _must_ hold mmu_lock
2580 * for write, i.e. unsync cannot transition from 0->1
2581 * while this CPU holds mmu_lock for read (or write).
2583 if (READ_ONCE(sp->unsync))
2587 WARN_ON(sp->role.level != PG_LEVEL_4K);
2588 kvm_unsync_page(kvm, sp);
2591 spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
2594 * We need to ensure that the marking of unsync pages is visible
2595 * before the SPTE is updated to allow writes because
2596 * kvm_mmu_sync_roots() checks the unsync flags without holding
2597 * the MMU lock and so can race with this. If the SPTE was updated
2598 * before the page had been marked as unsync-ed, something like the
2599 * following could happen:
2602 * ---------------------------------------------------------------------
2603 * 1.2 Host updates SPTE
2605 * 2.1 Guest writes a GPTE for GVA X.
2606 * (GPTE being in the guest page table shadowed
2607 * by the SP from CPU 1.)
2608 * This reads SPTE during the page table walk.
2609 * Since SPTE.W is read as 1, there is no
2612 * 2.2 Guest issues TLB flush.
2613 * That causes a VM Exit.
2615 * 2.3 Walking of unsync pages sees sp->unsync is
2616 * false and skips the page.
2618 * 2.4 Guest accesses GVA X.
2619 * Since the mapping in the SP was not updated,
2620 * the old mapping for GVA X incorrectly
2624 * (sp->unsync = true)
2626 * The write barrier below ensures that 1.1 happens before 1.2 and thus
2627 * the situation in 2.4 does not arise. It pairs with the read barrier
2628 * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
2635 static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
2636 u64 *sptep, unsigned int pte_access, gfn_t gfn,
2637 kvm_pfn_t pfn, struct kvm_page_fault *fault)
2639 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
2640 int level = sp->role.level;
2641 int was_rmapped = 0;
2642 int ret = RET_PF_FIXED;
2647 /* Prefetching always gets a writable pfn. */
2648 bool host_writable = !fault || fault->map_writable;
2649 bool prefetch = !fault || fault->prefetch;
2650 bool write_fault = fault && fault->write;
2652 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
2653 *sptep, write_fault, gfn);
2655 if (unlikely(is_noslot_pfn(pfn))) {
2656 mark_mmio_spte(vcpu, sptep, gfn, pte_access);
2657 return RET_PF_EMULATE;
2660 if (is_shadow_present_pte(*sptep)) {
2662 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2663 * the parent of the now unreachable PTE.
2665 if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
2666 struct kvm_mmu_page *child;
2669 child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
2670 drop_parent_pte(child, sptep);
2672 } else if (pfn != spte_to_pfn(*sptep)) {
2673 pgprintk("hfn old %llx new %llx\n",
2674 spte_to_pfn(*sptep), pfn);
2675 drop_spte(vcpu->kvm, sptep);
2681 wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
2682 true, host_writable, &spte);
2684 if (*sptep == spte) {
2685 ret = RET_PF_SPURIOUS;
2687 flush |= mmu_spte_update(sptep, spte);
2688 trace_kvm_mmu_set_spte(level, gfn, sptep);
2693 ret = RET_PF_EMULATE;
2697 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
2698 KVM_PAGES_PER_HPAGE(level));
2700 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2703 WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
2704 kvm_update_page_stats(vcpu->kvm, level, 1);
2705 rmap_add(vcpu, slot, sptep, gfn);
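/*
 * Prefetch mappings for the sptes in [start, end): the memslot is looked
 * up once using the gfn of the first spte, the backing pages are grabbed
 * atomically, and each spte is installed via mmu_set_spte() with the
 * shadow page's access rights.  If either lookup fails, the prefetch is
 * abandoned (negative return) rather than falling back to a slow path.
 */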
2711 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2712 struct kvm_mmu_page *sp,
2713 u64 *start, u64 *end)
2715 struct page *pages[PTE_PREFETCH_NUM];
2716 struct kvm_memory_slot *slot;
2717 unsigned int access = sp->role.access;
2721 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2722 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
2726 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
2730 for (i = 0; i < ret; i++, gfn++, start++) {
2731 mmu_set_spte(vcpu, slot, start, access, gfn,
2732 page_to_pfn(pages[i]), NULL);
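/*
 * Walk the PTE_PREFETCH_NUM-aligned window of sptes around the faulting
 * @sptep and prefetch each run of not-yet-present sptes; the faulting
 * entry itself and already-present entries terminate a run.
 */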
2739 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2740 struct kvm_mmu_page *sp, u64 *sptep)
2742 u64 *spte, *start = NULL;
2745 WARN_ON(!sp->role.direct);
2747 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2750 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2751 if (is_shadow_present_pte(*spte) || spte == sptep) {
2754 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2761 direct_pte_prefetch_many(vcpu, sp, start, spte);
2764 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2766 struct kvm_mmu_page *sp;
2768 sp = sptep_to_sp(sptep);
2771 * Without accessed bits, there's no way to distinguish between
2772 * actually accessed translations and prefetched ones, so disable pte
2773 * prefetch if accessed bits aren't available.
2775 if (sp_ad_disabled(sp))
2778 if (sp->role.level > PG_LEVEL_4K)
2782 * If addresses are being invalidated, skip prefetching to avoid
2783 * accidentally prefetching those addresses.
2785 if (unlikely(vcpu->kvm->mmu_notifier_count))
2788 __direct_pte_prefetch(vcpu, sp, sptep);
2791 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
2792 const struct kvm_memory_slot *slot)
2795 unsigned long flags;
2796 int level = PG_LEVEL_4K;
2802 if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
2806 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
2807 * is not solely for performance, it's also necessary to avoid the
2808 * "writable" check in __gfn_to_hva_many(), which will always fail on
2809 * read-only memslots due to gfn_to_hva() assuming writes. Earlier
2810 * page fault steps have already verified the guest isn't writing a
2811 * read-only memslot.
2813 hva = __gfn_to_hva_memslot(slot, gfn);
2816 * Lookup the mapping level in the current mm. The information
2817 * may become stale soon, but it is safe to use as long as
2818 * 1) mmu_notifier_retry was checked after taking mmu_lock, and
2819 * 2) mmu_lock is taken now.
2821 * We still need to disable IRQs to prevent concurrent tear down
2824 local_irq_save(flags);
2826 pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
2830 p4d = READ_ONCE(*p4d_offset(&pgd, hva));
2831 if (p4d_none(p4d) || !p4d_present(p4d))
2834 pud = READ_ONCE(*pud_offset(&p4d, hva));
2835 if (pud_none(pud) || !pud_present(pud))
2838 if (pud_large(pud)) {
2839 level = PG_LEVEL_1G;
2843 pmd = READ_ONCE(*pmd_offset(&pud, hva));
2844 if (pmd_none(pmd) || !pmd_present(pmd))
2848 level = PG_LEVEL_2M;
2851 local_irq_restore(flags);
2855 int kvm_mmu_max_mapping_level(struct kvm *kvm,
2856 const struct kvm_memory_slot *slot, gfn_t gfn,
2857 kvm_pfn_t pfn, int max_level)
2859 struct kvm_lpage_info *linfo;
2862 max_level = min(max_level, max_huge_page_level);
2863 for ( ; max_level > PG_LEVEL_4K; max_level--) {
2864 linfo = lpage_info_slot(gfn, slot, max_level);
2865 if (!linfo->disallow_lpage)
2869 if (max_level == PG_LEVEL_4K)
2872 host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
2873 return min(host_level, max_level);
2876 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
2878 struct kvm_memory_slot *slot = fault->slot;
2881 fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
2883 if (unlikely(fault->max_level == PG_LEVEL_4K))
2886 if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
2889 if (kvm_slot_dirty_track_enabled(slot))
2893 * Enforce the iTLB multihit workaround after capturing the requested
2894 * level, which will be used to do precise, accurate accounting.
2896 fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
2897 fault->gfn, fault->pfn,
2899 if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
2903 * mmu_notifier_retry() was successful and mmu_lock is held, so
2904 * the pmd can't be split from under us.
2906 fault->goal_level = fault->req_level;
2907 mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
2908 VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
2909 fault->pfn &= ~mask;
2912 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
2914 if (cur_level > PG_LEVEL_4K &&
2915 cur_level == fault->goal_level &&
2916 is_shadow_present_pte(spte) &&
2917 !is_large_pte(spte)) {
2919 * A small SPTE exists for this pfn, but FNAME(fetch)
2920 * and __direct_map would like to create a large PTE
2921 * instead: just force them to go down another level,
2922 * patching the next 9 bits of the address back into pfn for them.
2925 u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
2926 KVM_PAGES_PER_HPAGE(cur_level - 1);
2927 fault->pfn |= fault->gfn & page_mask;
2928 fault->goal_level--;
2932 static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
2934 struct kvm_shadow_walk_iterator it;
2935 struct kvm_mmu_page *sp;
2937 gfn_t base_gfn = fault->gfn;
2939 kvm_mmu_hugepage_adjust(vcpu, fault);
2941 trace_kvm_mmu_spte_requested(fault);
2942 for_each_shadow_entry(vcpu, fault->addr, it) {
2944 * We cannot overwrite existing page tables with an NX
2945 * large page, as the leaf could be executable.
2947 if (fault->nx_huge_page_workaround_enabled)
2948 disallowed_hugepage_adjust(fault, *it.sptep, it.level);
2950 base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
2951 if (it.level == fault->goal_level)
2954 drop_large_spte(vcpu, it.sptep);
2955 if (is_shadow_present_pte(*it.sptep))
2958 sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
2959 it.level - 1, true, ACC_ALL);
2961 link_shadow_page(vcpu, it.sptep, sp);
2962 if (fault->is_tdp && fault->huge_page_disallowed &&
2963 fault->req_level >= it.level)
2964 account_huge_nx_page(vcpu->kvm, sp);
2967 if (WARN_ON_ONCE(it.level != fault->goal_level))
2970 ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
2971 base_gfn, fault->pfn, fault);
2972 if (ret == RET_PF_SPURIOUS)
2975 direct_pte_prefetch(vcpu, it.sptep);
2976 ++vcpu->stat.pf_fixed;
2980 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2982 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
2985 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
2988 * Do not cache the mmio info caused by writing the readonly gfn
2989 * into the spte, otherwise a read access on the readonly gfn can also
2990 * cause an mmio page fault and be treated as an mmio access.
2992 if (pfn == KVM_PFN_ERR_RO_FAULT)
2993 return RET_PF_EMULATE;
2995 if (pfn == KVM_PFN_ERR_HWPOISON) {
2996 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
2997 return RET_PF_RETRY;
3003 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3004 unsigned int access, int *ret_val)
3006 /* The pfn is invalid, report the error! */
3007 if (unlikely(is_error_pfn(fault->pfn))) {
3008 *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
3012 if (unlikely(!fault->slot)) {
3013 gva_t gva = fault->is_tdp ? 0 : fault->addr;
3015 vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
3016 access & shadow_mmio_access_mask);
3018 * If MMIO caching is disabled, emulate immediately without
3019 * touching the shadow page tables as attempting to install an
3020 * MMIO SPTE will just be an expensive nop. Do not cache MMIO
3021 * whose gfn is greater than host.MAXPHYADDR, any guest that
3022 * generates such gfns is running nested and is being tricked
3023 * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
3024 * and only if L1's MAXPHYADDR is inaccurate with respect to
3027 if (unlikely(!enable_mmio_caching) ||
3028 unlikely(fault->gfn > kvm_mmu_max_gfn())) {
3029 *ret_val = RET_PF_EMULATE;
3037 static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
3040 * Do not fix the mmio spte with invalid generation number which
3041 * need to be updated by slow page fault path.
3046 /* See if the page fault is due to an NX violation */
3047 if (unlikely(fault->exec && fault->present))
3051 * #PF can be fast if:
3052 * 1. The shadow page table entry is not present, which could mean that
3053 * the fault is potentially caused by access tracking (if enabled).
3054 * 2. The shadow page table entry is present and the fault
3055 * is caused by write-protect, so we just need to change the W
3056 * bit of the spte, which can be done out of mmu-lock.
3058 * However, if access tracking is disabled we know that a non-present
3059 * page must be a genuine page fault where we have to create a new SPTE.
3060 * So, if access tracking is disabled, we return true only for write
3061 * accesses to a present page.
3064 return shadow_acc_track_mask != 0 || (fault->write && fault->present);
3068 * Returns true if the SPTE was fixed successfully. Otherwise,
3069 * someone else modified the SPTE from its original value.
3072 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3073 u64 *sptep, u64 old_spte, u64 new_spte)
3076 * Theoretically we could also set dirty bit (and flush TLB) here in
3077 * order to eliminate unnecessary PML logging. See comments in
3078 * set_spte. But fast_page_fault is very unlikely to happen with PML
3079 * enabled, so we do not do this. This might result in the same GPA
3080 * to be logged in the PML buffer again when the write really happens, and
3081 * mark_page_dirty ends up being called twice on it. But it's also no
3082 * harm. This also avoids the TLB flush needed after setting dirty bit
3083 * so non-PML cases won't be impacted.
3085 * Compare with set_spte where instead shadow_dirty_mask is set.
3087 if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3090 if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
3091 mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
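/*
 * Returns true if the access that triggered the fault (fetch, write or
 * read) is already permitted by @spte, i.e. replaying the access now
 * would not fault.
 */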
3096 static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
3099 return is_executable_pte(spte);
3102 return is_writable_pte(spte);
3104 /* Fault was on Read access */
3105 return spte & PT_PRESENT_MASK;
3109 * Returns the last level spte pointer of the shadow page walk for the given
3110 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
3111 * walk could be performed, returns NULL and *spte does not contain valid data.
3114 * - Must be called between walk_shadow_page_lockless_{begin,end}.
3115 * - The returned sptep must not be used after walk_shadow_page_lockless_end.
3117 static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
3119 struct kvm_shadow_walk_iterator iterator;
3123 for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
3124 sptep = iterator.sptep;
3132 * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
3134 static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3136 struct kvm_mmu_page *sp;
3137 int ret = RET_PF_INVALID;
3140 uint retry_count = 0;
3142 if (!page_fault_can_be_fast(fault))
3145 walk_shadow_page_lockless_begin(vcpu);
3150 if (is_tdp_mmu(vcpu->arch.mmu))
3151 sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3153 sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3155 if (!is_shadow_present_pte(spte))
3158 sp = sptep_to_sp(sptep);
3159 if (!is_last_spte(spte, sp->role.level))
3163 * Check whether the memory access that caused the fault would
3164 * still cause it if it were to be performed right now. If not,
3165 * then this is a spurious fault caused by a lazily flushed TLB,
3166 * or some other CPU has already fixed the PTE after the
3167 * current CPU took the fault.
3169 * Need not check the access of upper level table entries since
3170 * they are always ACC_ALL.
3172 if (is_access_allowed(fault, spte)) {
3173 ret = RET_PF_SPURIOUS;
3179 if (is_access_track_spte(spte))
3180 new_spte = restore_acc_track_spte(new_spte);
3183 * Currently, to simplify the code, write-protection can
3184 * be removed in the fast path only if the SPTE was
3185 * write-protected for dirty-logging or access tracking.
3188 spte_can_locklessly_be_made_writable(spte)) {
3189 new_spte |= PT_WRITABLE_MASK;
3192 * Do not fix write-permission on the large spte when
3193 * dirty logging is enabled. Since we only dirty the
3194 * first page into the dirty-bitmap in
3195 * fast_pf_fix_direct_spte(), other pages are missed
3196 * if its slot has dirty logging enabled.
3198 * Instead, we let the slow page fault path create a
3199 * normal spte to fix the access.
3201 if (sp->role.level > PG_LEVEL_4K &&
3202 kvm_slot_dirty_track_enabled(fault->slot))
3206 /* Verify that the fault can be handled in the fast path */
3207 if (new_spte == spte ||
3208 !is_access_allowed(fault, new_spte))
3212 * Currently, fast page fault only works for direct mapping
3213 * since the gfn is not stable for indirect shadow page. See
3214 * Documentation/virt/kvm/locking.rst to get more detail.
3216 if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
3221 if (++retry_count > 4) {
3222 printk_once(KERN_WARNING
3223 "kvm: Fast #PF retrying more than 4 times.\n");
3229 trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
3230 walk_shadow_page_lockless_end(vcpu);
3235 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3236 struct list_head *invalid_list)
3238 struct kvm_mmu_page *sp;
3240 if (!VALID_PAGE(*root_hpa))
3243 sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
3247 if (is_tdp_mmu_page(sp))
3248 kvm_tdp_mmu_put_root(kvm, sp, false);
3249 else if (!--sp->root_count && sp->role.invalid)
3250 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3252 *root_hpa = INVALID_PAGE;
3255 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3256 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
3257 ulong roots_to_free)
3260 LIST_HEAD(invalid_list);
3261 bool free_active_root;
3263 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3265 /* Before acquiring the MMU lock, see if we need to do any real work. */
3266 free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
3267 && VALID_PAGE(mmu->root.hpa);
3269 if (!free_active_root) {
3270 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3271 if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3272 VALID_PAGE(mmu->prev_roots[i].hpa))
3275 if (i == KVM_MMU_NUM_PREV_ROOTS)
3279 write_lock(&kvm->mmu_lock);
3281 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3282 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3283 mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
3286 if (free_active_root) {
3287 if (to_shadow_page(mmu->root.hpa)) {
3288 mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
3289 } else if (mmu->pae_root) {
3290 for (i = 0; i < 4; ++i) {
3291 if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
3294 mmu_free_root_page(kvm, &mmu->pae_root[i],
3296 mmu->pae_root[i] = INVALID_PAE_ROOT;
3299 mmu->root.hpa = INVALID_PAGE;
3303 kvm_mmu_commit_zap_page(kvm, &invalid_list);
3304 write_unlock(&kvm->mmu_lock);
3306 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3308 void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
3310 unsigned long roots_to_free = 0;
3315 * This should not be called while L2 is active, L2 can't invalidate
3316 * _only_ its own roots, e.g. INVVPID unconditionally exits.
3318 WARN_ON_ONCE(mmu->mmu_role.base.guest_mode);
3320 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
3321 root_hpa = mmu->prev_roots[i].hpa;
3322 if (!VALID_PAGE(root_hpa))
3325 if (!to_shadow_page(root_hpa) ||
3326 to_shadow_page(root_hpa)->role.guest_mode)
3327 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3330 kvm_mmu_free_roots(kvm, mmu, roots_to_free);
3332 EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
3335 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3339 if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
3340 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
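/*
 * Get (or create) the shadow page for a root at @level and return the
 * physical address of its page-table page.  mmu_lock must be held for
 * write and make_mmu_pages_available() must already have succeeded.
 */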
3347 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
3348 u8 level, bool direct)
3350 struct kvm_mmu_page *sp;
3352 sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
3355 return __pa(sp->spt);
3358 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3360 struct kvm_mmu *mmu = vcpu->arch.mmu;
3361 u8 shadow_root_level = mmu->shadow_root_level;
3366 write_lock(&vcpu->kvm->mmu_lock);
3367 r = make_mmu_pages_available(vcpu);
3371 if (is_tdp_mmu_enabled(vcpu->kvm)) {
3372 root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
3373 mmu->root.hpa = root;
3374 } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
3375 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
3376 mmu->root.hpa = root;
3377 } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
3378 if (WARN_ON_ONCE(!mmu->pae_root)) {
3383 for (i = 0; i < 4; ++i) {
3384 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3386 root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
3387 i << 30, PT32_ROOT_LEVEL, true);
3388 mmu->pae_root[i] = root | PT_PRESENT_MASK |
3391 mmu->root.hpa = __pa(mmu->pae_root);
3393 WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
3398 /* root.pgd is ignored for direct MMUs. */
3401 write_unlock(&vcpu->kvm->mmu_lock);
3405 static int mmu_first_shadow_root_alloc(struct kvm *kvm)
3407 struct kvm_memslots *slots;
3408 struct kvm_memory_slot *slot;
3412 * Check if this is the first shadow root being allocated before
3415 if (kvm_shadow_root_allocated(kvm))
3418 mutex_lock(&kvm->slots_arch_lock);
3420 /* Recheck, under the lock, whether this is the first shadow root. */
3421 if (kvm_shadow_root_allocated(kvm))
3425 * Check if anything actually needs to be allocated, e.g. all metadata
3426 * will be allocated upfront if TDP is disabled.
3428 if (kvm_memslots_have_rmaps(kvm) &&
3429 kvm_page_track_write_tracking_enabled(kvm))
3432 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
3433 slots = __kvm_memslots(kvm, i);
3434 kvm_for_each_memslot(slot, bkt, slots) {
3436 * Both of these functions are no-ops if the target is
3437 * already allocated, so unconditionally calling both
3438 * is safe. Intentionally do NOT free allocations on
3439 * failure to avoid having to track which allocations
3440 * were made now versus when the memslot was created.
3441 * The metadata is guaranteed to be freed when the slot
3442 * is freed, and will be kept/used if userspace retries
3443 * KVM_RUN instead of killing the VM.
3445 r = memslot_rmap_alloc(slot, slot->npages);
3448 r = kvm_page_track_write_tracking_alloc(slot);
3455 * Ensure that shadow_root_allocated becomes true strictly after
3456 * all the related pointers are set.
3459 smp_store_release(&kvm->arch.shadow_root_allocated, true);
3462 mutex_unlock(&kvm->slots_arch_lock);
3466 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3468 struct kvm_mmu *mmu = vcpu->arch.mmu;
3469 u64 pdptrs[4], pm_mask;
3470 gfn_t root_gfn, root_pgd;
3475 root_pgd = mmu->get_guest_pgd(vcpu);
3476 root_gfn = root_pgd >> PAGE_SHIFT;
3478 if (mmu_check_root(vcpu, root_gfn))
3482 * On SVM, reading PDPTRs might access guest memory, which might fault
3483 * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
3485 if (mmu->root_level == PT32E_ROOT_LEVEL) {
3486 for (i = 0; i < 4; ++i) {
3487 pdptrs[i] = mmu->get_pdptr(vcpu, i);
3488 if (!(pdptrs[i] & PT_PRESENT_MASK))
3491 if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
3496 r = mmu_first_shadow_root_alloc(vcpu->kvm);
3500 write_lock(&vcpu->kvm->mmu_lock);
3501 r = make_mmu_pages_available(vcpu);
3506 * Do we shadow a long mode page table? If so we need to
3507 * write-protect the guests page table root.
3509 if (mmu->root_level >= PT64_ROOT_4LEVEL) {
3510 root = mmu_alloc_root(vcpu, root_gfn, 0,
3511 mmu->shadow_root_level, false);
3512 mmu->root.hpa = root;
3516 if (WARN_ON_ONCE(!mmu->pae_root)) {
3522 * We shadow a 32 bit page table. This may be a legacy 2-level
3523 * or a PAE 3-level page table. In either case we need to be aware that
3524 * the shadow page table may be a PAE or a long mode page table.
3526 pm_mask = PT_PRESENT_MASK | shadow_me_mask;
3527 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3528 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3530 if (WARN_ON_ONCE(!mmu->pml4_root)) {
3534 mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
3536 if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
3537 if (WARN_ON_ONCE(!mmu->pml5_root)) {
3541 mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
3545 for (i = 0; i < 4; ++i) {
3546 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3548 if (mmu->root_level == PT32E_ROOT_LEVEL) {
3549 if (!(pdptrs[i] & PT_PRESENT_MASK)) {
3550 mmu->pae_root[i] = INVALID_PAE_ROOT;
3553 root_gfn = pdptrs[i] >> PAGE_SHIFT;
3556 root = mmu_alloc_root(vcpu, root_gfn, i << 30,
3557 PT32_ROOT_LEVEL, false);
3558 mmu->pae_root[i] = root | pm_mask;
3561 if (mmu->shadow_root_level == PT64_ROOT_5LEVEL)
3562 mmu->root.hpa = __pa(mmu->pml5_root);
3563 else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3564 mmu->root.hpa = __pa(mmu->pml4_root);
3566 mmu->root.hpa = __pa(mmu->pae_root);
3569 mmu->root.pgd = root_pgd;
3571 write_unlock(&vcpu->kvm->mmu_lock);
3576 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
3578 struct kvm_mmu *mmu = vcpu->arch.mmu;
3579 bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL;
3580 u64 *pml5_root = NULL;
3581 u64 *pml4_root = NULL;
3585 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
3586 * tables are allocated and initialized at root creation as there is no
3587 * equivalent level in the guest's NPT to shadow. Allocate the tables
3588 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
3590 if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
3591 mmu->shadow_root_level < PT64_ROOT_4LEVEL)
3595 * NPT, the only paging mode that uses this horror, uses a fixed number
3596 * of levels for the shadow page tables, e.g. all MMUs are 4-level or
3597 * all MMUs are 5-level. Thus, this can safely require that pml5_root
3598 * is allocated if the other roots are valid and pml5 is needed, as any
3599 * prior MMU would also have required pml5.
3601 if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
3605 * The special roots should always be allocated in concert. Yell and
3606 * bail if KVM ends up in a state where only one of the roots is valid.
3608 if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
3609 (need_pml5 && mmu->pml5_root)))
3613 * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
3614 * doesn't need to be decrypted.
3616 pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3620 #ifdef CONFIG_X86_64
3621 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3626 pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3632 mmu->pae_root = pae_root;
3633 mmu->pml4_root = pml4_root;
3634 mmu->pml5_root = pml5_root;
3638 #ifdef CONFIG_X86_64
3640 free_page((unsigned long)pml4_root);
3642 free_page((unsigned long)pae_root);
3647 static bool is_unsync_root(hpa_t root)
3649 struct kvm_mmu_page *sp;
3651 if (!VALID_PAGE(root))
3655 * The read barrier orders the CPU's read of SPTE.W during the page table
3656 * walk before the reads of sp->unsync/sp->unsync_children here.
3658 * Even if another CPU was marking the SP as unsync-ed simultaneously,
3659 * any guest page table changes are not guaranteed to be visible anyway
3660 * until this VCPU issues a TLB flush strictly after those changes are
3661 * made. We only need to ensure that the other CPU sets these flags
3662 * before any actual changes to the page tables are made. The comments
3663 * in mmu_try_to_unsync_pages() describe what could go wrong if this
3664 * requirement isn't satisfied.
3667 sp = to_shadow_page(root);
3670 * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
3671 * PDPTEs for a given PAE root need to be synchronized individually.
3673 if (WARN_ON_ONCE(!sp))
3676 if (sp->unsync || sp->unsync_children)
3682 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3685 struct kvm_mmu_page *sp;
3687 if (vcpu->arch.mmu->direct_map)
3690 if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
3693 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3695 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3696 hpa_t root = vcpu->arch.mmu->root.hpa;
3697 sp = to_shadow_page(root);
3699 if (!is_unsync_root(root))
3702 write_lock(&vcpu->kvm->mmu_lock);
3703 mmu_sync_children(vcpu, sp, true);
3704 write_unlock(&vcpu->kvm->mmu_lock);
3708 write_lock(&vcpu->kvm->mmu_lock);
3710 for (i = 0; i < 4; ++i) {
3711 hpa_t root = vcpu->arch.mmu->pae_root[i];
3713 if (IS_VALID_PAE_ROOT(root)) {
3714 root &= PT64_BASE_ADDR_MASK;
3715 sp = to_shadow_page(root);
3716 mmu_sync_children(vcpu, sp, true);
3720 write_unlock(&vcpu->kvm->mmu_lock);
3723 void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
3725 unsigned long roots_to_free = 0;
3728 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3729 if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
3730 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3732 /* sync prev_roots by simply freeing them */
3733 kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
3736 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3737 gpa_t vaddr, u64 access,
3738 struct x86_exception *exception)
3741 exception->error_code = 0;
3742 return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
3745 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3748 * A nested guest cannot use the MMIO cache if it is using nested
3749 * page tables, because cr2 is a nGPA while the cache stores GPAs.
3751 if (mmu_is_nested(vcpu))
3755 return vcpu_match_mmio_gpa(vcpu, addr);
3757 return vcpu_match_mmio_gva(vcpu, addr);
3761 * Return the level of the lowest level SPTE added to sptes.
3762 * That SPTE may be non-present.
3764 * Must be called between walk_shadow_page_lockless_{begin,end}.
3766 static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
3768 struct kvm_shadow_walk_iterator iterator;
3772 for (shadow_walk_init(&iterator, vcpu, addr),
3773 *root_level = iterator.level;
3774 shadow_walk_okay(&iterator);
3775 __shadow_walk_next(&iterator, spte)) {
3776 leaf = iterator.level;
3777 spte = mmu_spte_get_lockless(iterator.sptep);
3785 /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
3786 static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3788 u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
3789 struct rsvd_bits_validate *rsvd_check;
3790 int root, leaf, level;
3791 bool reserved = false;
3793 walk_shadow_page_lockless_begin(vcpu);
3795 if (is_tdp_mmu(vcpu->arch.mmu))
3796 leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
3798 leaf = get_walk(vcpu, addr, sptes, &root);
3800 walk_shadow_page_lockless_end(vcpu);
3802 if (unlikely(leaf < 0)) {
3807 *sptep = sptes[leaf];
3810 * Skip reserved bits checks on the terminal leaf if it's not a valid
3811 * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by
3812 * design, always have reserved bits set. The purpose of the checks is
3813 * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
3815 if (!is_shadow_present_pte(sptes[leaf]))
3818 rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
3820 for (level = root; level >= leaf; level--)
3821 reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
3824 pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
3826 for (level = root; level >= leaf; level--)
3827 pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
3828 sptes[level], level,
3829 get_rsvd_bits(rsvd_check, sptes[level], level));
3835 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3840 if (mmio_info_in_cache(vcpu, addr, direct))
3841 return RET_PF_EMULATE;
3843 reserved = get_mmio_spte(vcpu, addr, &spte);
3844 if (WARN_ON(reserved))
3847 if (is_mmio_spte(spte)) {
3848 gfn_t gfn = get_mmio_spte_gfn(spte);
3849 unsigned int access = get_mmio_spte_access(spte);
3851 if (!check_mmio_spte(vcpu, spte))
3852 return RET_PF_INVALID;
3857 trace_handle_mmio_page_fault(addr, gfn, access);
3858 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3859 return RET_PF_EMULATE;
3863 * If the page table is zapped by other cpus, let CPU fault again on
3866 return RET_PF_RETRY;
3869 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
3870 struct kvm_page_fault *fault)
3872 if (unlikely(fault->rsvd))
3875 if (!fault->present || !fault->write)
3879 * The guest is writing a page that is write-tracked, which cannot
3880 * be fixed by the page fault handler.
3882 if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
3888 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
3890 struct kvm_shadow_walk_iterator iterator;
3893 walk_shadow_page_lockless_begin(vcpu);
3894 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
3895 clear_sp_write_flooding_count(iterator.sptep);
3896 walk_shadow_page_lockless_end(vcpu);
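/*
 * Async #PF tokens pack a per-vCPU sequence number in the upper bits and
 * the vcpu_id in the low 12 bits, so a token identifies both the vCPU and
 * the outstanding request; the sequence skips zero so a token of 0
 * (i.e. "no token") is never produced.
 */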
3899 static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
3901 /* make sure the token value is not 0 */
3902 u32 id = vcpu->arch.apf.id;
3905 vcpu->arch.apf.id = 1;
3907 return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
3910 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
3913 struct kvm_arch_async_pf arch;
3915 arch.token = alloc_apf_token(vcpu);
3917 arch.direct_map = vcpu->arch.mmu->direct_map;
3918 arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
3920 return kvm_setup_async_pf(vcpu, cr2_or_gpa,
3921 kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
3924 static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r)
3926 struct kvm_memory_slot *slot = fault->slot;
3930 * Retry the page fault if the gfn hit a memslot that is being deleted
3931 * or moved. This ensures any existing SPTEs for the old memslot will
3932 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
3934 if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
3937 if (!kvm_is_visible_memslot(slot)) {
3938 /* Don't expose private memslots to L2. */
3939 if (is_guest_mode(vcpu)) {
3941 fault->pfn = KVM_PFN_NOSLOT;
3942 fault->map_writable = false;
3946 * If the APIC access page exists but is disabled, go directly
3947 * to emulation without caching the MMIO access or creating a
3948 * MMIO SPTE. That way the cache doesn't need to be purged
3949 * when the AVIC is re-enabled.
3951 if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
3952 !kvm_apicv_activated(vcpu->kvm)) {
3953 *r = RET_PF_EMULATE;
3959 fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
3960 fault->write, &fault->map_writable,
3963 return false; /* *pfn has correct page already */
3965 if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
3966 trace_kvm_try_async_get_page(fault->addr, fault->gfn);
3967 if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
3968 trace_kvm_async_pf_doublefault(fault->addr, fault->gfn);
3969 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
3971 } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn))
3975 fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
3976 fault->write, &fault->map_writable,
3986 * Returns true if the page fault is stale and needs to be retried, i.e. if the
3987 * root was invalidated by a memslot update or a relevant mmu_notifier fired.
3989 static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
3990 struct kvm_page_fault *fault, int mmu_seq)
3992 struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
3994 /* Special roots, e.g. pae_root, are not backed by shadow pages. */
3995 if (sp && is_obsolete_sp(vcpu->kvm, sp))
3999 * Roots without an associated shadow page are considered invalid if
4000 * there is a pending request to free obsolete roots. The request is
4001 * only a hint that the current root _may_ be obsolete and needs to be
4002 * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
4003 * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
4004 * to reload even if no vCPU is actively using the root.
4006 if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
4009 return fault->slot &&
4010 mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
4013 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4015 bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
4017 unsigned long mmu_seq;
4020 fault->gfn = fault->addr >> PAGE_SHIFT;
4021 fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
4023 if (page_fault_handle_page_track(vcpu, fault))
4024 return RET_PF_EMULATE;
4026 r = fast_page_fault(vcpu, fault);
4027 if (r != RET_PF_INVALID)
4030 r = mmu_topup_memory_caches(vcpu, false);
4034 mmu_seq = vcpu->kvm->mmu_notifier_seq;
4037 if (kvm_faultin_pfn(vcpu, fault, &r))
4040 if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r))
4045 if (is_tdp_mmu_fault)
4046 read_lock(&vcpu->kvm->mmu_lock);
4048 write_lock(&vcpu->kvm->mmu_lock);
4050 if (is_page_fault_stale(vcpu, fault, mmu_seq))
4053 r = make_mmu_pages_available(vcpu);
4057 if (is_tdp_mmu_fault)
4058 r = kvm_tdp_mmu_map(vcpu, fault);
4060 r = __direct_map(vcpu, fault);
4063 if (is_tdp_mmu_fault)
4064 read_unlock(&vcpu->kvm->mmu_lock);
4066 write_unlock(&vcpu->kvm->mmu_lock);
4067 kvm_release_pfn_clean(fault->pfn);
4071 static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
4072 struct kvm_page_fault *fault)
4074 pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
4076 /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
4077 fault->max_level = PG_LEVEL_2M;
4078 return direct_page_fault(vcpu, fault);
4081 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4082 u64 fault_address, char *insn, int insn_len)
4085 u32 flags = vcpu->arch.apf.host_apf_flags;
4087 #ifndef CONFIG_X86_64
4088 /* A 64-bit CR2 should be impossible on 32-bit KVM. */
4089 if (WARN_ON_ONCE(fault_address >> 32))
4093 vcpu->arch.l1tf_flush_l1d = true;
4095 trace_kvm_page_fault(fault_address, error_code);
4097 if (kvm_event_needs_reinjection(vcpu))
4098 kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4099 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4101 } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
4102 vcpu->arch.apf.host_apf_flags = 0;
4103 local_irq_disable();
4104 kvm_async_pf_task_wait_schedule(fault_address);
4107 WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
4112 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
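/*
 * For TDP faults, clamp the maximum mapping level so that the whole huge
 * page sees a consistent guest MTRR memory type; if the range straddles
 * MTRRs with different types, retry at the next smaller page size.
 */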
4114 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4116 while (fault->max_level > PG_LEVEL_4K) {
4117 int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
4118 gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
4120 if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
4126 return direct_page_fault(vcpu, fault);
4129 static void nonpaging_init_context(struct kvm_mmu *context)
4131 context->page_fault = nonpaging_page_fault;
4132 context->gva_to_gpa = nonpaging_gva_to_gpa;
4133 context->sync_page = nonpaging_sync_page;
4134 context->invlpg = NULL;
4135 context->direct_map = true;
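/*
 * A cached root can be (re)used only if it is a valid root whose role
 * exactly matches the new role and, for indirect roots, was built for the
 * same guest pgd; direct roots do not depend on the guest pgd.
 */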
4138 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
4139 union kvm_mmu_page_role role)
4141 return (role.direct || pgd == root->pgd) &&
4142 VALID_PAGE(root->hpa) &&
4143 role.word == to_shadow_page(root->hpa)->role.word;
4147 * Find out if a previously cached root matching the new pgd/role is available,
4148 * and insert the current root as the MRU in the cache.
4149 * If a matching root is found, it is assigned to kvm_mmu->root and
4151 * If no match is found, kvm_mmu->root is left invalid, the LRU root is
4152 * evicted to make room for the current root, and false is returned.
4154 static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
4156 union kvm_mmu_page_role new_role)
4160 if (is_root_usable(&mmu->root, new_pgd, new_role))
4163 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4165 * The swaps end up rotating the cache like this:
4166 * C 0 1 2 3 (on entry to the function)
4170 * 3 C 0 1 2 (on exit from the loop)
4172 swap(mmu->root, mmu->prev_roots[i]);
4173 if (is_root_usable(&mmu->root, new_pgd, new_role))
4177 kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4182 * Find out if a previously cached root matching the new pgd/role is available.
4183 * On entry, mmu->root is invalid.
4184 * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
4185 * of the cache becomes invalid, and true is returned.
4186 * If no match is found, kvm_mmu->root is left invalid and false is returned.
4188 static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
4190 union kvm_mmu_page_role new_role)
4194 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4195 if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
4201 swap(mmu->root, mmu->prev_roots[i]);
4202 /* Bubble up the remaining roots. */
4203 for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
4204 mmu->prev_roots[i] = mmu->prev_roots[i + 1];
4205 mmu->prev_roots[i].hpa = INVALID_PAGE;
4209 static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
4210 gpa_t new_pgd, union kvm_mmu_page_role new_role)
4213 * For now, limit the caching to 64-bit hosts+VMs in order to avoid
4214 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4215 * later if necessary.
4217 if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
4218 kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4220 if (VALID_PAGE(mmu->root.hpa))
4221 return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
4223 return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
4226 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
4228 struct kvm_mmu *mmu = vcpu->arch.mmu;
4229 union kvm_mmu_page_role new_role = mmu->mmu_role.base;
4231 if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
4232 /* kvm_mmu_ensure_valid_pgd will set up a new root. */
4237 * It's possible that the cached previous root page is obsolete because
4238 * of a change in the MMU generation number. However, changing the
4239 * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
4240 * which will free the root set here and allocate a new one.
4242 kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
4244 if (force_flush_and_sync_on_reuse) {
4245 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4246 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
4250 * The last MMIO access's GVA and GPA are cached in the VCPU. When
4251 * switching to a new CR3, that GVA->GPA mapping may no longer be
4252 * valid. So clear any cached MMIO info even when we don't need to sync
4253 * the shadow page tables.
4255 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4258 * If this is a direct root page, it doesn't have a write flooding
4259 * count. Otherwise, clear the write flooding count.
4261 if (!new_role.direct)
4262 __clear_sp_write_flooding_count(
4263 to_shadow_page(vcpu->arch.mmu->root.hpa));
4265 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
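/*
 * Fix up a cached MMIO spte during page sync: drop it if the gfn it
 * encodes no longer matches, otherwise re-mark it with the current access
 * bits and MMIO generation.  Returns true if the spte was an MMIO spte
 * and was handled here.
 */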
4267 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4269 return kvm_read_cr3(vcpu);
4272 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4273 unsigned int access)
4275 if (unlikely(is_mmio_spte(*sptep))) {
4276 if (gfn != get_mmio_spte_gfn(*sptep)) {
4277 mmu_spte_clear_no_track(sptep);
4281 mark_mmio_spte(vcpu, sptep, gfn, access);
4288 #define PTTYPE_EPT 18 /* arbitrary */
4289 #define PTTYPE PTTYPE_EPT
4290 #include "paging_tmpl.h"
4294 #include "paging_tmpl.h"
4298 #include "paging_tmpl.h"
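/*
 * rsvd_bits_mask[0][lvl-1] holds the reserved bits for an entry at level
 * @lvl with bit 7 clear, rsvd_bits_mask[1][lvl-1] for an entry with bit 7
 * set (a large page at directory levels, PAT for a 4K PTE, which is why
 * [1][0] simply mirrors [0][0] below).
 */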
4302 __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
4303 u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
4306 u64 gbpages_bit_rsvd = 0;
4307 u64 nonleaf_bit8_rsvd = 0;
4310 rsvd_check->bad_mt_xwr = 0;
4313 gbpages_bit_rsvd = rsvd_bits(7, 7);
4315 if (level == PT32E_ROOT_LEVEL)
4316 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
4318 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4320 /* Note, NX doesn't exist in PDPTEs, this is handled below. */
4322 high_bits_rsvd |= rsvd_bits(63, 63);
4325 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4326 * leaf entries) on AMD CPUs only.
4329 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4332 case PT32_ROOT_LEVEL:
4333 /* no rsvd bits for 2 level 4K page table entries */
4334 rsvd_check->rsvd_bits_mask[0][1] = 0;
4335 rsvd_check->rsvd_bits_mask[0][0] = 0;
4336 rsvd_check->rsvd_bits_mask[1][0] =
4337 rsvd_check->rsvd_bits_mask[0][0];
4340 rsvd_check->rsvd_bits_mask[1][1] = 0;
4344 if (is_cpuid_PSE36())
4345 /* 36bits PSE 4MB page */
4346 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4348 /* 32 bits PSE 4MB page */
4349 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4351 case PT32E_ROOT_LEVEL:
4352 rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
4355 rsvd_bits(1, 2); /* PDPTE */
4356 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */
4357 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */
4358 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4359 rsvd_bits(13, 20); /* large page */
4360 rsvd_check->rsvd_bits_mask[1][0] =
4361 rsvd_check->rsvd_bits_mask[0][0];
4363 case PT64_ROOT_5LEVEL:
4364 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
4367 rsvd_check->rsvd_bits_mask[1][4] =
4368 rsvd_check->rsvd_bits_mask[0][4];
4370 case PT64_ROOT_4LEVEL:
4371 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
4374 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
4376 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
4377 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4378 rsvd_check->rsvd_bits_mask[1][3] =
4379 rsvd_check->rsvd_bits_mask[0][3];
4380 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
4383 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4384 rsvd_bits(13, 20); /* large page */
4385 rsvd_check->rsvd_bits_mask[1][0] =
4386 rsvd_check->rsvd_bits_mask[0][0];
4391 static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
4394 * If TDP is enabled, let the guest use GBPAGES if they're supported in
4395 * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
4396 * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
4397 * walk for performance and complexity reasons. Not to mention KVM
4398 * _can't_ solve the problem because GVA->GPA walks aren't visible to
4399 * KVM once a TDP translation is installed. Mimic hardware behavior so
4400 * that KVM's behavior is at least consistent, i.e. doesn't randomly inject #PF.
4402 return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
4403 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
4406 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4407 struct kvm_mmu *context)
4409 __reset_rsvds_bits_mask(&context->guest_rsvd_check,
4410 vcpu->arch.reserved_gpa_bits,
4411 context->root_level, is_efer_nx(context),
4412 guest_can_use_gbpages(vcpu),
4413 is_cr4_pse(context),
4414 guest_cpuid_is_amd_or_hygon(vcpu));
4418 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4419 u64 pa_bits_rsvd, bool execonly, int huge_page_level)
4421 u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4422 u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
4425 if (huge_page_level < PG_LEVEL_1G)
4426 large_1g_rsvd = rsvd_bits(7, 7);
4427 if (huge_page_level < PG_LEVEL_2M)
4428 large_2m_rsvd = rsvd_bits(7, 7);
4430 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
4431 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
4432 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
4433 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
4434 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4437 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4438 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4439 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
4440 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
4441 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
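/*
 * bad_mt_xwr is a 64-bit bitmap indexed by the low 6 bits of an EPT entry
 * (bits 5:3 memory type, bits 2:0 XWR); a set bit flags an illegal
 * combination.  Roughly how it is consumed by the reserved-bit check:
 *
 *	bad = rsvd_check->bad_mt_xwr & BIT_ULL(spte & 0x3f);
 */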
4443 bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
4444 bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
4445 bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
4446 bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
4447 bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
4449 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4450 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4452 rsvd_check->bad_mt_xwr = bad_mt_xwr;
4455 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4456 struct kvm_mmu *context, bool execonly, int huge_page_level)
4458 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4459 vcpu->arch.reserved_gpa_bits, execonly,
4463 static inline u64 reserved_hpa_bits(void)
4465 return rsvd_bits(shadow_phys_bits, 63);
4469 * The page table on the host is the shadow page table for the page
4470 * table in the guest or AMD nested guest; its mmu features completely
4471 * follow the features in the guest.
4473 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4474 struct kvm_mmu *context)
4477 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
4478 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
4479 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
4480 * The iTLB multi-hit workaround can be toggled at any time, so assume
4481 * NX can be used by any non-nested shadow MMU to avoid having to reset
4482 * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled.
4484 bool uses_nx = is_efer_nx(context) || !tdp_enabled;
4486 /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
4488 /* KVM doesn't use 2-level page tables for the shadow MMU. */
4489 bool is_pse = false;
4490 struct rsvd_bits_validate *shadow_zero_check;
4493 WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL);
4495 shadow_zero_check = &context->shadow_zero_check;
4496 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4497 context->shadow_root_level, uses_nx,
4498 guest_can_use_gbpages(vcpu), is_pse, is_amd);
4500 if (!shadow_me_mask)
4503 for (i = context->shadow_root_level; --i >= 0;) {
4504 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4505 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4510 static inline bool boot_cpu_is_amd(void)
4512 WARN_ON_ONCE(!tdp_enabled);
4513 return shadow_x_mask == 0;
4517 * The direct page table on the host uses as many mmu features as
4518 * possible; however, kvm currently does not do execution-protection.
4521 reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
4523 struct rsvd_bits_validate *shadow_zero_check;
4526 shadow_zero_check = &context->shadow_zero_check;
4528 if (boot_cpu_is_amd())
4529 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4530 context->shadow_root_level, false,
4531 boot_cpu_has(X86_FEATURE_GBPAGES),
4534 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4535 reserved_hpa_bits(), false,
4536 max_huge_page_level);
4538 if (!shadow_me_mask)
4541 for (i = context->shadow_root_level; --i >= 0;) {
4542 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4543 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4548 * Same as the comments in reset_shadow_zero_bits_mask(), except it
4549 * is the shadow page table for an Intel nested guest.
4552 reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
4554 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4555 reserved_hpa_bits(), execonly,
4556 max_huge_page_level);
4559 #define BYTE_MASK(access) \
4560 ((1 & (access) ? 2 : 0) | \
4561 (2 & (access) ? 4 : 0) | \
4562 (3 & (access) ? 8 : 0) | \
4563 (4 & (access) ? 16 : 0) | \
4564 (5 & (access) ? 32 : 0) | \
4565 (6 & (access) ? 64 : 0) | \
4566 (7 & (access) ? 128 : 0))
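/*
 * Illustrative expansion (editor's sketch): BYTE_MASK(m) builds a byte whose
 * bit i (i = 1..7) is set iff (i & m) != 0, where i enumerates the possible
 * User/Write/Exec combinations of an access.  Assuming ACC_EXEC_MASK == 1,
 * ACC_WRITE_MASK == 2 and ACC_USER_MASK == 4:
 *
 *   BYTE_MASK(ACC_EXEC_MASK)  == 0xaa   (bits 1, 3, 5, 7)
 *   BYTE_MASK(ACC_WRITE_MASK) == 0xcc   (bits 2, 3, 6, 7)
 *   BYTE_MASK(ACC_USER_MASK)  == 0xf0   (bits 4, 5, 6, 7)
 */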
4569 static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
4573 const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4574 const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4575 const u8 u = BYTE_MASK(ACC_USER_MASK);
4577 bool cr4_smep = is_cr4_smep(mmu);
4578 bool cr4_smap = is_cr4_smap(mmu);
4579 bool cr0_wp = is_cr0_wp(mmu);
4580 bool efer_nx = is_efer_nx(mmu);
4582 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4583 unsigned pfec = byte << 1;
4586 * Each "*f" variable has a 1 bit for each UWX value
4587 * that causes a fault with the given PFEC.
4590 /* Faults from writes to non-writable pages */
4591 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4592 /* Faults from user mode accesses to supervisor pages */
4593 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4595 /* Faults from fetches of non-executable pages */
4595 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4596 /* Faults from kernel mode fetches of user pages */
4598 /* Faults from kernel mode accesses of user pages */
4602 /* Faults from kernel mode accesses to user pages */
4603 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4605 /* Not really needed: !nx will cause pte.nx to fault */
4609 /* Allow supervisor writes if !cr0.wp */
4611 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4613 /* Disallow supervisor fetches of user code if cr4.smep */
4615 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4618 * SMAP: kernel-mode data accesses from user-mode
4619 * mappings should fault. A fault is considered
4620 * a SMAP violation if all of the following
4621 * conditions are true:
4622 * - X86_CR4_SMAP is set in CR4
4623 * - A user page is accessed
4624 * - The access is not a fetch
4625 * - The access is supervisor mode
4626 * - If implicit supervisor access or X86_EFLAGS_AC is clear
4628 * Here, we cover the first four conditions.
4629 * The fifth is computed dynamically in permission_fault();
4630 * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4631 * *not* subject to SMAP restrictions.
4634 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4637 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
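/*
 * Illustrative consumer (simplified sketch, not the actual fault path): a
 * permission check indexes permissions[] with the error code shifted right
 * by one (PFEC.P dropped) and tests the bit selected by the PTE's UWX access
 * bits, e.g. roughly:
 *
 *   fault = (mmu->permissions[pfec >> 1] >> pte_access) & 1;
 *
 * where pte_access is the 3-bit User/Write/Exec combination.
 */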
4642 * PKU is an additional mechanism by which the paging controls access to
4643 * user-mode addresses based on the value in the PKRU register. Protection
4644 * key violations are reported through a bit in the page fault error code.
4645 * Unlike other bits of the error code, the PK bit is not known at the
4646 * call site of e.g. gva_to_gpa; it must be computed directly in
4647 * permission_fault based on two bits of PKRU, on some machine state (CR4,
4648 * CR0, EFER, CPL), and on other bits of the error code and the page tables.
4650 * In particular the following conditions come from the error code, the
4651 * page tables and the machine state:
4652 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4653 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4654 * - PK is always zero if U=0 in the page tables
4655 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4657 * The PKRU bitmask caches the result of these four conditions. The error
4658 * code (minus the P bit) and the page table's U bit form an index into the
4659 * PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
4660 * with the two bits of the PKRU register corresponding to the protection key.
4661 * For the first three conditions above the bits will be 00, thus masking
4662 * away both AD and WD. For all reads or if the last condition holds, WD
4663 * only will be masked away.
4665 static void update_pkru_bitmask(struct kvm_mmu *mmu)
4672 if (!is_cr4_pke(mmu))
4675 wp = is_cr0_wp(mmu);
4677 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4678 unsigned pfec, pkey_bits;
4679 bool check_pkey, check_write, ff, uf, wf, pte_user;
4682 ff = pfec & PFERR_FETCH_MASK;
4683 uf = pfec & PFERR_USER_MASK;
4684 wf = pfec & PFERR_WRITE_MASK;
4686 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4687 pte_user = pfec & PFERR_RSVD_MASK;
4690 * Only need to check accesses that are not instruction
4691 * fetches and are to a user page.
4693 check_pkey = (!ff && pte_user);
4695 * write access is controlled by PKRU if it is a
4696 * user access or CR0.WP = 1.
4698 check_write = check_pkey && wf && (uf || wp);
4700 /* PKRU.AD stops both read and write access. */
4701 pkey_bits = !!check_pkey;
4702 /* PKRU.WD stops write access. */
4703 pkey_bits |= (!!check_write) << 1;
4705 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
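/*
 * Illustrative consumer (simplified sketch, assumed): on a fault, the two
 * PKRU bits for the page's protection key are masked with the two bits
 * cached here for this error code, roughly:
 *
 *   pkru_bits  = (pkru >> (pte_pkey * 2)) & 3;   /- AD, WD -/
 *   pkru_bits &= mmu->pkru_mask >> pfec;
 *
 * so PKRU.AD/WD only cause a fault when the cached conditions apply.
 */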
4709 static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
4710 struct kvm_mmu *mmu)
4712 if (!is_cr0_pg(mmu))
4715 reset_rsvds_bits_mask(vcpu, mmu);
4716 update_permission_bitmask(mmu, false);
4717 update_pkru_bitmask(mmu);
4720 static void paging64_init_context(struct kvm_mmu *context)
4722 context->page_fault = paging64_page_fault;
4723 context->gva_to_gpa = paging64_gva_to_gpa;
4724 context->sync_page = paging64_sync_page;
4725 context->invlpg = paging64_invlpg;
4726 context->direct_map = false;
4729 static void paging32_init_context(struct kvm_mmu *context)
4731 context->page_fault = paging32_page_fault;
4732 context->gva_to_gpa = paging32_gva_to_gpa;
4733 context->sync_page = paging32_sync_page;
4734 context->invlpg = paging32_invlpg;
4735 context->direct_map = false;
4738 static union kvm_mmu_role
4739 kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
4741 union kvm_mmu_role role = {0};
4743 role.base.access = ACC_ALL;
4744 role.base.smm = is_smm(vcpu);
4745 role.base.guest_mode = is_guest_mode(vcpu);
4748 if (!____is_cr0_pg(regs)) {
4749 role.base.direct = 1;
4753 role.base.efer_nx = ____is_efer_nx(regs);
4754 role.base.cr0_wp = ____is_cr0_wp(regs);
4755 role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
4756 role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
4757 role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
4759 if (____is_efer_lma(regs))
4760 role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
4762 else if (____is_cr4_pae(regs))
4763 role.base.level = PT32E_ROOT_LEVEL;
4765 role.base.level = PT32_ROOT_LEVEL;
4767 role.ext.cr0_pg = 1;
4768 role.ext.cr4_pae = ____is_cr4_pae(regs);
4769 role.ext.cr4_smep = ____is_cr4_smep(regs);
4770 role.ext.cr4_smap = ____is_cr4_smap(regs);
4771 role.ext.cr4_pse = ____is_cr4_pse(regs);
4773 /* PKEY and LA57 are active iff long mode is active. */
4774 role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
4775 role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
4776 role.ext.efer_lma = ____is_efer_lma(regs);
4780 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4781 const struct kvm_mmu_role_regs *regs)
4783 union kvm_mmu_role role = {0};
4785 role.base.access = ACC_ALL;
4786 if (____is_cr0_pg(regs)) {
4787 role.ext.cr0_pg = 1;
4788 role.base.efer_nx = ____is_efer_nx(regs);
4789 role.base.cr0_wp = ____is_cr0_wp(regs);
4791 role.ext.cr4_pae = ____is_cr4_pae(regs);
4792 role.ext.cr4_smep = ____is_cr4_smep(regs);
4793 role.ext.cr4_smap = ____is_cr4_smap(regs);
4794 role.ext.cr4_pse = ____is_cr4_pse(regs);
4796 /* PKEY and LA57 are active iff long mode is active. */
4797 role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
4798 role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
4799 role.ext.efer_lma = ____is_efer_lma(regs);
4801 role.base.smm = is_smm(vcpu);
4802 role.base.guest_mode = is_guest_mode(vcpu);
4808 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
4810 /* tdp_root_level is the architecture-forced level; use it if nonzero. */
4812 return tdp_root_level;
4814 /* Use 5-level TDP if and only if it's useful/necessary. */
4815 if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
4818 return max_tdp_level;
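/*
 * Worked example (illustrative): with max_tdp_level == 5, a vCPU whose CPUID
 * reports MAXPHYADDR <= 48 gets 4-level TDP, while a vCPU with MAXPHYADDR of,
 * say, 52 keeps 5-level TDP; a nonzero tdp_root_level overrides both.
 */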
4821 static union kvm_mmu_role
4822 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
4823 const struct kvm_mmu_role_regs *regs)
4825 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs);
4827 role.base.ad_disabled = (shadow_accessed_mask == 0);
4828 role.base.level = kvm_mmu_get_tdp_level(vcpu);
4829 role.base.direct = true;
4830 role.base.has_4_byte_gpte = false;
4835 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
4836 const struct kvm_mmu_role_regs *regs)
4838 struct kvm_mmu *context = &vcpu->arch.root_mmu;
4839 union kvm_mmu_role cpu_role = kvm_calc_cpu_role(vcpu, regs);
4840 union kvm_mmu_role mmu_role =
4841 kvm_calc_tdp_mmu_root_page_role(vcpu, regs);
4843 if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
4844 mmu_role.as_u64 == context->mmu_role.as_u64)
4847 context->cpu_role.as_u64 = cpu_role.as_u64;
4848 context->mmu_role.as_u64 = mmu_role.as_u64;
4849 context->page_fault = kvm_tdp_page_fault;
4850 context->sync_page = nonpaging_sync_page;
4851 context->invlpg = NULL;
4852 context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
4853 context->direct_map = true;
4854 context->get_guest_pgd = get_cr3;
4855 context->get_pdptr = kvm_pdptr_read;
4856 context->inject_page_fault = kvm_inject_page_fault;
4857 context->root_level = cpu_role.base.level;
4859 if (!is_cr0_pg(context))
4860 context->gva_to_gpa = nonpaging_gva_to_gpa;
4861 else if (is_cr4_pae(context))
4862 context->gva_to_gpa = paging64_gva_to_gpa;
4864 context->gva_to_gpa = paging32_gva_to_gpa;
4866 reset_guest_paging_metadata(vcpu, context);
4867 reset_tdp_shadow_zero_bits_mask(context);
4870 static union kvm_mmu_role
4871 kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
4872 const struct kvm_mmu_role_regs *regs)
4874 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs);
4876 role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
4877 role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
4878 role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs);
4883 static union kvm_mmu_role
4884 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
4885 const struct kvm_mmu_role_regs *regs)
4887 union kvm_mmu_role role =
4888 kvm_calc_shadow_root_page_role_common(vcpu, regs);
4890 role.base.direct = !____is_cr0_pg(regs);
4892 if (!____is_efer_lma(regs))
4893 role.base.level = PT32E_ROOT_LEVEL;
4894 else if (____is_cr4_la57(regs))
4895 role.base.level = PT64_ROOT_5LEVEL;
4897 role.base.level = PT64_ROOT_4LEVEL;
4902 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
4903 union kvm_mmu_role cpu_role,
4904 union kvm_mmu_role mmu_role)
4906 if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
4907 mmu_role.as_u64 == context->mmu_role.as_u64)
4910 context->cpu_role.as_u64 = cpu_role.as_u64;
4911 context->mmu_role.as_u64 = mmu_role.as_u64;
4913 if (!is_cr0_pg(context))
4914 nonpaging_init_context(context);
4915 else if (is_cr4_pae(context))
4916 paging64_init_context(context);
4918 paging32_init_context(context);
4919 context->root_level = cpu_role.base.level;
4921 reset_guest_paging_metadata(vcpu, context);
4922 context->shadow_root_level = mmu_role.base.level;
4924 reset_shadow_zero_bits_mask(vcpu, context);
4927 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
4928 const struct kvm_mmu_role_regs *regs)
4930 struct kvm_mmu *context = &vcpu->arch.root_mmu;
4931 union kvm_mmu_role cpu_role = kvm_calc_cpu_role(vcpu, regs);
4932 union kvm_mmu_role mmu_role =
4933 kvm_calc_shadow_mmu_root_page_role(vcpu, regs);
4935 shadow_mmu_init_context(vcpu, context, cpu_role, mmu_role);
4938 static union kvm_mmu_role
4939 kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
4940 const struct kvm_mmu_role_regs *regs)
4942 union kvm_mmu_role role =
4943 kvm_calc_shadow_root_page_role_common(vcpu, regs);
4945 role.base.direct = false;
4946 role.base.level = kvm_mmu_get_tdp_level(vcpu);
4951 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
4952 unsigned long cr4, u64 efer, gpa_t nested_cr3)
4954 struct kvm_mmu *context = &vcpu->arch.guest_mmu;
4955 struct kvm_mmu_role_regs regs = {
4957 .cr4 = cr4 & ~X86_CR4_PKE,
4960 union kvm_mmu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
4961 union kvm_mmu_role mmu_role = kvm_calc_shadow_npt_root_page_role(vcpu, &regs);
4963 shadow_mmu_init_context(vcpu, context, cpu_role, mmu_role);
4964 kvm_mmu_new_pgd(vcpu, nested_cr3);
4966 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
4968 static union kvm_mmu_role
4969 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
4970 bool execonly, u8 level)
4972 union kvm_mmu_role role = {0};
4975 * KVM does not support SMM transfer monitors, and consequently does not
4976 * support the "entry to SMM" control either. role.base.smm is always 0.
4978 WARN_ON_ONCE(is_smm(vcpu));
4979 role.base.level = level;
4980 role.base.has_4_byte_gpte = false;
4981 role.base.direct = false;
4982 role.base.ad_disabled = !accessed_dirty;
4983 role.base.guest_mode = true;
4984 role.base.access = ACC_ALL;
4987 role.ext.execonly = execonly;
4993 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
4994 int huge_page_level, bool accessed_dirty,
4997 struct kvm_mmu *context = &vcpu->arch.guest_mmu;
4998 u8 level = vmx_eptp_page_walk_level(new_eptp);
4999 union kvm_mmu_role new_mode =
5000 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5003 if (new_mode.as_u64 != context->cpu_role.as_u64) {
5004 /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
5005 context->cpu_role.as_u64 = new_mode.as_u64;
5006 context->mmu_role.as_u64 = new_mode.as_u64;
5008 context->shadow_root_level = level;
5010 context->ept_ad = accessed_dirty;
5011 context->page_fault = ept_page_fault;
5012 context->gva_to_gpa = ept_gva_to_gpa;
5013 context->sync_page = ept_sync_page;
5014 context->invlpg = ept_invlpg;
5015 context->root_level = level;
5016 context->direct_map = false;
5017 update_permission_bitmask(context, true);
5018 context->pkru_mask = 0;
5019 reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
5020 reset_ept_shadow_zero_bits_mask(context, execonly);
5023 kvm_mmu_new_pgd(vcpu, new_eptp);
5025 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5027 static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
5028 const struct kvm_mmu_role_regs *regs)
5030 struct kvm_mmu *context = &vcpu->arch.root_mmu;
5032 kvm_init_shadow_mmu(vcpu, regs);
5034 context->get_guest_pgd = get_cr3;
5035 context->get_pdptr = kvm_pdptr_read;
5036 context->inject_page_fault = kvm_inject_page_fault;
5039 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
5040 const struct kvm_mmu_role_regs *regs)
5042 union kvm_mmu_role new_mode = kvm_calc_cpu_role(vcpu, regs);
5043 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5045 if (new_mode.as_u64 == g_context->cpu_role.as_u64)
5048 g_context->cpu_role.as_u64 = new_mode.as_u64;
5049 g_context->get_guest_pgd = get_cr3;
5050 g_context->get_pdptr = kvm_pdptr_read;
5051 g_context->inject_page_fault = kvm_inject_page_fault;
5052 g_context->root_level = new_mode.base.level;
5055 * L2 page tables are never shadowed, so there is no need to sync
5058 g_context->invlpg = NULL;
5061 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5062 * L1's nested page tables (e.g. EPT12). The nested translation
5063 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5064 * L2's page tables as the first level of translation and L1's
5065 * nested page tables as the second level of translation. Basically
5066 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5068 if (!is_paging(vcpu))
5069 g_context->gva_to_gpa = nonpaging_gva_to_gpa;
5070 else if (is_long_mode(vcpu))
5071 g_context->gva_to_gpa = paging64_gva_to_gpa;
5072 else if (is_pae(vcpu))
5073 g_context->gva_to_gpa = paging64_gva_to_gpa;
5075 g_context->gva_to_gpa = paging32_gva_to_gpa;
5077 reset_guest_paging_metadata(vcpu, g_context);
5080 void kvm_init_mmu(struct kvm_vcpu *vcpu)
5082 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
5084 if (mmu_is_nested(vcpu))
5085 init_kvm_nested_mmu(vcpu, &regs);
5086 else if (tdp_enabled)
5087 init_kvm_tdp_mmu(vcpu, &regs);
5089 init_kvm_softmmu(vcpu, &regs);
5091 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5093 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
5096 * Invalidate all MMU roles to force them to reinitialize as CPUID
5097 * information is factored into reserved bit calculations.
5099 * Correctly handling multiple vCPU models (with respect to paging and
5100 * physical address properties) in a single VM would require tracking
5101 * all relevant CPUID information in kvm_mmu_page_role. That is very
5102 * undesirable as it would increase the memory requirements for
5103 * gfn_track (see struct kvm_mmu_page_role comments). For now that
5104 * problem is swept under the rug; KVM's CPUID API is horrific and
5105 * it's all but impossible to solve it without introducing a new API.
5107 vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
5108 vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
5109 vcpu->arch.nested_mmu.mmu_role.ext.valid = 0;
5110 vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
5111 vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
5112 vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
5113 kvm_mmu_reset_context(vcpu);
5116 * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
5117 * kvm_arch_vcpu_ioctl().
5119 KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
5122 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5124 kvm_mmu_unload(vcpu);
5127 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5129 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5133 r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
5136 r = mmu_alloc_special_roots(vcpu);
5139 if (vcpu->arch.mmu->direct_map)
5140 r = mmu_alloc_direct_roots(vcpu);
5142 r = mmu_alloc_shadow_roots(vcpu);
5146 kvm_mmu_sync_roots(vcpu);
5148 kvm_mmu_load_pgd(vcpu);
5151 * Flush any TLB entries for the new root; the provenance of the root
5152 * is unknown. Even if KVM ensures there are no stale TLB entries
5153 * for a freed root, in theory another hypervisor could have left
5154 * stale entries. Flushing on alloc also allows KVM to skip the TLB
5155 * flush when freeing a root (see kvm_tdp_mmu_put_root()).
5157 static_call(kvm_x86_flush_tlb_current)(vcpu);
5162 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5164 struct kvm *kvm = vcpu->kvm;
5166 kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5167 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
5168 kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5169 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
5170 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
5173 static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
5175 struct kvm_mmu_page *sp;
5177 if (!VALID_PAGE(root_hpa))
5181 * When freeing obsolete roots, treat roots as obsolete if they don't
5182 * have an associated shadow page. This does mean KVM will get false
5183 * positives and free roots that don't strictly need to be freed, but
5184 * such false positives are relatively rare:
5186 * (a) only PAE paging and nested NPT have roots without shadow pages
5187 * (b) remote reloads due to a memslot update obsoletes _all_ roots
5188 * (c) KVM doesn't track previous roots for PAE paging, and the guest
5189 * is unlikely to zap an in-use PGD.
5191 sp = to_shadow_page(root_hpa);
5192 return !sp || is_obsolete_sp(kvm, sp);
5195 static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
5197 unsigned long roots_to_free = 0;
5200 if (is_obsolete_root(kvm, mmu->root.hpa))
5201 roots_to_free |= KVM_MMU_ROOT_CURRENT;
5203 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5204 if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
5205 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5209 kvm_mmu_free_roots(kvm, mmu, roots_to_free);
5212 void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
5214 __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
5215 __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
5218 static bool need_remote_flush(u64 old, u64 new)
5220 if (!is_shadow_present_pte(old))
5222 if (!is_shadow_present_pte(new))
5224 if ((old ^ new) & PT64_BASE_ADDR_MASK)
5226 old ^= shadow_nx_mask;
5227 new ^= shadow_nx_mask;
5228 return (old & ~new & PT64_PERM_MASK) != 0;
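/*
 * Illustrative note (editor's sketch): the XOR with shadow_nx_mask turns NX
 * (where "set" means less permission) into an execute-allowed bit, so the
 * final test fires only when a permission present in the old SPTE is missing
 * from the new one, e.g. making a writable SPTE read-only needs a remote
 * flush, while granting additional access does not.
 */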
5231 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5238 * Assume that the PTE write is on a page table of the same type
5239 * as the current vCPU paging mode, since we update the SPTEs only
5240 * when they have the same mode.
5242 if (is_pae(vcpu) && *bytes == 4) {
5243 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5248 if (*bytes == 4 || *bytes == 8) {
5249 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5258 * If we're seeing too many writes to a page, it may no longer be a page table,
5259 * or we may be forking, in which case it is better to unmap the page.
5261 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5264 * Skip write-flooding detection for an sp whose level is 1, because
5265 * it can become unsync, in which case the guest page is not write-protected.
5267 if (sp->role.level == PG_LEVEL_4K)
5270 atomic_inc(&sp->write_flooding_count);
5271 return atomic_read(&sp->write_flooding_count) >= 3;
5275 * Misaligned accesses are too much trouble to fix up; also, they usually
5276 * indicate a page is not used as a page table.
5278 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5281 unsigned offset, pte_size, misaligned;
5283 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5284 gpa, bytes, sp->role.word);
5286 offset = offset_in_page(gpa);
5287 pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
5290 * Sometimes, the OS only writes the last byte to update status
5291 * bits; for example, in Linux, the andb instruction is used in clear_bit().
5293 if (!(offset & (pte_size - 1)) && bytes == 1)
5296 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5297 misaligned |= bytes < 4;
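/*
 * Worked example (illustrative): with 8-byte guest PTEs, an 8-byte write at
 * page offset 4 gives (4 ^ 11) & ~7 == 8, i.e. the write straddles two PTEs
 * and is treated as misaligned, whereas an 8-byte write at offset 8 gives
 * (8 ^ 15) & ~7 == 0 and is accepted.
 */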
5302 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5304 unsigned page_offset, quadrant;
5308 page_offset = offset_in_page(gpa);
5309 level = sp->role.level;
5311 if (sp->role.has_4_byte_gpte) {
5312 page_offset <<= 1; /* 32->64 */
5314 * A 32-bit pde maps 4MB while the shadow pdes map
5315 * only 2MB. So we need to double the offset again
5316 * and zap two pdes instead of one.
5318 if (level == PT32_ROOT_LEVEL) {
5319 page_offset &= ~7; /* kill rounding error */
5323 quadrant = page_offset >> PAGE_SHIFT;
5324 page_offset &= ~PAGE_MASK;
5325 if (quadrant != sp->role.quadrant)
5329 spte = &sp->spt[page_offset / sizeof(*spte)];
5333 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5334 const u8 *new, int bytes,
5335 struct kvm_page_track_notifier_node *node)
5337 gfn_t gfn = gpa >> PAGE_SHIFT;
5338 struct kvm_mmu_page *sp;
5339 LIST_HEAD(invalid_list);
5340 u64 entry, gentry, *spte;
5345 * If we don't have indirect shadow pages, it means no page is
5346 * write-protected, so we can simply exit.
5348 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5351 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5354 * No need to care whether the memory allocation is successful
5355 * or not, since PTE prefetch is skipped if the cache does not
5356 * have enough objects.
5358 mmu_topup_memory_caches(vcpu, true);
5360 write_lock(&vcpu->kvm->mmu_lock);
5362 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5364 ++vcpu->kvm->stat.mmu_pte_write;
5366 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5367 if (detect_write_misaligned(sp, gpa, bytes) ||
5368 detect_write_flooding(sp)) {
5369 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5370 ++vcpu->kvm->stat.mmu_flooded;
5374 spte = get_written_sptes(sp, gpa, &npte);
5380 mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
5381 if (gentry && sp->role.level != PG_LEVEL_4K)
5382 ++vcpu->kvm->stat.mmu_pde_zapped;
5383 if (need_remote_flush(entry, *spte))
5388 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
5389 write_unlock(&vcpu->kvm->mmu_lock);
5392 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5393 void *insn, int insn_len)
5395 int r, emulation_type = EMULTYPE_PF;
5396 bool direct = vcpu->arch.mmu->direct_map;
5398 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
5399 return RET_PF_RETRY;
5402 if (unlikely(error_code & PFERR_RSVD_MASK)) {
5403 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5404 if (r == RET_PF_EMULATE)
5408 if (r == RET_PF_INVALID) {
5409 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5410 lower_32_bits(error_code), false);
5411 if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
5417 if (r != RET_PF_EMULATE)
5421 * Before emulating the instruction, check if the error code
5422 * was due to a RO violation while translating the guest page.
5423 * This can occur when using nested virtualization with nested
5424 * paging in both guests. If true, we simply unprotect the page
5425 * and resume the guest.
5427 if (vcpu->arch.mmu->direct_map &&
5428 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5429 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5434 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5435 * optimistically try to just unprotect the page and let the processor
5436 * re-execute the instruction that caused the page fault. Do not allow
5437 * retrying MMIO emulation, as it's not only pointless but could also
5438 * cause us to enter an infinite loop because the processor will keep
5439 * faulting on the non-existent MMIO address. Retrying an instruction
5440 * from a nested guest is also pointless and dangerous as we are only
5441 * explicitly shadowing L1's page tables, i.e. unprotecting something
5442 * for L1 isn't going to magically fix whatever issue caused L2 to fail.
5444 if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5445 emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
5447 return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5450 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5452 void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5453 gva_t gva, hpa_t root_hpa)
5457 /* It's actually a GPA for vcpu->arch.guest_mmu. */
5458 if (mmu != &vcpu->arch.guest_mmu) {
5459 /* INVLPG on a non-canonical address is a NOP according to the SDM. */
5460 if (is_noncanonical_address(gva, vcpu))
5463 static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5469 if (root_hpa == INVALID_PAGE) {
5470 mmu->invlpg(vcpu, gva, mmu->root.hpa);
5473 * INVLPG is required to invalidate any global mappings for the VA,
5474 * irrespective of PCID. Since it would take roughly the same amount of
5475 * work to determine whether any of the prev_root mappings of the VA is
5476 * marked global as it would to just sync it blindly, we might as well
5477 * just always sync it.
5479 * Mappings not reachable via the current cr3 or the prev_roots will be
5480 * synced when switching to that cr3, so nothing needs to be done here
5483 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5484 if (VALID_PAGE(mmu->prev_roots[i].hpa))
5485 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5487 mmu->invlpg(vcpu, gva, root_hpa);
5491 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5493 kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
5494 ++vcpu->stat.invlpg;
5496 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5499 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5501 struct kvm_mmu *mmu = vcpu->arch.mmu;
5502 bool tlb_flush = false;
5505 if (pcid == kvm_get_active_pcid(vcpu)) {
5506 mmu->invlpg(vcpu, gva, mmu->root.hpa);
5510 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5511 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5512 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
5513 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5519 static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5521 ++vcpu->stat.invlpg;
5524 * Mappings not reachable via the current cr3 or the prev_roots will be
5525 * synced when switching to that cr3, so nothing needs to be done here
5530 void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
5531 int tdp_max_root_level, int tdp_huge_page_level)
5533 tdp_enabled = enable_tdp;
5534 tdp_root_level = tdp_forced_root_level;
5535 max_tdp_level = tdp_max_root_level;
5538 * max_huge_page_level reflects KVM's MMU capabilities irrespective
5539 * of kernel support, e.g. KVM may be capable of using 1GB pages when
5540 * the kernel is not. But, KVM never creates a page size greater than
5541 * what is used by the kernel for any given HVA, i.e. the kernel's
5542 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
5545 max_huge_page_level = tdp_huge_page_level;
5546 else if (boot_cpu_has(X86_FEATURE_GBPAGES))
5547 max_huge_page_level = PG_LEVEL_1G;
5549 max_huge_page_level = PG_LEVEL_2M;
5551 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
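/*
 * Illustrative call (hypothetical values, not taken from a real vendor
 * module): a module with TDP enabled, no forced root level, 5-level paging
 * support and 1GiB huge pages might configure the MMU as:
 *
 *   kvm_configure_mmu(true, 0, 5, PG_LEVEL_1G);
 */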
5553 /* The return value indicates if tlb flush on all vcpus is needed. */
5554 typedef bool (*slot_level_handler) (struct kvm *kvm,
5555 struct kvm_rmap_head *rmap_head,
5556 const struct kvm_memory_slot *slot);
5558 /* The caller should hold mmu-lock before calling this function. */
5559 static __always_inline bool
5560 slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5561 slot_level_handler fn, int start_level, int end_level,
5562 gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
5565 struct slot_rmap_walk_iterator iterator;
5567 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5568 end_gfn, &iterator) {
5570 flush |= fn(kvm, iterator.rmap, memslot);
5572 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
5573 if (flush && flush_on_yield) {
5574 kvm_flush_remote_tlbs_with_address(kvm,
5576 iterator.gfn - start_gfn + 1);
5579 cond_resched_rwlock_write(&kvm->mmu_lock);
5586 static __always_inline bool
5587 slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5588 slot_level_handler fn, int start_level, int end_level,
5589 bool flush_on_yield)
5591 return slot_handle_level_range(kvm, memslot, fn, start_level,
5592 end_level, memslot->base_gfn,
5593 memslot->base_gfn + memslot->npages - 1,
5594 flush_on_yield, false);
5597 static __always_inline bool
5598 slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5599 slot_level_handler fn, bool flush_on_yield)
5601 return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
5602 PG_LEVEL_4K, flush_on_yield);
5605 static void free_mmu_pages(struct kvm_mmu *mmu)
5607 if (!tdp_enabled && mmu->pae_root)
5608 set_memory_encrypted((unsigned long)mmu->pae_root, 1);
5609 free_page((unsigned long)mmu->pae_root);
5610 free_page((unsigned long)mmu->pml4_root);
5611 free_page((unsigned long)mmu->pml5_root);
5614 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5619 mmu->root.hpa = INVALID_PAGE;
5621 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5622 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5624 /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
5625 if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
5629 * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5630 * while the PDP table is a per-vCPU construct that's allocated at MMU
5631 * creation. When emulating 32-bit mode, cr3 is only 32 bits even on
5632 * x86_64. Therefore we need to allocate the PDP table in the first
5633 * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
5634 * generally doesn't use PAE paging and can skip allocating the PDP
5635 * table. The main exception, handled here, is SVM's 32-bit NPT. The
5636 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
5637 * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
5639 if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5642 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5646 mmu->pae_root = page_address(page);
5649 * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
5650 * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
5651 * that KVM's writes and the CPU's reads get along. Note, this is
5652 * only necessary when using shadow paging, as 64-bit NPT can get at
5653 * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
5654 * by 32-bit kernels (when KVM itself uses 32-bit NPT).
5657 set_memory_decrypted((unsigned long)mmu->pae_root, 1);
5659 WARN_ON_ONCE(shadow_me_mask);
5661 for (i = 0; i < 4; ++i)
5662 mmu->pae_root[i] = INVALID_PAE_ROOT;
5667 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5671 vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
5672 vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
5674 vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
5675 vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
5677 vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
5679 vcpu->arch.mmu = &vcpu->arch.root_mmu;
5680 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5682 ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
5686 ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
5688 goto fail_allocate_root;
5692 free_mmu_pages(&vcpu->arch.guest_mmu);
5696 #define BATCH_ZAP_PAGES 10
5697 static void kvm_zap_obsolete_pages(struct kvm *kvm)
5699 struct kvm_mmu_page *sp, *node;
5700 int nr_zapped, batch = 0;
5703 list_for_each_entry_safe_reverse(sp, node,
5704 &kvm->arch.active_mmu_pages, link) {
5706 * No obsolete valid page exists before a newly created page
5707 * since active_mmu_pages is a FIFO list.
5709 if (!is_obsolete_sp(kvm, sp))
5713 * Invalid pages should never land back on the list of active
5714 * pages. Skip the bogus page, otherwise we'll get stuck in an
5715 * infinite loop if the page gets put back on the list (again).
5717 if (WARN_ON(sp->role.invalid))
5721 * No need to flush the TLB since we're only zapping shadow
5722 * pages with an obsolete generation number and all vCPUS have
5723 * loaded a new root, i.e. the shadow pages being zapped cannot
5724 * be in active use by the guest.
5726 if (batch >= BATCH_ZAP_PAGES &&
5727 cond_resched_rwlock_write(&kvm->mmu_lock)) {
5732 if (__kvm_mmu_prepare_zap_page(kvm, sp,
5733 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
5740 * Kick all vCPUs (via remote TLB flush) before freeing the page tables
5741 * to ensure KVM is not in the middle of a lockless shadow page table
5742 * walk, which may reference the pages. The remote TLB flush itself is
5743 * not required and is simply a convenient way to kick vCPUs as needed.
5744 * KVM performs a local TLB flush when allocating a new root (see
5745 * kvm_mmu_load()), and the reload in the caller ensures no vCPUs are
5746 * running with an obsolete MMU.
5748 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5752 * Fast-invalidate all shadow pages and use a lock-break technique
5753 * to zap obsolete pages.
5755 * It's required when a memslot is being deleted or the VM is being
5756 * destroyed; in these cases, we must ensure that the KVM MMU does
5757 * not use any resource of the slot being deleted, or of any slot,
5758 * after this function is called.
5760 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5762 lockdep_assert_held(&kvm->slots_lock);
5764 write_lock(&kvm->mmu_lock);
5765 trace_kvm_mmu_zap_all_fast(kvm);
5768 * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is
5769 * held for the entire duration of zapping obsolete pages, it's
5770 * impossible for there to be multiple invalid generations associated
5771 * with *valid* shadow pages at any given time, i.e. there is exactly
5772 * one valid generation and (at most) one invalid generation.
5774 kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5777 * In order to ensure all vCPUs drop their soon-to-be invalid roots,
5778 * invalidating TDP MMU roots must be done while holding mmu_lock for
5779 * write and in the same critical section as making the reload request,
5780 * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
5782 if (is_tdp_mmu_enabled(kvm))
5783 kvm_tdp_mmu_invalidate_all_roots(kvm);
5786 * Notify all vCPUs to reload their shadow page tables and flush the TLB.
5787 * Then all vCPUs will switch to the new shadow page table with the new
5790 * Note: we need to do this under the protection of mmu_lock;
5791 * otherwise, a vCPU could purge a shadow page but miss the TLB flush.
5793 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
5795 kvm_zap_obsolete_pages(kvm);
5797 write_unlock(&kvm->mmu_lock);
5800 * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
5801 * returning to the caller, e.g. if the zap is in response to a memslot
5802 * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
5803 * associated with the deleted memslot once the update completes.
5804 * Deferring the zap until the final reference to the root is put would
5805 * lead to use-after-free.
5807 if (is_tdp_mmu_enabled(kvm))
5808 kvm_tdp_mmu_zap_invalidated_roots(kvm);
5811 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5813 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5816 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5817 struct kvm_memory_slot *slot,
5818 struct kvm_page_track_notifier_node *node)
5820 kvm_mmu_zap_all_fast(kvm);
5823 int kvm_mmu_init_vm(struct kvm *kvm)
5825 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5828 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5829 INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
5830 INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
5831 spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
5833 r = kvm_mmu_init_tdp_mmu(kvm);
5837 node->track_write = kvm_mmu_pte_write;
5838 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5839 kvm_page_track_register_notifier(kvm, node);
5843 void kvm_mmu_uninit_vm(struct kvm *kvm)
5845 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5847 kvm_page_track_unregister_notifier(kvm, node);
5849 kvm_mmu_uninit_tdp_mmu(kvm);
5852 static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5854 const struct kvm_memory_slot *memslot;
5855 struct kvm_memslots *slots;
5856 struct kvm_memslot_iter iter;
5861 if (!kvm_memslots_have_rmaps(kvm))
5864 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5865 slots = __kvm_memslots(kvm, i);
5867 kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
5868 memslot = iter.slot;
5869 start = max(gfn_start, memslot->base_gfn);
5870 end = min(gfn_end, memslot->base_gfn + memslot->npages);
5871 if (WARN_ON_ONCE(start >= end))
5874 flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5876 PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
5877 start, end - 1, true, flush);
5885 * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
5886 * (not including it)
5888 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5893 if (WARN_ON_ONCE(gfn_end <= gfn_start))
5896 write_lock(&kvm->mmu_lock);
5898 kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
5900 flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end);
5902 if (is_tdp_mmu_enabled(kvm)) {
5903 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
5904 flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
5905 gfn_end, true, flush);
5909 kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
5910 gfn_end - gfn_start);
5912 kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
5914 write_unlock(&kvm->mmu_lock);
5917 static bool slot_rmap_write_protect(struct kvm *kvm,
5918 struct kvm_rmap_head *rmap_head,
5919 const struct kvm_memory_slot *slot)
5921 return rmap_write_protect(rmap_head, false);
5924 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5925 const struct kvm_memory_slot *memslot,
5930 if (kvm_memslots_have_rmaps(kvm)) {
5931 write_lock(&kvm->mmu_lock);
5932 flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
5933 start_level, KVM_MAX_HUGEPAGE_LEVEL,
5935 write_unlock(&kvm->mmu_lock);
5938 if (is_tdp_mmu_enabled(kvm)) {
5939 read_lock(&kvm->mmu_lock);
5940 flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
5941 read_unlock(&kvm->mmu_lock);
5945 * Flush TLBs if any SPTEs had to be write-protected to ensure that
5946 * guest writes are reflected in the dirty bitmap before the memslot
5947 * update completes, i.e. before enabling dirty logging is visible to
5950 * Perform the TLB flush outside the mmu_lock to reduce the amount of
5951 * time the lock is held. However, this does mean that another CPU can
5952 * now grab mmu_lock and encounter a write-protected SPTE while CPUs
5953 * still have a writable mapping for the associated GFN in their TLB.
5955 * This is safe but requires KVM to be careful when making decisions
5956 * based on the write-protection status of an SPTE. Specifically, KVM
5957 * also write-protects SPTEs to monitor changes to guest page tables
5958 * during shadow paging, and must guarantee no CPUs can write to those
5959 * pages before the lock is dropped. As mentioned in the previous
5960 * paragraph, a write-protected SPTE is no guarantee that CPUs cannot
5961 * perform writes. So to determine if a TLB flush is truly required, KVM
5962 * will clear a separate software-only bit (MMU-writable) and skip the
5963 * flush if-and-only-if this bit was already clear.
5965 * See is_writable_pte() for more details.
5968 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
5971 /* Must be called with the mmu_lock held in write-mode. */
5972 void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
5973 const struct kvm_memory_slot *memslot,
5977 if (is_tdp_mmu_enabled(kvm))
5978 kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
5979 target_level, false);
5982 * A TLB flush is unnecessary at this point for the same reasons as in
5983 * kvm_mmu_slot_try_split_huge_pages().
5987 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
5988 const struct kvm_memory_slot *memslot,
5991 u64 start = memslot->base_gfn;
5992 u64 end = start + memslot->npages;
5994 if (is_tdp_mmu_enabled(kvm)) {
5995 read_lock(&kvm->mmu_lock);
5996 kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
5997 read_unlock(&kvm->mmu_lock);
6001 * No TLB flush is necessary here. KVM will flush TLBs after
6002 * write-protecting and/or clearing dirty on the newly split SPTEs to
6003 * ensure that guest writes are reflected in the dirty log before the
6004 * ioctl to enable dirty logging on this memslot completes. Since the
6005 * split SPTEs retain the write and dirty bits of the huge SPTE, it is
6006 * safe for KVM to decide if a TLB flush is necessary based on the split
6011 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
6012 struct kvm_rmap_head *rmap_head,
6013 const struct kvm_memory_slot *slot)
6016 struct rmap_iterator iter;
6017 int need_tlb_flush = 0;
6019 struct kvm_mmu_page *sp;
6022 for_each_rmap_spte(rmap_head, &iter, sptep) {
6023 sp = sptep_to_sp(sptep);
6024 pfn = spte_to_pfn(*sptep);
6027 * We cannot do huge page mapping for indirect shadow pages,
6028 * which are found on the last rmap (level = 1) when not using
6029 * tdp; such shadow pages are synced with the page table in
6030 * the guest, and the guest page table is using 4K page size
6031 * mapping if the indirect sp has level = 1.
6033 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
6034 sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
6035 pfn, PG_LEVEL_NUM)) {
6036 pte_list_remove(kvm, rmap_head, sptep);
6038 if (kvm_available_flush_tlb_with_range())
6039 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
6040 KVM_PAGES_PER_HPAGE(sp->role.level));
6048 return need_tlb_flush;
6051 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
6052 const struct kvm_memory_slot *slot)
6054 if (kvm_memslots_have_rmaps(kvm)) {
6055 write_lock(&kvm->mmu_lock);
6057 * Zap only 4k SPTEs since the legacy MMU only supports dirty
6058 * logging at a 4k granularity and never creates collapsible
6059 * 2m SPTEs during dirty logging.
6061 if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
6062 kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
6063 write_unlock(&kvm->mmu_lock);
6066 if (is_tdp_mmu_enabled(kvm)) {
6067 read_lock(&kvm->mmu_lock);
6068 kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
6069 read_unlock(&kvm->mmu_lock);
6073 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
6074 const struct kvm_memory_slot *memslot)
6077 * All current use cases for flushing the TLBs for a specific memslot
6078 * are related to dirty logging, and many do the TLB flush out of mmu_lock.
6079 * The interaction between the various operations on a memslot must be
6080 * serialized by slots_lock to ensure the TLB flush from one operation
6081 * is observed by any other operation on the same memslot.
6083 lockdep_assert_held(&kvm->slots_lock);
6084 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6088 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6089 const struct kvm_memory_slot *memslot)
6093 if (kvm_memslots_have_rmaps(kvm)) {
6094 write_lock(&kvm->mmu_lock);
6096 * Clear dirty bits only on 4k SPTEs since the legacy MMU only
6097 * supports dirty logging at a 4k granularity.
6099 flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
6100 write_unlock(&kvm->mmu_lock);
6103 if (is_tdp_mmu_enabled(kvm)) {
6104 read_lock(&kvm->mmu_lock);
6105 flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
6106 read_unlock(&kvm->mmu_lock);
6110 * It's also safe to flush TLBs out of mmu lock here as currently this
6111 * function is only used for dirty logging, in which case flushing TLB
6112 * out of mmu lock also guarantees no dirty pages will be lost in
6116 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
6119 void kvm_mmu_zap_all(struct kvm *kvm)
6121 struct kvm_mmu_page *sp, *node;
6122 LIST_HEAD(invalid_list);
6125 write_lock(&kvm->mmu_lock);
6127 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6128 if (WARN_ON(sp->role.invalid))
6130 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6132 if (cond_resched_rwlock_write(&kvm->mmu_lock))
6136 kvm_mmu_commit_zap_page(kvm, &invalid_list);
6138 if (is_tdp_mmu_enabled(kvm))
6139 kvm_tdp_mmu_zap_all(kvm);
6141 write_unlock(&kvm->mmu_lock);
6144 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6146 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6148 gen &= MMIO_SPTE_GEN_MASK;
6151 * Generation numbers are incremented in multiples of the number of
6152 * address spaces in order to provide unique generations across all
6153 * address spaces. Strip what is effectively the address space
6154 * modifier prior to checking for a wrap of the MMIO generation so
6155 * that a wrap in any address space is detected.
6157 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6160 * The very rare case: if the MMIO generation number has wrapped,
6161 * zap all shadow pages.
6163 if (unlikely(gen == 0)) {
6164 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
6165 kvm_mmu_zap_all_fast(kvm);
6169 static unsigned long
6170 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
6173 int nr_to_scan = sc->nr_to_scan;
6174 unsigned long freed = 0;
6176 mutex_lock(&kvm_lock);
6178 list_for_each_entry(kvm, &vm_list, vm_list) {
6180 LIST_HEAD(invalid_list);
6183 * Never scan more than sc->nr_to_scan VM instances.
6184 * In practice we will not hit this condition since we do not try
6185 * to shrink more than one VM and it is very unlikely to see
6186 * !n_used_mmu_pages so many times.
6191 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
6192 * here. We may skip a VM instance erroneously, but we do not
6193 * want to shrink a VM that only started to populate its MMU
6196 if (!kvm->arch.n_used_mmu_pages &&
6197 !kvm_has_zapped_obsolete_pages(kvm))
6200 idx = srcu_read_lock(&kvm->srcu);
6201 write_lock(&kvm->mmu_lock);
6203 if (kvm_has_zapped_obsolete_pages(kvm)) {
6204 kvm_mmu_commit_zap_page(kvm,
6205 &kvm->arch.zapped_obsolete_pages);
6209 freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
6212 write_unlock(&kvm->mmu_lock);
6213 srcu_read_unlock(&kvm->srcu, idx);
6216 * unfair on small ones
6217 * per-vm shrinkers cry out
6218 * sadness comes quickly
6220 list_move_tail(&kvm->vm_list, &vm_list);
6224 mutex_unlock(&kvm_lock);
6228 static unsigned long
6229 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6231 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6234 static struct shrinker mmu_shrinker = {
6235 .count_objects = mmu_shrink_count,
6236 .scan_objects = mmu_shrink_scan,
6237 .seeks = DEFAULT_SEEKS * 10,
6240 static void mmu_destroy_caches(void)
6242 kmem_cache_destroy(pte_list_desc_cache);
6243 kmem_cache_destroy(mmu_page_header_cache);
6246 static bool get_nx_auto_mode(void)
6248 /* Return true when CPU has the bug, and mitigations are ON */
6249 return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6252 static void __set_nx_huge_pages(bool val)
6254 nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6257 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6259 bool old_val = nx_huge_pages;
6262 /* In "auto" mode deploy workaround only if CPU has the bug. */
6263 if (sysfs_streq(val, "off"))
6265 else if (sysfs_streq(val, "force"))
6267 else if (sysfs_streq(val, "auto"))
6268 new_val = get_nx_auto_mode();
6269 else if (strtobool(val, &new_val) < 0)
6272 __set_nx_huge_pages(new_val);
6274 if (new_val != old_val) {
6277 mutex_lock(&kvm_lock);
6279 list_for_each_entry(kvm, &vm_list, vm_list) {
6280 mutex_lock(&kvm->slots_lock);
6281 kvm_mmu_zap_all_fast(kvm);
6282 mutex_unlock(&kvm->slots_lock);
6284 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6286 mutex_unlock(&kvm_lock);
6293 * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
6294 * its default value of -1 is technically undefined behavior for a boolean.
6296 void kvm_mmu_x86_module_init(void)
6298 if (nx_huge_pages == -1)
6299 __set_nx_huge_pages(get_nx_auto_mode());
6303 * The bulk of the MMU initialization is deferred until the vendor module is
6304 * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
6305 * to be reset when a potentially different vendor module is loaded.
6307 int kvm_mmu_vendor_module_init(void)
6312 * MMU roles use union aliasing which is, generally speaking, an
6313 * undefined behavior. However, we supposedly know how compilers behave
6314 * and the current status quo is unlikely to change. Guardians below are
6315 * supposed to let us know if the assumption becomes false.
6317 BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6318 BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6319 BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
6321 kvm_mmu_reset_all_pte_masks();
6323 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6324 sizeof(struct pte_list_desc),
6325 0, SLAB_ACCOUNT, NULL);
6326 if (!pte_list_desc_cache)
6329 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6330 sizeof(struct kvm_mmu_page),
6331 0, SLAB_ACCOUNT, NULL);
6332 if (!mmu_page_header_cache)
6335 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6338 ret = register_shrinker(&mmu_shrinker);
6345 mmu_destroy_caches();
6349 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6351 kvm_mmu_unload(vcpu);
6352 free_mmu_pages(&vcpu->arch.root_mmu);
6353 free_mmu_pages(&vcpu->arch.guest_mmu);
6354 mmu_free_memory_caches(vcpu);
6357 void kvm_mmu_vendor_module_exit(void)
6359 mmu_destroy_caches();
6360 percpu_counter_destroy(&kvm_total_used_mmu_pages);
6361 unregister_shrinker(&mmu_shrinker);
6365 * Calculate the effective recovery period, accounting for '0' meaning "let KVM
6366 * select a halving time of 1 hour". Returns true if recovery is enabled.
6368 static bool calc_nx_huge_pages_recovery_period(uint *period)
6371 * Use READ_ONCE to get the params; this may be called outside of the
6372 * param setters, e.g. by the kthread to compute its next timeout.
6374 bool enabled = READ_ONCE(nx_huge_pages);
6375 uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6377 if (!enabled || !ratio)
6380 *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
6382 /* Make sure the period is not less than one second. */
6383 ratio = min(ratio, 3600u);
6384 *period = 60 * 60 * 1000 / ratio;
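/*
 * Worked example (illustrative): with the default ratio of 60 and no explicit
 * period, *period becomes 60 * 60 * 1000 / 60 == 60000 ms, i.e. 1/60th of the
 * nominal one-hour halving time; capping the ratio at 3600 keeps the computed
 * period at or above one second.
 */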
6389 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
6391 bool was_recovery_enabled, is_recovery_enabled;
6392 uint old_period, new_period;
6395 was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
6397 err = param_set_uint(val, kp);
6401 is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
6403 if (is_recovery_enabled &&
6404 (!was_recovery_enabled || old_period > new_period)) {
6407 mutex_lock(&kvm_lock);
6409 list_for_each_entry(kvm, &vm_list, vm_list)
6410 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6412 mutex_unlock(&kvm_lock);
6418 static void kvm_recover_nx_lpages(struct kvm *kvm)
6420 unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
6422 struct kvm_mmu_page *sp;
6424 LIST_HEAD(invalid_list);
6428 rcu_idx = srcu_read_lock(&kvm->srcu);
6429 write_lock(&kvm->mmu_lock);
6432 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
6433 * be done under RCU protection, because the pages are freed via RCU
6438 ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6439 to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
6440 for ( ; to_zap; --to_zap) {
6441 if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
6445 * We use a separate list instead of just using active_mmu_pages
6446 * because the number of lpage_disallowed pages is expected to
6447 * be relatively small compared to the total.
6449 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6450 struct kvm_mmu_page,
6451 lpage_disallowed_link);
6452 WARN_ON_ONCE(!sp->lpage_disallowed);
6453 if (is_tdp_mmu_page(sp)) {
6454 flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
6456 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6457 WARN_ON_ONCE(sp->lpage_disallowed);
6460 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
6461 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6464 cond_resched_rwlock_write(&kvm->mmu_lock);
6470 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6474 write_unlock(&kvm->mmu_lock);
6475 srcu_read_unlock(&kvm->srcu, rcu_idx);
6478 static long get_nx_lpage_recovery_timeout(u64 start_time)
6483 enabled = calc_nx_huge_pages_recovery_period(&period);
6485 return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
6486 : MAX_SCHEDULE_TIMEOUT;
6489 static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6492 long remaining_time;
6495 start_time = get_jiffies_64();
6496 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6498 set_current_state(TASK_INTERRUPTIBLE);
6499 while (!kthread_should_stop() && remaining_time > 0) {
6500 schedule_timeout(remaining_time);
6501 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6502 set_current_state(TASK_INTERRUPTIBLE);
6505 set_current_state(TASK_RUNNING);
6507 if (kthread_should_stop())
6510 kvm_recover_nx_lpages(kvm);
6514 int kvm_mmu_post_init_vm(struct kvm *kvm)
6518 err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6519 "kvm-nx-lpage-recovery",
6520 &kvm->arch.nx_lpage_recovery_thread);
6522 kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6527 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6529 if (kvm->arch.nx_lpage_recovery_thread)
6530 kthread_stop(kvm->arch.nx_lpage_recovery_thread);