1 // SPDX-License-Identifier: GPL-2.0-only
3 * Kernel-based Virtual Machine driver for Linux
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
10 * Copyright (C) 2006 Qumranet, Inc.
11 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
14 * Yaniv Kamay <yaniv@qumranet.com>
15 * Avi Kivity <avi@qumranet.com>
21 #include "mmu_internal.h"
24 #include "kvm_cache_regs.h"
25 #include "kvm_emulate.h"
29 #include <linux/kvm_host.h>
30 #include <linux/types.h>
31 #include <linux/string.h>
33 #include <linux/highmem.h>
34 #include <linux/moduleparam.h>
35 #include <linux/export.h>
36 #include <linux/swap.h>
37 #include <linux/hugetlb.h>
38 #include <linux/compiler.h>
39 #include <linux/srcu.h>
40 #include <linux/slab.h>
41 #include <linux/sched/signal.h>
42 #include <linux/uaccess.h>
43 #include <linux/hash.h>
44 #include <linux/kern_levels.h>
45 #include <linux/kthread.h>
48 #include <asm/memtype.h>
49 #include <asm/cmpxchg.h>
51 #include <asm/set_memory.h>
53 #include <asm/kvm_page_track.h>
58 extern bool itlb_multihit_kvm_mitigation;
60 int __read_mostly nx_huge_pages = -1;
61 static uint __read_mostly nx_huge_pages_recovery_period_ms;
62 #ifdef CONFIG_PREEMPT_RT
63 /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
64 static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
66 static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
69 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
70 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
72 static const struct kernel_param_ops nx_huge_pages_ops = {
73 .set = set_nx_huge_pages,
74 .get = param_get_bool,
77 static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
78 .set = set_nx_huge_pages_recovery_param,
79 .get = param_get_uint,
82 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
83 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
84 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
85 &nx_huge_pages_recovery_ratio, 0644);
86 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
87 module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
88 &nx_huge_pages_recovery_period_ms, 0644);
89 __MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
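/*
 * Note: nx_huge_pages = -1 means "not explicitly set". Its setter,
 * set_nx_huge_pages() (declared above, body not shown here), treats that
 * default as "auto": the NX huge page mitigation is enabled only when the
 * host is affected by the iTLB multihit erratum (see
 * itlb_multihit_kvm_mitigation above).
 */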
91 static bool __read_mostly force_flush_and_sync_on_reuse;
92 module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
95 * Setting this variable to true enables Two-Dimensional Paging (TDP),
96 * where the hardware walks 2 page tables:
97 * 1. the guest-virtual to guest-physical
98 * 2. while doing 1. it also walks guest-physical to host-physical
99 * If the hardware supports that, we don't need to do shadow paging.
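 * (i.e. with TDP the guest manages its own page tables, and KVM only needs
 * to maintain the guest-physical -> host-physical translation in hardware,
 * e.g. via EPT on Intel or NPT on AMD.)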
101 bool tdp_enabled = false;
103 static int max_huge_page_level __read_mostly;
104 static int tdp_root_level __read_mostly;
105 static int max_tdp_level __read_mostly;
109 module_param(dbg, bool, 0644);
112 #define PTE_PREFETCH_NUM 8
114 #define PT32_LEVEL_BITS 10
116 #define PT32_LEVEL_SHIFT(level) \
117 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
119 #define PT32_LVL_OFFSET_MASK(level) \
120 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
121 * PT32_LEVEL_BITS))) - 1))
123 #define PT32_INDEX(address, level)\
124 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
127 #define PT32_BASE_ADDR_MASK PAGE_MASK
128 #define PT32_DIR_BASE_ADDR_MASK \
129 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
130 #define PT32_LVL_ADDR_MASK(level) \
131 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
132 * PT32_LEVEL_BITS))) - 1))
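/*
 * For example, for 32-bit non-PAE guest paging: PT32_LEVEL_SHIFT(1) == 12 and
 * PT32_LEVEL_SHIFT(2) == 22, so PT32_INDEX(addr, N) extracts the 10-bit index
 * for level N, and PT32_LVL_ADDR_MASK(2) masks an address down to its 4 MiB
 * (large page) boundary.
 */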
134 #include <trace/events/kvm.h>
136 /* make pte_list_desc fit well in cache lines */
137 #define PTE_LIST_EXT 14
140 * Slight optimization of cacheline layout: by putting `more' and `spte_count'
141 * at the start, accessing the descriptor only touches a single cacheline for
142 * either the full (entries == PTE_LIST_EXT) case or the entries <= 6 case.
144 struct pte_list_desc {
145 struct pte_list_desc *more;
147 * Number of entries stored in this pte_list_desc. It does not need to be
148 * a u64, but u64 makes the alignment simpler. A value of PTE_LIST_EXT means full.
151 u64 *sptes[PTE_LIST_EXT];
154 struct kvm_shadow_walk_iterator {
162 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
163 for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
165 shadow_walk_okay(&(_walker)); \
166 shadow_walk_next(&(_walker)))
168 #define for_each_shadow_entry(_vcpu, _addr, _walker) \
169 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
170 shadow_walk_okay(&(_walker)); \
171 shadow_walk_next(&(_walker)))
173 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
174 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
175 shadow_walk_okay(&(_walker)) && \
176 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
177 __shadow_walk_next(&(_walker), spte))
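/*
 * The ({ ...; 1; }) statement expression in the lockless variant snapshots the
 * current SPTE into @spte on every iteration, so __shadow_walk_next() advances
 * using the value that was actually observed rather than re-reading a possibly
 * changed SPTE.
 */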
179 static struct kmem_cache *pte_list_desc_cache;
180 struct kmem_cache *mmu_page_header_cache;
181 static struct percpu_counter kvm_total_used_mmu_pages;
183 static void mmu_spte_set(u64 *sptep, u64 spte);
184 static union kvm_mmu_page_role
185 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
187 struct kvm_mmu_role_regs {
188 const unsigned long cr0;
189 const unsigned long cr4;
193 #define CREATE_TRACE_POINTS
194 #include "mmutrace.h"
197 * Yes, lots of underscores. They're a hint that you probably shouldn't be
198 * reading from the role_regs. Once the mmu_role is constructed, it becomes
199 * the single source of truth for the MMU's state.
201 #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
202 static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
204 return !!(regs->reg & flag); \
206 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
207 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
208 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
209 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
210 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
211 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
212 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
213 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
214 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
215 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
218 * The MMU itself (with a valid role) is the single source of truth for the
219 * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The
220 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
221 * and the vCPU may be incorrect/irrelevant.
223 #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
224 static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
226 return !!(mmu->mmu_role. base_or_ext . reg##_##name); \
228 BUILD_MMU_ROLE_ACCESSOR(ext, cr0, pg);
229 BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
230 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
231 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pae);
232 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
233 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
234 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
235 BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
236 BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
238 static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
240 struct kvm_mmu_role_regs regs = {
241 .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
242 .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
243 .efer = vcpu->arch.efer,
249 static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs)
251 if (!____is_cr0_pg(regs))
253 else if (____is_efer_lma(regs))
254 return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL :
256 else if (____is_cr4_pae(regs))
257 return PT32E_ROOT_LEVEL;
259 return PT32_ROOT_LEVEL;
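/*
 * In other words, role_regs_to_root_level() maps the register state to a
 * shadow root level: paging disabled -> 0, long mode -> 4- or 5-level
 * (depending on CR4.LA57), PAE -> 3-level, and legacy 32-bit -> 2-level.
 */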
262 static inline bool kvm_available_flush_tlb_with_range(void)
264 return kvm_x86_ops.tlb_remote_flush_with_range;
267 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
268 struct kvm_tlb_range *range)
272 if (range && kvm_x86_ops.tlb_remote_flush_with_range)
273 ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
276 kvm_flush_remote_tlbs(kvm);
279 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
280 u64 start_gfn, u64 pages)
282 struct kvm_tlb_range range;
284 range.start_gfn = start_gfn;
287 kvm_flush_remote_tlbs_with_range(kvm, &range);
290 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
293 u64 spte = make_mmio_spte(vcpu, gfn, access);
295 trace_mark_mmio_spte(sptep, gfn, spte);
296 mmu_spte_set(sptep, spte);
299 static gfn_t get_mmio_spte_gfn(u64 spte)
301 u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
303 gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
304 & shadow_nonpresent_or_rsvd_mask;
306 return gpa >> PAGE_SHIFT;
309 static unsigned get_mmio_spte_access(u64 spte)
311 return spte & shadow_mmio_access_mask;
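/*
 * MMIO SPTEs embed the memslot generation at the time they were created.
 * check_mmio_spte() compares that cached generation against the current
 * memslot generation, so stale MMIO SPTEs are rejected (and re-faulted)
 * after the memslots change.
 */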
314 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
316 u64 kvm_gen, spte_gen, gen;
318 gen = kvm_vcpu_memslots(vcpu)->generation;
319 if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
322 kvm_gen = gen & MMIO_SPTE_GEN_MASK;
323 spte_gen = get_mmio_spte_generation(spte);
325 trace_check_mmio_spte(spte, kvm_gen, spte_gen);
326 return likely(kvm_gen == spte_gen);
329 static int is_cpuid_PSE36(void)
334 static gfn_t pse36_gfn_delta(u32 gpte)
336 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
338 return (gpte & PT32_DIR_PSE36_MASK) << shift;
342 static void __set_spte(u64 *sptep, u64 spte)
344 WRITE_ONCE(*sptep, spte);
347 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
349 WRITE_ONCE(*sptep, spte);
352 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
354 return xchg(sptep, spte);
357 static u64 __get_spte_lockless(u64 *sptep)
359 return READ_ONCE(*sptep);
370 static void count_spte_clear(u64 *sptep, u64 spte)
372 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
374 if (is_shadow_present_pte(spte))
377 /* Ensure the spte is completely set before we increase the count */
379 sp->clear_spte_count++;
382 static void __set_spte(u64 *sptep, u64 spte)
384 union split_spte *ssptep, sspte;
386 ssptep = (union split_spte *)sptep;
387 sspte = (union split_spte)spte;
389 ssptep->spte_high = sspte.spte_high;
392 * If we map the spte from nonpresent to present, we should store
393 * the high bits first and only then set the present bit, so the CPU cannot
394 * fetch this spte while we are setting it.
398 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
401 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
403 union split_spte *ssptep, sspte;
405 ssptep = (union split_spte *)sptep;
406 sspte = (union split_spte)spte;
408 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
411 * If we map the spte from present to nonpresent, we should clear
412 * the present bit first to avoid the vCPU fetching the old high bits.
416 ssptep->spte_high = sspte.spte_high;
417 count_spte_clear(sptep, spte);
420 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
422 union split_spte *ssptep, sspte, orig;
424 ssptep = (union split_spte *)sptep;
425 sspte = (union split_spte)spte;
427 /* xchg acts as a barrier before the setting of the high bits */
428 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
429 orig.spte_high = ssptep->spte_high;
430 ssptep->spte_high = sspte.spte_high;
431 count_spte_clear(sptep, spte);
437 * The idea of using this lightweight way to get the spte on x86_32 is from
438 * gup_get_pte (mm/gup.c).
440 * An spte TLB flush may be pending, because kvm_set_pte_rmapp
441 * coalesces them and we are running outside of the MMU lock. Therefore
442 * we need to protect against in-progress updates of the spte.
444 * Reading the spte while an update is in progress may get the old value
445 * for the high part of the spte. The race is fine for a present->non-present
446 * change (because the high part of the spte is ignored for non-present spte),
447 * but for a present->present change we must reread the spte.
449 * All such changes are done in two steps (present->non-present and
450 * non-present->present), hence it is enough to count the number of
451 * present->non-present updates: if it changed while reading the spte,
452 * we might have hit the race. This is done using clear_spte_count.
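 * Concretely: snapshot clear_spte_count, read the low half, then the high
 * half, and retry if either the low half or the count changed in between.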
454 static u64 __get_spte_lockless(u64 *sptep)
456 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
457 union split_spte spte, *orig = (union split_spte *)sptep;
461 count = sp->clear_spte_count;
464 spte.spte_low = orig->spte_low;
467 spte.spte_high = orig->spte_high;
470 if (unlikely(spte.spte_low != orig->spte_low ||
471 count != sp->clear_spte_count))
478 static bool spte_has_volatile_bits(u64 spte)
480 if (!is_shadow_present_pte(spte))
484 * Always atomically update the spte if it can be updated
485 * outside of the mmu-lock; this ensures the dirty bit is not lost
486 * and also gives us a stable is_writable_pte()
487 * so that a needed TLB flush is not missed.
489 if (spte_can_locklessly_be_made_writable(spte) ||
490 is_access_track_spte(spte))
493 if (spte_ad_enabled(spte)) {
494 if ((spte & shadow_accessed_mask) == 0 ||
495 (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
502 /* Rules for using mmu_spte_set:
503 * Set the sptep from nonpresent to present.
504 * Note: the sptep being assigned *must* be either not present
505 * or in a state where the hardware will not attempt to update the spte.
508 static void mmu_spte_set(u64 *sptep, u64 new_spte)
510 WARN_ON(is_shadow_present_pte(*sptep));
511 __set_spte(sptep, new_spte);
515 * Update the SPTE (excluding the PFN), but do not track changes in its
516 * accessed/dirty status.
518 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
520 u64 old_spte = *sptep;
522 WARN_ON(!is_shadow_present_pte(new_spte));
523 check_spte_writable_invariants(new_spte);
525 if (!is_shadow_present_pte(old_spte)) {
526 mmu_spte_set(sptep, new_spte);
530 if (!spte_has_volatile_bits(old_spte))
531 __update_clear_spte_fast(sptep, new_spte);
533 old_spte = __update_clear_spte_slow(sptep, new_spte);
535 WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
540 /* Rules for using mmu_spte_update:
541 * Update the state bits; the mapped pfn is not changed.
543 * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
544 * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
545 * spte, even though the writable spte might be cached on a CPU's TLB.
547 * Returns true if the TLB needs to be flushed
549 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
552 u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
554 if (!is_shadow_present_pte(old_spte))
558 * An spte updated outside of the mmu-lock is safe, since
559 * we always update it atomically; see the comments in
560 * spte_has_volatile_bits().
562 if (spte_can_locklessly_be_made_writable(old_spte) &&
563 !is_writable_pte(new_spte))
567 * Flush TLB when accessed/dirty states are changed in the page tables,
568 * to guarantee consistency between TLB and page tables.
571 if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
573 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
576 if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
578 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
585 * Rules for using mmu_spte_clear_track_bits:
586 * It sets the sptep from present to nonpresent and tracks the
587 * state bits; it is used to clear a last-level sptep.
588 * Returns the old PTE.
590 static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
593 u64 old_spte = *sptep;
594 int level = sptep_to_sp(sptep)->role.level;
596 if (!spte_has_volatile_bits(old_spte))
597 __update_clear_spte_fast(sptep, 0ull);
599 old_spte = __update_clear_spte_slow(sptep, 0ull);
601 if (!is_shadow_present_pte(old_spte))
604 kvm_update_page_stats(kvm, level, -1);
606 pfn = spte_to_pfn(old_spte);
609 * KVM does not hold a refcount on the pages used by
610 * the KVM MMU; before reclaiming a page, it must be
611 * unmapped from the MMU first.
613 WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
615 if (is_accessed_spte(old_spte))
616 kvm_set_pfn_accessed(pfn);
618 if (is_dirty_spte(old_spte))
619 kvm_set_pfn_dirty(pfn);
625 * Rules for using mmu_spte_clear_no_track:
626 * Directly clear the spte without caring about the state bits of the sptep;
627 * it is used when clearing an upper-level spte.
629 static void mmu_spte_clear_no_track(u64 *sptep)
631 __update_clear_spte_fast(sptep, 0ull);
634 static u64 mmu_spte_get_lockless(u64 *sptep)
636 return __get_spte_lockless(sptep);
639 /* Returns the Accessed status of the PTE and resets it at the same time. */
640 static bool mmu_spte_age(u64 *sptep)
642 u64 spte = mmu_spte_get_lockless(sptep);
644 if (!is_accessed_spte(spte))
647 if (spte_ad_enabled(spte)) {
648 clear_bit((ffs(shadow_accessed_mask) - 1),
649 (unsigned long *)sptep);
652 * Capture the dirty status of the page, so that it doesn't get
653 * lost when the SPTE is marked for access tracking.
655 if (is_writable_pte(spte))
656 kvm_set_pfn_dirty(spte_to_pfn(spte));
658 spte = mark_spte_for_access_track(spte);
659 mmu_spte_update_no_track(sptep, spte);
665 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
667 if (is_tdp_mmu(vcpu->arch.mmu)) {
668 kvm_tdp_mmu_walk_lockless_begin();
671 * Prevent page table teardown by making any free-er wait during
672 * kvm_flush_remote_tlbs() IPI to all active vcpus.
677 * Make sure a following spte read is not reordered ahead of the write to vcpu->mode.
680 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
684 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
686 if (is_tdp_mmu(vcpu->arch.mmu)) {
687 kvm_tdp_mmu_walk_lockless_end();
690 * Make sure the write to vcpu->mode is not reordered in front of
691 * reads to sptes. If it is reordered, kvm_mmu_commit_zap_page() can see us
692 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
694 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
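/*
 * Top up the vCPU's MMU memory caches before taking mmu_lock: the page-fault
 * path cannot allocate while holding the lock, so the objects it might need
 * (pte_list_descs, shadow pages, gfn arrays and page headers) are
 * pre-allocated here.
 */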
699 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
703 /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
704 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
705 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
708 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
709 PT64_ROOT_MAX_LEVEL);
712 if (maybe_indirect) {
713 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
714 PT64_ROOT_MAX_LEVEL);
718 return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
719 PT64_ROOT_MAX_LEVEL);
722 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
724 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
725 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
726 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
727 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
730 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
732 return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
735 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
737 kmem_cache_free(pte_list_desc_cache, pte_list_desc);
740 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
742 if (!sp->role.direct)
743 return sp->gfns[index];
745 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
748 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
750 if (!sp->role.direct) {
751 sp->gfns[index] = gfn;
755 if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
756 pr_err_ratelimited("gfn mismatch under direct page %llx "
757 "(expected %llx, got %llx)\n",
759 kvm_mmu_page_get_gfn(sp, index), gfn);
763 * Return the pointer to the large page information for a given gfn,
764 * handling slots that are not large page aligned.
766 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
767 const struct kvm_memory_slot *slot, int level)
771 idx = gfn_to_index(gfn, slot->base_gfn, level);
772 return &slot->arch.lpage_info[level - 2][idx];
775 static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
776 gfn_t gfn, int count)
778 struct kvm_lpage_info *linfo;
781 for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
782 linfo = lpage_info_slot(gfn, slot, i);
783 linfo->disallow_lpage += count;
784 WARN_ON(linfo->disallow_lpage < 0);
788 void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
790 update_gfn_disallow_lpage_count(slot, gfn, 1);
793 void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
795 update_gfn_disallow_lpage_count(slot, gfn, -1);
798 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
800 struct kvm_memslots *slots;
801 struct kvm_memory_slot *slot;
804 kvm->arch.indirect_shadow_pages++;
806 slots = kvm_memslots_for_spte_role(kvm, sp->role);
807 slot = __gfn_to_memslot(slots, gfn);
809 /* the non-leaf shadow pages are kept read-only. */
810 if (sp->role.level > PG_LEVEL_4K)
811 return kvm_slot_page_track_add_page(kvm, slot, gfn,
812 KVM_PAGE_TRACK_WRITE);
814 kvm_mmu_gfn_disallow_lpage(slot, gfn);
817 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
819 if (sp->lpage_disallowed)
822 ++kvm->stat.nx_lpage_splits;
823 list_add_tail(&sp->lpage_disallowed_link,
824 &kvm->arch.lpage_disallowed_mmu_pages);
825 sp->lpage_disallowed = true;
828 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
830 struct kvm_memslots *slots;
831 struct kvm_memory_slot *slot;
834 kvm->arch.indirect_shadow_pages--;
836 slots = kvm_memslots_for_spte_role(kvm, sp->role);
837 slot = __gfn_to_memslot(slots, gfn);
838 if (sp->role.level > PG_LEVEL_4K)
839 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
840 KVM_PAGE_TRACK_WRITE);
842 kvm_mmu_gfn_allow_lpage(slot, gfn);
845 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
847 --kvm->stat.nx_lpage_splits;
848 sp->lpage_disallowed = false;
849 list_del(&sp->lpage_disallowed_link);
852 static struct kvm_memory_slot *
853 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
856 struct kvm_memory_slot *slot;
858 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
859 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
861 if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
868 * About rmap_head encoding:
870 * If the bit zero of rmap_head->val is clear, then it points to the only spte
871 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
872 * pte_list_desc containing more mappings.
876 * Returns the number of pointers in the rmap chain, not counting the new one.
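 * For example, the first spte mapping a gfn is stored directly in
 * rmap_head->val; adding a second spte allocates a pte_list_desc, moves the
 * existing pointer into sptes[0], stores the new spte in sptes[1], and sets
 * bit zero of rmap_head->val to mark it as a descriptor pointer.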
878 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
879 struct kvm_rmap_head *rmap_head)
881 struct pte_list_desc *desc;
884 if (!rmap_head->val) {
885 rmap_printk("%p %llx 0->1\n", spte, *spte);
886 rmap_head->val = (unsigned long)spte;
887 } else if (!(rmap_head->val & 1)) {
888 rmap_printk("%p %llx 1->many\n", spte, *spte);
889 desc = mmu_alloc_pte_list_desc(vcpu);
890 desc->sptes[0] = (u64 *)rmap_head->val;
891 desc->sptes[1] = spte;
892 desc->spte_count = 2;
893 rmap_head->val = (unsigned long)desc | 1;
896 rmap_printk("%p %llx many->many\n", spte, *spte);
897 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
898 while (desc->spte_count == PTE_LIST_EXT) {
899 count += PTE_LIST_EXT;
901 desc->more = mmu_alloc_pte_list_desc(vcpu);
903 desc->spte_count = 0;
908 count += desc->spte_count;
909 desc->sptes[desc->spte_count++] = spte;
915 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
916 struct pte_list_desc *desc, int i,
917 struct pte_list_desc *prev_desc)
919 int j = desc->spte_count - 1;
921 desc->sptes[i] = desc->sptes[j];
922 desc->sptes[j] = NULL;
924 if (desc->spte_count)
926 if (!prev_desc && !desc->more)
930 prev_desc->more = desc->more;
932 rmap_head->val = (unsigned long)desc->more | 1;
933 mmu_free_pte_list_desc(desc);
936 static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
938 struct pte_list_desc *desc;
939 struct pte_list_desc *prev_desc;
942 if (!rmap_head->val) {
943 pr_err("%s: %p 0->BUG\n", __func__, spte);
945 } else if (!(rmap_head->val & 1)) {
946 rmap_printk("%p 1->0\n", spte);
947 if ((u64 *)rmap_head->val != spte) {
948 pr_err("%s: %p 1->BUG\n", __func__, spte);
953 rmap_printk("%p many->many\n", spte);
954 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
957 for (i = 0; i < desc->spte_count; ++i) {
958 if (desc->sptes[i] == spte) {
959 pte_list_desc_remove_entry(rmap_head,
967 pr_err("%s: %p many->many\n", __func__, spte);
972 static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
975 mmu_spte_clear_track_bits(kvm, sptep);
976 __pte_list_remove(sptep, rmap_head);
979 /* Return true if rmap existed, false otherwise */
980 static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
982 struct pte_list_desc *desc, *next;
988 if (!(rmap_head->val & 1)) {
989 mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
993 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
995 for (; desc; desc = next) {
996 for (i = 0; i < desc->spte_count; i++)
997 mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
999 mmu_free_pte_list_desc(desc);
1002 /* rmap_head is meaningless now, remember to reset it */
1007 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
1009 struct pte_list_desc *desc;
1010 unsigned int count = 0;
1012 if (!rmap_head->val)
1014 else if (!(rmap_head->val & 1))
1017 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1020 count += desc->spte_count;
1027 static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
1028 const struct kvm_memory_slot *slot)
1032 idx = gfn_to_index(gfn, slot->base_gfn, level);
1033 return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
1036 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1038 struct kvm_mmu_memory_cache *mc;
1040 mc = &vcpu->arch.mmu_pte_list_desc_cache;
1041 return kvm_mmu_memory_cache_nr_free_objects(mc);
1044 static void rmap_remove(struct kvm *kvm, u64 *spte)
1046 struct kvm_memslots *slots;
1047 struct kvm_memory_slot *slot;
1048 struct kvm_mmu_page *sp;
1050 struct kvm_rmap_head *rmap_head;
1052 sp = sptep_to_sp(spte);
1053 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1056 * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
1057 * so we have to determine which memslots to use based on context
1058 * information in sp->role.
1060 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1062 slot = __gfn_to_memslot(slots, gfn);
1063 rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1065 __pte_list_remove(spte, rmap_head);
1069 * Used by the following functions to iterate through the sptes linked by a
1070 * rmap. All fields are private and are not meant to be used outside.
1072 struct rmap_iterator {
1073 /* private fields */
1074 struct pte_list_desc *desc; /* holds the sptep if not NULL */
1075 int pos; /* index of the sptep */
1079 * Iteration must be started by this function. This should also be used after
1080 * removing/dropping sptes from the rmap link because in such cases the
1081 * information in the iterator may not be valid.
1083 * Returns sptep if found, NULL otherwise.
1085 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1086 struct rmap_iterator *iter)
1090 if (!rmap_head->val)
1093 if (!(rmap_head->val & 1)) {
1095 sptep = (u64 *)rmap_head->val;
1099 iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1101 sptep = iter->desc->sptes[iter->pos];
1103 BUG_ON(!is_shadow_present_pte(*sptep));
1108 * Must be used with a valid iterator: e.g. after rmap_get_first().
1110 * Returns sptep if found, NULL otherwise.
1112 static u64 *rmap_get_next(struct rmap_iterator *iter)
1117 if (iter->pos < PTE_LIST_EXT - 1) {
1119 sptep = iter->desc->sptes[iter->pos];
1124 iter->desc = iter->desc->more;
1128 /* desc->sptes[0] cannot be NULL */
1129 sptep = iter->desc->sptes[iter->pos];
1136 BUG_ON(!is_shadow_present_pte(*sptep));
1140 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
1141 for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
1142 _spte_; _spte_ = rmap_get_next(_iter_))
1144 static void drop_spte(struct kvm *kvm, u64 *sptep)
1146 u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
1148 if (is_shadow_present_pte(old_spte))
1149 rmap_remove(kvm, sptep);
1153 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1155 if (is_large_pte(*sptep)) {
1156 WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
1157 drop_spte(kvm, sptep);
1164 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1166 if (__drop_large_spte(vcpu->kvm, sptep)) {
1167 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
1169 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1170 KVM_PAGES_PER_HPAGE(sp->role.level));
1175 * Write-protect the specified @sptep; @pt_protect indicates whether the
1176 * spte write-protection is caused by protecting the shadow page table.
1178 * Note: write protection differs between dirty logging and spte protection:
1180 * - for dirty logging, the spte can be made writable at any time if
1181 *   its dirty bitmap is properly set.
1182 * - for spte protection, the spte can be made writable only after unsync-ing the shadow page.
1185 * Return true if the TLB needs to be flushed.
1187 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1191 if (!is_writable_pte(spte) &&
1192 !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1195 rmap_printk("spte %p %llx\n", sptep, *sptep);
1198 spte &= ~shadow_mmu_writable_mask;
1199 spte = spte & ~PT_WRITABLE_MASK;
1201 return mmu_spte_update(sptep, spte);
1204 static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
1208 struct rmap_iterator iter;
1211 for_each_rmap_spte(rmap_head, &iter, sptep)
1212 flush |= spte_write_protect(sptep, pt_protect);
1217 static bool spte_clear_dirty(u64 *sptep)
1221 rmap_printk("spte %p %llx\n", sptep, *sptep);
1223 MMU_WARN_ON(!spte_ad_enabled(spte));
1224 spte &= ~shadow_dirty_mask;
1225 return mmu_spte_update(sptep, spte);
1228 static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1230 bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1231 (unsigned long *)sptep);
1232 if (was_writable && !spte_ad_enabled(*sptep))
1233 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1235 return was_writable;
1239 * Gets the GFN ready for another round of dirty logging by clearing the
1240 * - D bit on ad-enabled SPTEs, and
1241 * - W bit on ad-disabled SPTEs.
1242 * Returns true iff any D or W bits were cleared.
1244 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1245 const struct kvm_memory_slot *slot)
1248 struct rmap_iterator iter;
1251 for_each_rmap_spte(rmap_head, &iter, sptep)
1252 if (spte_ad_need_write_protect(*sptep))
1253 flush |= spte_wrprot_for_clear_dirty(sptep);
1255 flush |= spte_clear_dirty(sptep);
1261 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1262 * @kvm: kvm instance
1263 * @slot: slot to protect
1264 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1265 * @mask: indicates which pages we should protect
1267 * Used when we do not need to care about huge page mappings.
1269 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1270 struct kvm_memory_slot *slot,
1271 gfn_t gfn_offset, unsigned long mask)
1273 struct kvm_rmap_head *rmap_head;
1275 if (is_tdp_mmu_enabled(kvm))
1276 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1277 slot->base_gfn + gfn_offset, mask, true);
1279 if (!kvm_memslots_have_rmaps(kvm))
1283 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1285 rmap_write_protect(rmap_head, false);
1287 /* clear the first set bit */
1293 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1294 * protect the page if the D-bit isn't supported.
1295 * @kvm: kvm instance
1296 * @slot: slot to clear D-bit
1297 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1298 * @mask: indicates which pages we should clear D-bit
1300 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1302 static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1303 struct kvm_memory_slot *slot,
1304 gfn_t gfn_offset, unsigned long mask)
1306 struct kvm_rmap_head *rmap_head;
1308 if (is_tdp_mmu_enabled(kvm))
1309 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1310 slot->base_gfn + gfn_offset, mask, false);
1312 if (!kvm_memslots_have_rmaps(kvm))
1316 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1318 __rmap_clear_dirty(kvm, rmap_head, slot);
1320 /* clear the first set bit */
1326 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1329 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1330 * enable dirty logging for them.
1332 * We need to care about huge page mappings: e.g. during dirty logging we may
1333 * have such mappings.
1335 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1336 struct kvm_memory_slot *slot,
1337 gfn_t gfn_offset, unsigned long mask)
1340 * Huge pages are NOT write protected when we start dirty logging in
1341 * initially-all-set mode; must write protect them here so that they
1342 * are split to 4K on the first write.
1344 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
1345 * of memslot has no such restriction, so the range can cross two large
1348 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
1349 gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
1350 gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
1352 if (READ_ONCE(eager_page_split))
1353 kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
1355 kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
1357 /* Cross two large pages? */
1358 if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
1359 ALIGN(end << PAGE_SHIFT, PMD_SIZE))
1360 kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
1364 /* Now handle 4K PTEs. */
1365 if (kvm_x86_ops.cpu_dirty_log_size)
1366 kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
1368 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1371 int kvm_cpu_dirty_log_size(void)
1373 return kvm_x86_ops.cpu_dirty_log_size;
1376 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1377 struct kvm_memory_slot *slot, u64 gfn,
1380 struct kvm_rmap_head *rmap_head;
1382 bool write_protected = false;
1384 if (kvm_memslots_have_rmaps(kvm)) {
1385 for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
1386 rmap_head = gfn_to_rmap(gfn, i, slot);
1387 write_protected |= rmap_write_protect(rmap_head, true);
1391 if (is_tdp_mmu_enabled(kvm))
1393 kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
1395 return write_protected;
1398 static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
1400 struct kvm_memory_slot *slot;
1402 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1403 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
1406 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1407 const struct kvm_memory_slot *slot)
1409 return pte_list_destroy(kvm, rmap_head);
1412 static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1413 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1416 return kvm_zap_rmapp(kvm, rmap_head, slot);
1419 static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1420 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1424 struct rmap_iterator iter;
1425 bool need_flush = false;
1429 WARN_ON(pte_huge(pte));
1430 new_pfn = pte_pfn(pte);
1433 for_each_rmap_spte(rmap_head, &iter, sptep) {
1434 rmap_printk("spte %p %llx gfn %llx (%d)\n",
1435 sptep, *sptep, gfn, level);
1439 if (pte_write(pte)) {
1440 pte_list_remove(kvm, rmap_head, sptep);
1443 new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1446 mmu_spte_clear_track_bits(kvm, sptep);
1447 mmu_spte_set(sptep, new_spte);
1451 if (need_flush && kvm_available_flush_tlb_with_range()) {
1452 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1459 struct slot_rmap_walk_iterator {
1461 const struct kvm_memory_slot *slot;
1467 /* output fields. */
1469 struct kvm_rmap_head *rmap;
1472 /* private field. */
1473 struct kvm_rmap_head *end_rmap;
1477 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1479 iterator->level = level;
1480 iterator->gfn = iterator->start_gfn;
1481 iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
1482 iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
1486 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1487 const struct kvm_memory_slot *slot, int start_level,
1488 int end_level, gfn_t start_gfn, gfn_t end_gfn)
1490 iterator->slot = slot;
1491 iterator->start_level = start_level;
1492 iterator->end_level = end_level;
1493 iterator->start_gfn = start_gfn;
1494 iterator->end_gfn = end_gfn;
1496 rmap_walk_init_level(iterator, iterator->start_level);
1499 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1501 return !!iterator->rmap;
1504 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1506 if (++iterator->rmap <= iterator->end_rmap) {
1507 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1511 if (++iterator->level > iterator->end_level) {
1512 iterator->rmap = NULL;
1516 rmap_walk_init_level(iterator, iterator->level);
1519 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
1520 _start_gfn, _end_gfn, _iter_) \
1521 for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
1522 _end_level_, _start_gfn, _end_gfn); \
1523 slot_rmap_walk_okay(_iter_); \
1524 slot_rmap_walk_next(_iter_))
1526 typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1527 struct kvm_memory_slot *slot, gfn_t gfn,
1528 int level, pte_t pte);
1530 static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
1531 struct kvm_gfn_range *range,
1532 rmap_handler_t handler)
1534 struct slot_rmap_walk_iterator iterator;
1537 for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
1538 range->start, range->end - 1, &iterator)
1539 ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
1540 iterator.level, range->pte);
1545 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1549 if (kvm_memslots_have_rmaps(kvm))
1550 flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
1552 if (is_tdp_mmu_enabled(kvm))
1553 flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
1558 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1562 if (kvm_memslots_have_rmaps(kvm))
1563 flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
1565 if (is_tdp_mmu_enabled(kvm))
1566 flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
1571 static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1572 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1576 struct rmap_iterator iter;
1579 for_each_rmap_spte(rmap_head, &iter, sptep)
1580 young |= mmu_spte_age(sptep);
1585 static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1586 struct kvm_memory_slot *slot, gfn_t gfn,
1587 int level, pte_t unused)
1590 struct rmap_iterator iter;
1592 for_each_rmap_spte(rmap_head, &iter, sptep)
1593 if (is_accessed_spte(*sptep))
1598 #define RMAP_RECYCLE_THRESHOLD 1000
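/*
 * If a single gfn accumulates more than RMAP_RECYCLE_THRESHOLD sptes,
 * rmap_add() below zaps the whole rmap chain (and flushes remote TLBs) to
 * bound the cost of walking it.
 */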
1600 static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
1601 u64 *spte, gfn_t gfn)
1603 struct kvm_mmu_page *sp;
1604 struct kvm_rmap_head *rmap_head;
1607 sp = sptep_to_sp(spte);
1608 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1609 rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1610 rmap_count = pte_list_add(vcpu, spte, rmap_head);
1612 if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
1613 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
1614 kvm_flush_remote_tlbs_with_address(
1615 vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
1619 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1623 if (kvm_memslots_have_rmaps(kvm))
1624 young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
1626 if (is_tdp_mmu_enabled(kvm))
1627 young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
1632 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1636 if (kvm_memslots_have_rmaps(kvm))
1637 young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
1639 if (is_tdp_mmu_enabled(kvm))
1640 young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
1646 static int is_empty_shadow_page(u64 *spt)
1651 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
1652 if (is_shadow_present_pte(*pos)) {
1653 printk(KERN_ERR "%s: %p %llx\n", __func__,
1662 * This value is the sum of all of the kvm instances'
1663 * kvm->arch.n_used_mmu_pages values. We need a global,
1664 * aggregate version in order to make the slab shrinker work.
1667 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
1669 kvm->arch.n_used_mmu_pages += nr;
1670 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1673 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1675 MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
1676 hlist_del(&sp->hash_link);
1677 list_del(&sp->link);
1678 free_page((unsigned long)sp->spt);
1679 if (!sp->role.direct)
1680 free_page((unsigned long)sp->gfns);
1681 kmem_cache_free(mmu_page_header_cache, sp);
1684 static unsigned kvm_page_table_hashfn(gfn_t gfn)
1686 return hash_64(gfn, KVM_MMU_HASH_SHIFT);
1689 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1690 struct kvm_mmu_page *sp, u64 *parent_pte)
1695 pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
1698 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1701 __pte_list_remove(parent_pte, &sp->parent_ptes);
1704 static void drop_parent_pte(struct kvm_mmu_page *sp,
1707 mmu_page_remove_parent_pte(sp, parent_pte);
1708 mmu_spte_clear_no_track(parent_pte);
1711 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
1713 struct kvm_mmu_page *sp;
1715 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
1716 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
1718 sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
1719 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1722 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
1723 * depends on valid pages being added to the head of the list. See
1724 * comments in kvm_zap_obsolete_pages().
1726 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
1727 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1728 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1732 static void mark_unsync(u64 *spte);
1733 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1736 struct rmap_iterator iter;
1738 for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
1743 static void mark_unsync(u64 *spte)
1745 struct kvm_mmu_page *sp;
1748 sp = sptep_to_sp(spte);
1749 index = spte - sp->spt;
1750 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1752 if (sp->unsync_children++)
1754 kvm_mmu_mark_parents_unsync(sp);
1757 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1758 struct kvm_mmu_page *sp)
1763 #define KVM_PAGE_ARRAY_NR 16
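/*
 * A kvm_mmu_pages vector collects up to KVM_PAGE_ARRAY_NR (shadow page,
 * child index) pairs per walk; mmu_pages_add() reports when the vector is
 * full so the unsync walk can stop and let the caller process the batch.
 */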
1765 struct kvm_mmu_pages {
1766 struct mmu_page_and_offset {
1767 struct kvm_mmu_page *sp;
1769 } page[KVM_PAGE_ARRAY_NR];
1773 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1779 for (i=0; i < pvec->nr; i++)
1780 if (pvec->page[i].sp == sp)
1783 pvec->page[pvec->nr].sp = sp;
1784 pvec->page[pvec->nr].idx = idx;
1786 return (pvec->nr == KVM_PAGE_ARRAY_NR);
1789 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
1791 --sp->unsync_children;
1792 WARN_ON((int)sp->unsync_children < 0);
1793 __clear_bit(idx, sp->unsync_child_bitmap);
1796 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1797 struct kvm_mmu_pages *pvec)
1799 int i, ret, nr_unsync_leaf = 0;
1801 for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
1802 struct kvm_mmu_page *child;
1803 u64 ent = sp->spt[i];
1805 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
1806 clear_unsync_child_bit(sp, i);
1810 child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
1812 if (child->unsync_children) {
1813 if (mmu_pages_add(pvec, child, i))
1816 ret = __mmu_unsync_walk(child, pvec);
1818 clear_unsync_child_bit(sp, i);
1820 } else if (ret > 0) {
1821 nr_unsync_leaf += ret;
1824 } else if (child->unsync) {
1826 if (mmu_pages_add(pvec, child, i))
1829 clear_unsync_child_bit(sp, i);
1832 return nr_unsync_leaf;
1835 #define INVALID_INDEX (-1)
1837 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1838 struct kvm_mmu_pages *pvec)
1841 if (!sp->unsync_children)
1844 mmu_pages_add(pvec, sp, INVALID_INDEX);
1845 return __mmu_unsync_walk(sp, pvec);
1848 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1850 WARN_ON(!sp->unsync);
1851 trace_kvm_mmu_sync_page(sp);
1853 --kvm->stat.mmu_unsync;
1856 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1857 struct list_head *invalid_list);
1858 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1859 struct list_head *invalid_list);
1861 #define for_each_valid_sp(_kvm, _sp, _list) \
1862 hlist_for_each_entry(_sp, _list, hash_link) \
1863 if (is_obsolete_sp((_kvm), (_sp))) { \
1866 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
1867 for_each_valid_sp(_kvm, _sp, \
1868 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
1869 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
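/*
 * The two iterators above walk the hash bucket for a gfn: for_each_valid_sp()
 * skips obsolete pages, and for_each_gfn_indirect_valid_sp() additionally
 * skips direct pages and pages belonging to a different gfn.
 */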
1871 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1872 struct list_head *invalid_list)
1874 int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
1877 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1884 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
1885 struct list_head *invalid_list,
1888 if (!remote_flush && list_empty(invalid_list))
1891 if (!list_empty(invalid_list))
1892 kvm_mmu_commit_zap_page(kvm, invalid_list);
1894 kvm_flush_remote_tlbs(kvm);
1898 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
1900 if (sp->role.invalid)
1903 /* TDP MMU pages do not use the MMU generation. */
1904 return !sp->tdp_mmu_page &&
1905 unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
1908 struct mmu_page_path {
1909 struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
1910 unsigned int idx[PT64_ROOT_MAX_LEVEL];
1913 #define for_each_sp(pvec, sp, parents, i) \
1914 for (i = mmu_pages_first(&pvec, &parents); \
1915 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
1916 i = mmu_pages_next(&pvec, &parents, i))
1918 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1919 struct mmu_page_path *parents,
1924 for (n = i+1; n < pvec->nr; n++) {
1925 struct kvm_mmu_page *sp = pvec->page[n].sp;
1926 unsigned idx = pvec->page[n].idx;
1927 int level = sp->role.level;
1929 parents->idx[level-1] = idx;
1930 if (level == PG_LEVEL_4K)
1933 parents->parent[level-2] = sp;
1939 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
1940 struct mmu_page_path *parents)
1942 struct kvm_mmu_page *sp;
1948 WARN_ON(pvec->page[0].idx != INVALID_INDEX);
1950 sp = pvec->page[0].sp;
1951 level = sp->role.level;
1952 WARN_ON(level == PG_LEVEL_4K);
1954 parents->parent[level-2] = sp;
1956 /* Also set up a sentinel. Further entries in pvec are all
1957 * children of sp, so this element is never overwritten.
1959 parents->parent[level-1] = NULL;
1960 return mmu_pages_next(pvec, parents, 0);
1963 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1965 struct kvm_mmu_page *sp;
1966 unsigned int level = 0;
1969 unsigned int idx = parents->idx[level];
1970 sp = parents->parent[level];
1974 WARN_ON(idx == INVALID_INDEX);
1975 clear_unsync_child_bit(sp, idx);
1977 } while (!sp->unsync_children);
1980 static int mmu_sync_children(struct kvm_vcpu *vcpu,
1981 struct kvm_mmu_page *parent, bool can_yield)
1984 struct kvm_mmu_page *sp;
1985 struct mmu_page_path parents;
1986 struct kvm_mmu_pages pages;
1987 LIST_HEAD(invalid_list);
1990 while (mmu_unsync_walk(parent, &pages)) {
1991 bool protected = false;
1993 for_each_sp(pages, sp, parents, i)
1994 protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
1997 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
2001 for_each_sp(pages, sp, parents, i) {
2002 kvm_unlink_unsync_page(vcpu->kvm, sp);
2003 flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2004 mmu_pages_clear_parents(&parents);
2006 if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
2007 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
2009 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2013 cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
2018 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
2022 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2024 atomic_set(&sp->write_flooding_count, 0);
2027 static void clear_sp_write_flooding_count(u64 *spte)
2029 __clear_sp_write_flooding_count(sptep_to_sp(spte));
2032 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2037 unsigned int access)
2039 bool direct_mmu = vcpu->arch.mmu->direct_map;
2040 union kvm_mmu_page_role role;
2041 struct hlist_head *sp_list;
2043 struct kvm_mmu_page *sp;
2045 LIST_HEAD(invalid_list);
2047 role = vcpu->arch.mmu->mmu_role.base;
2049 role.direct = direct;
2050 role.access = access;
2051 if (role.has_4_byte_gpte) {
2052 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2053 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2054 role.quadrant = quadrant;
2057 sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
2058 for_each_valid_sp(vcpu->kvm, sp, sp_list) {
2059 if (sp->gfn != gfn) {
2064 if (sp->role.word != role.word) {
2066 * If the guest is creating an upper-level page, zap
2067 * unsync pages for the same gfn. While it's possible
2068 * the guest is using recursive page tables, in all
2069 * likelihood the guest has stopped using the unsync
2070 * page and is installing a completely unrelated page.
2071 * Unsync pages must not be left as is, because the new
2072 * upper-level page will be write-protected.
2074 if (level > PG_LEVEL_4K && sp->unsync)
2075 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2081 goto trace_get_page;
2085 * The page is good, but is stale. kvm_sync_page does
2086 * get the latest guest state, but (unlike mmu_unsync_children)
2087 * it doesn't write-protect the page or mark it synchronized!
2088 * This way the validity of the mapping is ensured, but the
2089 * overhead of write protection is not incurred until the
2090 * guest invalidates the TLB mapping. This allows multiple
2091 * SPs for a single gfn to be unsync.
2093 * If the sync fails, the page is zapped. If so, break
2094 * in order to rebuild it.
2096 if (!kvm_sync_page(vcpu, sp, &invalid_list))
2099 WARN_ON(!list_empty(&invalid_list));
2100 kvm_flush_remote_tlbs(vcpu->kvm);
2103 __clear_sp_write_flooding_count(sp);
2106 trace_kvm_mmu_get_page(sp, false);
2110 ++vcpu->kvm->stat.mmu_cache_miss;
2112 sp = kvm_mmu_alloc_page(vcpu, direct);
2116 hlist_add_head(&sp->hash_link, sp_list);
2118 account_shadowed(vcpu->kvm, sp);
2119 if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
2120 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2122 trace_kvm_mmu_get_page(sp, true);
2124 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2126 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2127 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2131 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2132 struct kvm_vcpu *vcpu, hpa_t root,
2135 iterator->addr = addr;
2136 iterator->shadow_addr = root;
2137 iterator->level = vcpu->arch.mmu->shadow_root_level;
2139 if (iterator->level >= PT64_ROOT_4LEVEL &&
2140 vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2141 !vcpu->arch.mmu->direct_map)
2142 iterator->level = PT32E_ROOT_LEVEL;
2144 if (iterator->level == PT32E_ROOT_LEVEL) {
2146 * prev_root is currently only used for 64-bit hosts. So only
2147 * the active root_hpa is valid here.
2149 BUG_ON(root != vcpu->arch.mmu->root.hpa);
2151 iterator->shadow_addr
2152 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2153 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2155 if (!iterator->shadow_addr)
2156 iterator->level = 0;
2160 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2161 struct kvm_vcpu *vcpu, u64 addr)
2163 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
2167 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2169 if (iterator->level < PG_LEVEL_4K)
2172 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2173 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2177 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2180 if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
2181 iterator->level = 0;
2185 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2189 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2191 __shadow_walk_next(iterator, *iterator->sptep);
2194 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2195 struct kvm_mmu_page *sp)
2199 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2201 spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
2203 mmu_spte_set(sptep, spte);
2205 mmu_page_add_parent_pte(vcpu, sp, sptep);
2207 if (sp->unsync_children || sp->unsync)
2211 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2212 unsigned direct_access)
2214 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2215 struct kvm_mmu_page *child;
2218 * For a direct sp, if the guest pte's dirty bit
2219 * changed from clean to dirty, it will corrupt the
2220 * sp's access by allowing writes through a read-only sp,
2221 * so we should update the spte at this point to get
2222 * a new sp with the correct access.
2224 child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
2225 if (child->role.access == direct_access)
2228 drop_parent_pte(child, sptep);
2229 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2233 /* Returns the number of zapped non-leaf child shadow pages. */
2234 static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2235 u64 *spte, struct list_head *invalid_list)
2238 struct kvm_mmu_page *child;
2241 if (is_shadow_present_pte(pte)) {
2242 if (is_last_spte(pte, sp->role.level)) {
2243 drop_spte(kvm, spte);
2245 child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
2246 drop_parent_pte(child, spte);
2249 * Recursively zap nested TDP SPs, parentless SPs are
2250 * unlikely to be used again in the near future. This
2251 * avoids retaining a large number of stale nested SPs.
2253 if (tdp_enabled && invalid_list &&
2254 child->role.guest_mode && !child->parent_ptes.val)
2255 return kvm_mmu_prepare_zap_page(kvm, child,
2258 } else if (is_mmio_spte(pte)) {
2259 mmu_spte_clear_no_track(spte);
2264 static int kvm_mmu_page_unlink_children(struct kvm *kvm,
2265 struct kvm_mmu_page *sp,
2266 struct list_head *invalid_list)
2271 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2272 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
2277 static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
2280 struct rmap_iterator iter;
2282 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2283 drop_parent_pte(sp, sptep);
2286 static int mmu_zap_unsync_children(struct kvm *kvm,
2287 struct kvm_mmu_page *parent,
2288 struct list_head *invalid_list)
2291 struct mmu_page_path parents;
2292 struct kvm_mmu_pages pages;
2294 if (parent->role.level == PG_LEVEL_4K)
2297 while (mmu_unsync_walk(parent, &pages)) {
2298 struct kvm_mmu_page *sp;
2300 for_each_sp(pages, sp, parents, i) {
2301 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2302 mmu_pages_clear_parents(&parents);
2310 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2311 struct kvm_mmu_page *sp,
2312 struct list_head *invalid_list,
2317 trace_kvm_mmu_prepare_zap_page(sp);
2318 ++kvm->stat.mmu_shadow_zapped;
2319 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2320 *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
2321 kvm_mmu_unlink_parents(sp);
2323 /* Zapping children means active_mmu_pages has become unstable. */
2324 list_unstable = *nr_zapped;
2326 if (!sp->role.invalid && !sp->role.direct)
2327 unaccount_shadowed(kvm, sp);
2330 kvm_unlink_unsync_page(kvm, sp);
2331 if (!sp->root_count) {
2336 * Already invalid pages (previously active roots) are not on
2337 * the active page list. See list_del() in the "else" case of
2340 if (sp->role.invalid)
2341 list_add(&sp->link, invalid_list);
2343 list_move(&sp->link, invalid_list);
2344 kvm_mod_used_mmu_pages(kvm, -1);
2347 * Remove the active root from the active page list, the root
2348 * will be explicitly freed when the root_count hits zero.
2350 list_del(&sp->link);
2353 * Obsolete pages cannot be used on any vCPUs, see the comment
2354 * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
2355 * treats invalid shadow pages as being obsolete.
2357 if (!is_obsolete_sp(kvm, sp))
2358 kvm_reload_remote_mmus(kvm);
2361 if (sp->lpage_disallowed)
2362 unaccount_huge_nx_page(kvm, sp);
2364 sp->role.invalid = 1;
2365 return list_unstable;
2368 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2369 struct list_head *invalid_list)
2373 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2377 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2378 struct list_head *invalid_list)
2380 struct kvm_mmu_page *sp, *nsp;
2382 if (list_empty(invalid_list))
2386 * We need to make sure everyone sees our modifications to
2387 * the page tables and that we see changes to vcpu->mode here. The barrier
2388 * in the kvm_flush_remote_tlbs() achieves this. This pairs
2389 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2391 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2392 * guest mode and/or lockless shadow page table walks.
2394 kvm_flush_remote_tlbs(kvm);
2396 list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2397 WARN_ON(!sp->role.invalid || sp->root_count);
2398 kvm_mmu_free_page(sp);
2402 static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
2403 unsigned long nr_to_zap)
2405 unsigned long total_zapped = 0;
2406 struct kvm_mmu_page *sp, *tmp;
2407 LIST_HEAD(invalid_list);
2411 if (list_empty(&kvm->arch.active_mmu_pages))
2415 list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
2417 * Don't zap active root pages, the page itself can't be freed
2418 * and zapping it will just force vCPUs to realloc and reload.
2423 unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
2425 total_zapped += nr_zapped;
2426 if (total_zapped >= nr_to_zap)
2433 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2435 kvm->stat.mmu_recycled += total_zapped;
2436 return total_zapped;
2439 static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
2441 if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
2442 return kvm->arch.n_max_mmu_pages -
2443 kvm->arch.n_used_mmu_pages;
2448 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2450 unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
2452 if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
2455 kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
2458 * Note, this check is intentionally soft, it only guarantees that one
2459 * page is available, while the caller may end up allocating as many as
2460 * four pages, e.g. for PAE roots or for 5-level paging. Temporarily
2461 * exceeding the (arbitrary by default) limit will not harm the host,
2462 * being too aggressive may unnecessarily kill the guest, and getting an
2463 * exact count is far more trouble than it's worth, especially in the page fault paths.
2466 if (!kvm_mmu_available_pages(vcpu->kvm))
2472 * Changing the number of mmu pages allocated to the vm.
2473 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
2475 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2477 write_lock(&kvm->mmu_lock);
2479 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2480 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
2483 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2486 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2488 write_unlock(&kvm->mmu_lock);
2491 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2493 struct kvm_mmu_page *sp;
2494 LIST_HEAD(invalid_list);
2497 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2499 write_lock(&kvm->mmu_lock);
2500 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2501 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2504 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2506 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2507 write_unlock(&kvm->mmu_lock);
2512 static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2517 if (vcpu->arch.mmu->direct_map)
2520 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2522 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2527 static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2529 trace_kvm_mmu_unsync_page(sp);
2530 ++kvm->stat.mmu_unsync;
2533 kvm_mmu_mark_parents_unsync(sp);
2537 * Attempt to unsync any shadow pages that can be reached by the specified gfn,
2538 * for which KVM is creating a writable mapping. Returns 0 if all pages
2539 * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
2540 * be write-protected.
2542 int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
2543 gfn_t gfn, bool can_unsync, bool prefetch)
2545 struct kvm_mmu_page *sp;
2546 bool locked = false;
2549 * Force write-protection if the page is being tracked. Note, the page
2550 * track machinery is used to write-protect upper-level shadow pages,
2551 * i.e. this guards the role.level == 4K assertion below!
2553 if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
2557 * The page is not write-tracked, mark existing shadow pages unsync
2558 * unless KVM is synchronizing an unsync SP (can_unsync = false). In
2559 * that case, KVM must complete emulation of the guest TLB flush before
2560 * allowing shadow pages to become unsync (writable by the guest).
2562 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2573 * TDP MMU page faults require an additional spinlock as they
2574 * run with mmu_lock held for read, not write, and the unsync
2575 * logic is not thread safe. Take the spinlock regardless of
2576 * the MMU type to avoid extra conditionals/parameters, there's
2577 * no meaningful penalty if mmu_lock is held for write.
2581 spin_lock(&kvm->arch.mmu_unsync_pages_lock);
2584 * Recheck after taking the spinlock, a different vCPU
2585 * may have since marked the page unsync. A false
2586 * positive on the unprotected check above is not
2587 * possible as clearing sp->unsync _must_ hold mmu_lock
2588 * for write, i.e. unsync cannot transition from 0->1
2589 * while this CPU holds mmu_lock for read (or write).
2591 if (READ_ONCE(sp->unsync))
2595 WARN_ON(sp->role.level != PG_LEVEL_4K);
2596 kvm_unsync_page(kvm, sp);
2599 spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
2602 * We need to ensure that the marking of unsync pages is visible
2603 * before the SPTE is updated to allow writes because
2604 * kvm_mmu_sync_roots() checks the unsync flags without holding
2605 * the MMU lock and so can race with this. If the SPTE was updated
2606 * before the page had been marked as unsync-ed, something like the
2607 * following could happen:
2610 * ---------------------------------------------------------------------
2611 * 1.2 Host updates SPTE
2613 * 2.1 Guest writes a GPTE for GVA X.
2614 * (GPTE being in the guest page table shadowed
2615 * by the SP from CPU 1.)
2616 * This reads SPTE during the page table walk.
2617 * Since SPTE.W is read as 1, there is no fault.
2620 * 2.2 Guest issues TLB flush.
2621 * That causes a VM Exit.
2623 * 2.3 Walking of unsync pages sees sp->unsync is
2624 * false and skips the page.
2626 * 2.4 Guest accesses GVA X.
2627 * Since the mapping in the SP was not updated,
2628 * the old mapping for GVA X is incorrectly used.
2632 * 1.1 Host marks the page unsync (sp->unsync = true)
2634 * The write barrier below ensures that 1.1 happens before 1.2 and thus
2635 * the situation in 2.4 does not arise. It pairs with the read barrier
2636 * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
2643 static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
2644 u64 *sptep, unsigned int pte_access, gfn_t gfn,
2645 kvm_pfn_t pfn, struct kvm_page_fault *fault)
2647 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
2648 int level = sp->role.level;
2649 int was_rmapped = 0;
2650 int ret = RET_PF_FIXED;
2655 /* Prefetching always gets a writable pfn. */
2656 bool host_writable = !fault || fault->map_writable;
2657 bool prefetch = !fault || fault->prefetch;
2658 bool write_fault = fault && fault->write;
2660 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
2661 *sptep, write_fault, gfn);
2663 if (unlikely(is_noslot_pfn(pfn))) {
2664 mark_mmio_spte(vcpu, sptep, gfn, pte_access);
2665 return RET_PF_EMULATE;
2668 if (is_shadow_present_pte(*sptep)) {
2670 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2671 * the parent of the now unreachable PTE.
2673 if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
2674 struct kvm_mmu_page *child;
2677 child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
2678 drop_parent_pte(child, sptep);
2680 } else if (pfn != spte_to_pfn(*sptep)) {
2681 pgprintk("hfn old %llx new %llx\n",
2682 spte_to_pfn(*sptep), pfn);
2683 drop_spte(vcpu->kvm, sptep);
2689 wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
2690 true, host_writable, &spte);
2692 if (*sptep == spte) {
2693 ret = RET_PF_SPURIOUS;
2695 trace_kvm_mmu_set_spte(level, gfn, sptep);
2696 flush |= mmu_spte_update(sptep, spte);
2701 ret = RET_PF_EMULATE;
2705 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
2706 KVM_PAGES_PER_HPAGE(level));
2708 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2711 WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
2712 kvm_update_page_stats(vcpu->kvm, level, 1);
2713 rmap_add(vcpu, slot, sptep, gfn);
2719 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2720 struct kvm_mmu_page *sp,
2721 u64 *start, u64 *end)
2723 struct page *pages[PTE_PREFETCH_NUM];
2724 struct kvm_memory_slot *slot;
2725 unsigned int access = sp->role.access;
2729 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2730 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
2734 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
2738 for (i = 0; i < ret; i++, gfn++, start++) {
2739 mmu_set_spte(vcpu, slot, start, access, gfn,
2740 page_to_pfn(pages[i]), NULL);
2747 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2748 struct kvm_mmu_page *sp, u64 *sptep)
2750 u64 *spte, *start = NULL;
2753 WARN_ON(!sp->role.direct);
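/*
 * Round the index of sptep within sp->spt down to a PTE_PREFETCH_NUM-aligned
 * boundary, so the loop below scans the whole 8-entry window that contains
 * the faulting spte.
 */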
2755 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2758 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2759 if (is_shadow_present_pte(*spte) || spte == sptep) {
2762 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2769 direct_pte_prefetch_many(vcpu, sp, start, spte);
2772 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2774 struct kvm_mmu_page *sp;
2776 sp = sptep_to_sp(sptep);
2779 * Without accessed bits, there's no way to distinguish between
2780 * actually accessed translations and prefetched, so disable pte
2781 * prefetch if accessed bits aren't available.
2783 if (sp_ad_disabled(sp))
2786 if (sp->role.level > PG_LEVEL_4K)
2790 * If addresses are being invalidated, skip prefetching to avoid
2791 * accidentally prefetching those addresses.
2793 if (unlikely(vcpu->kvm->mmu_notifier_count))
2796 __direct_pte_prefetch(vcpu, sp, sptep);
2799 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
2800 const struct kvm_memory_slot *slot)
2806 if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
2810 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
2811 * is not solely for performance, it's also necessary to avoid the
2812 * "writable" check in __gfn_to_hva_many(), which will always fail on
2813 * read-only memslots due to gfn_to_hva() assuming writes. Earlier
2814 * page fault steps have already verified the guest isn't writing a
2815 * read-only memslot.
2817 hva = __gfn_to_hva_memslot(slot, gfn);
2819 pte = lookup_address_in_mm(kvm->mm, hva, &level);
2826 int kvm_mmu_max_mapping_level(struct kvm *kvm,
2827 const struct kvm_memory_slot *slot, gfn_t gfn,
2828 kvm_pfn_t pfn, int max_level)
2830 struct kvm_lpage_info *linfo;
2833 max_level = min(max_level, max_huge_page_level);
2834 for ( ; max_level > PG_LEVEL_4K; max_level--) {
2835 linfo = lpage_info_slot(gfn, slot, max_level);
2836 if (!linfo->disallow_lpage)
2840 if (max_level == PG_LEVEL_4K)
2843 host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
2844 return min(host_level, max_level);
2847 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
2849 struct kvm_memory_slot *slot = fault->slot;
2852 fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
2854 if (unlikely(fault->max_level == PG_LEVEL_4K))
2857 if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
2860 if (kvm_slot_dirty_track_enabled(slot))
2864 * Enforce the iTLB multihit workaround after capturing the requested
2865 * level, which will be used to do precise, accurate accounting.
2867 fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
2868 fault->gfn, fault->pfn,
2870 if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
2874 * mmu_notifier_retry() was successful and mmu_lock is held, so
2875 * the pmd can't be split from under us.
2877 fault->goal_level = fault->req_level;
2878 mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
2879 VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
2880 fault->pfn &= ~mask;
2883 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
2885 if (cur_level > PG_LEVEL_4K &&
2886 cur_level == fault->goal_level &&
2887 is_shadow_present_pte(spte) &&
2888 !is_large_pte(spte)) {
2890 * A small SPTE exists for this pfn, but FNAME(fetch)
2891 * and __direct_map would like to create a large PTE
2892 * instead: just force them to go down another level,
2893 * patching the next 9 bits of the address back into pfn.
2896 u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
2897 KVM_PAGES_PER_HPAGE(cur_level - 1);
2898 fault->pfn |= fault->gfn & page_mask;
2899 fault->goal_level--;
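/*
 * Worked example for the adjustment above (illustrative, not from the
 * original source): if cur_level is PG_LEVEL_2M, then
 * page_mask == KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) -
 * KVM_PAGES_PER_HPAGE(PG_LEVEL_4K) == 512 - 1 == 0x1ff, so the low nine bits
 * of fault->gfn are folded into fault->pfn before the goal level drops from
 * 2M down to 4K.
 */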
2903 static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
2905 struct kvm_shadow_walk_iterator it;
2906 struct kvm_mmu_page *sp;
2908 gfn_t base_gfn = fault->gfn;
2910 kvm_mmu_hugepage_adjust(vcpu, fault);
2912 trace_kvm_mmu_spte_requested(fault);
2913 for_each_shadow_entry(vcpu, fault->addr, it) {
2915 * We cannot overwrite existing page tables with an NX
2916 * large page, as the leaf could be executable.
2918 if (fault->nx_huge_page_workaround_enabled)
2919 disallowed_hugepage_adjust(fault, *it.sptep, it.level);
2921 base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
2922 if (it.level == fault->goal_level)
2925 drop_large_spte(vcpu, it.sptep);
2926 if (is_shadow_present_pte(*it.sptep))
2929 sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
2930 it.level - 1, true, ACC_ALL);
2932 link_shadow_page(vcpu, it.sptep, sp);
2933 if (fault->is_tdp && fault->huge_page_disallowed &&
2934 fault->req_level >= it.level)
2935 account_huge_nx_page(vcpu->kvm, sp);
2938 if (WARN_ON_ONCE(it.level != fault->goal_level))
2941 ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
2942 base_gfn, fault->pfn, fault);
2943 if (ret == RET_PF_SPURIOUS)
2946 direct_pte_prefetch(vcpu, it.sptep);
2947 ++vcpu->stat.pf_fixed;
2951 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2953 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
2956 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
2959 * Do not cache the mmio info caused by writing the readonly gfn
2960 * into the spte, otherwise a read access on the readonly gfn can
2961 * also cause an mmio page fault and be treated as an mmio access.
2963 if (pfn == KVM_PFN_ERR_RO_FAULT)
2964 return RET_PF_EMULATE;
2966 if (pfn == KVM_PFN_ERR_HWPOISON) {
2967 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
2968 return RET_PF_RETRY;
2974 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
2975 unsigned int access, int *ret_val)
2977 /* The pfn is invalid, report the error! */
2978 if (unlikely(is_error_pfn(fault->pfn))) {
2979 *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
2983 if (unlikely(!fault->slot)) {
2984 gva_t gva = fault->is_tdp ? 0 : fault->addr;
2986 vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
2987 access & shadow_mmio_access_mask);
2989 * If MMIO caching is disabled, emulate immediately without
2990 * touching the shadow page tables as attempting to install an
2991 * MMIO SPTE will just be an expensive nop.
2993 if (unlikely(!shadow_mmio_value)) {
2994 *ret_val = RET_PF_EMULATE;
3002 static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
3005 * Do not fix the mmio spte with invalid generation number which
3006 * need to be updated by slow page fault path.
3011 /* See if the page fault is due to an NX violation */
3012 if (unlikely(fault->exec && fault->present))
3016 * #PF can be fast if:
3017 * 1. The shadow page table entry is not present, which could mean that
3018 * the fault is potentially caused by access tracking (if enabled).
3019 * 2. The shadow page table entry is present and the fault
3020 * is caused by write-protect, which means we just need to change the W
3021 * bit of the spte, which can be done outside of mmu_lock.
3023 * However, if access tracking is disabled we know that a non-present
3024 * page must be a genuine page fault where we have to create a new SPTE.
3025 * So, if access tracking is disabled, we return true only for write
3026 * accesses to a present page.
3029 return shadow_acc_track_mask != 0 || (fault->write && fault->present);
3033 * Returns true if the SPTE was fixed successfully. Otherwise,
3034 * someone else modified the SPTE from its original value.
3037 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3038 u64 *sptep, u64 old_spte, u64 new_spte)
3041 * Theoretically we could also set dirty bit (and flush TLB) here in
3042 * order to eliminate unnecessary PML logging. See comments in
3043 * set_spte. But fast_page_fault is very unlikely to happen with PML
3044 * enabled, so we do not do this. This might result in the same GPA
3045 * being logged in the PML buffer again when the write really happens, and
3046 * mark_page_dirty eventually being called twice for it. But that is also
3047 * harmless. This also avoids the TLB flush needed after setting the dirty
3048 * bit, so non-PML cases won't be impacted.
3050 * Compare with set_spte where instead shadow_dirty_mask is set.
3052 if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3055 if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
3056 mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
3061 static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
3064 return is_executable_pte(spte);
3067 return is_writable_pte(spte);
3069 /* Fault was on Read access */
3070 return spte & PT_PRESENT_MASK;
3074 * Returns the last level spte pointer of the shadow page walk for the given
3075 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
3076 * walk could be performed, returns NULL and *spte does not contain valid data.
3079 * - Must be called between walk_shadow_page_lockless_{begin,end}.
3080 * - The returned sptep must not be used after walk_shadow_page_lockless_end.
3082 static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
3084 struct kvm_shadow_walk_iterator iterator;
3088 for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
3089 sptep = iterator.sptep;
3097 * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
3099 static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3101 struct kvm_mmu_page *sp;
3102 int ret = RET_PF_INVALID;
3105 uint retry_count = 0;
3107 if (!page_fault_can_be_fast(fault))
3110 walk_shadow_page_lockless_begin(vcpu);
3115 if (is_tdp_mmu(vcpu->arch.mmu))
3116 sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3118 sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3120 if (!is_shadow_present_pte(spte))
3123 sp = sptep_to_sp(sptep);
3124 if (!is_last_spte(spte, sp->role.level))
3128 * Check whether the memory access that caused the fault would
3129 * still cause it if it were to be performed right now. If not,
3130 * then this is a spurious fault caused by TLB lazily flushed,
3131 * or some other CPU has already fixed the PTE after the
3132 * current CPU took the fault.
3134 * Need not check the access of upper level table entries since
3135 * they are always ACC_ALL.
3137 if (is_access_allowed(fault, spte)) {
3138 ret = RET_PF_SPURIOUS;
3144 if (is_access_track_spte(spte))
3145 new_spte = restore_acc_track_spte(new_spte);
3148 * Currently, to simplify the code, write-protection can
3149 * be removed in the fast path only if the SPTE was
3150 * write-protected for dirty-logging or access tracking.
3153 spte_can_locklessly_be_made_writable(spte)) {
3154 new_spte |= PT_WRITABLE_MASK;
3157 * Do not fix write-permission on the large spte when
3158 * dirty logging is enabled. Since we only dirty the
3159 * first page into the dirty-bitmap in
3160 * fast_pf_fix_direct_spte(), other pages would be missed
3161 * if the slot has dirty logging enabled.
3163 * Instead, we let the slow page fault path create a
3164 * normal spte to fix the access.
3166 if (sp->role.level > PG_LEVEL_4K &&
3167 kvm_slot_dirty_track_enabled(fault->slot))
3171 /* Verify that the fault can be handled in the fast path */
3172 if (new_spte == spte ||
3173 !is_access_allowed(fault, new_spte))
3177 * Currently, fast page fault only works for direct mapping
3178 * since the gfn is not stable for indirect shadow page. See
3179 * Documentation/virt/kvm/locking.rst to get more detail.
3181 if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
3186 if (++retry_count > 4) {
3187 printk_once(KERN_WARNING
3188 "kvm: Fast #PF retrying more than 4 times.\n");
3194 trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
3195 walk_shadow_page_lockless_end(vcpu);
3200 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3201 struct list_head *invalid_list)
3203 struct kvm_mmu_page *sp;
3205 if (!VALID_PAGE(*root_hpa))
3208 sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
3212 if (is_tdp_mmu_page(sp))
3213 kvm_tdp_mmu_put_root(kvm, sp, false);
3214 else if (!--sp->root_count && sp->role.invalid)
3215 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3217 *root_hpa = INVALID_PAGE;
3220 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3221 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
3222 ulong roots_to_free)
3225 LIST_HEAD(invalid_list);
3226 bool free_active_root;
3228 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3230 /* Before acquiring the MMU lock, see if we need to do any real work. */
3231 free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
3232 && VALID_PAGE(mmu->root.hpa);
3234 if (!free_active_root) {
3235 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3236 if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3237 VALID_PAGE(mmu->prev_roots[i].hpa))
3240 if (i == KVM_MMU_NUM_PREV_ROOTS)
3244 write_lock(&kvm->mmu_lock);
3246 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3247 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3248 mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
3251 if (free_active_root) {
3252 if (to_shadow_page(mmu->root.hpa)) {
3253 mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
3254 } else if (mmu->pae_root) {
3255 for (i = 0; i < 4; ++i) {
3256 if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
3259 mmu_free_root_page(kvm, &mmu->pae_root[i],
3261 mmu->pae_root[i] = INVALID_PAE_ROOT;
3264 mmu->root.hpa = INVALID_PAGE;
3268 kvm_mmu_commit_zap_page(kvm, &invalid_list);
3269 write_unlock(&kvm->mmu_lock);
3271 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3273 void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
3275 unsigned long roots_to_free = 0;
3280 * This should not be called while L2 is active, L2 can't invalidate
3281 * _only_ its own roots, e.g. INVVPID unconditionally exits.
3283 WARN_ON_ONCE(mmu->mmu_role.base.guest_mode);
3285 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
3286 root_hpa = mmu->prev_roots[i].hpa;
3287 if (!VALID_PAGE(root_hpa))
3290 if (!to_shadow_page(root_hpa) ||
3291 to_shadow_page(root_hpa)->role.guest_mode)
3292 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3295 kvm_mmu_free_roots(kvm, mmu, roots_to_free);
3297 EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
3300 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3304 if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
3305 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3312 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
3313 u8 level, bool direct)
3315 struct kvm_mmu_page *sp;
3317 sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
3320 return __pa(sp->spt);
3323 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3325 struct kvm_mmu *mmu = vcpu->arch.mmu;
3326 u8 shadow_root_level = mmu->shadow_root_level;
3331 write_lock(&vcpu->kvm->mmu_lock);
3332 r = make_mmu_pages_available(vcpu);
3336 if (is_tdp_mmu_enabled(vcpu->kvm)) {
3337 root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
3338 mmu->root.hpa = root;
3339 } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
3340 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
3341 mmu->root.hpa = root;
3342 } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
3343 if (WARN_ON_ONCE(!mmu->pae_root)) {
3348 for (i = 0; i < 4; ++i) {
3349 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3351 root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
3352 i << 30, PT32_ROOT_LEVEL, true);
3353 mmu->pae_root[i] = root | PT_PRESENT_MASK |
3356 mmu->root.hpa = __pa(mmu->pae_root);
3358 WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
3363 /* root.pgd is ignored for direct MMUs. */
3366 write_unlock(&vcpu->kvm->mmu_lock);
3370 static int mmu_first_shadow_root_alloc(struct kvm *kvm)
3372 struct kvm_memslots *slots;
3373 struct kvm_memory_slot *slot;
3377 * Check if this is the first shadow root being allocated before
3380 if (kvm_shadow_root_allocated(kvm))
3383 mutex_lock(&kvm->slots_arch_lock);
3385 /* Recheck, under the lock, whether this is the first shadow root. */
3386 if (kvm_shadow_root_allocated(kvm))
3390 * Check if anything actually needs to be allocated, e.g. all metadata
3391 * will be allocated upfront if TDP is disabled.
3393 if (kvm_memslots_have_rmaps(kvm) &&
3394 kvm_page_track_write_tracking_enabled(kvm))
3397 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
3398 slots = __kvm_memslots(kvm, i);
3399 kvm_for_each_memslot(slot, bkt, slots) {
3401 * Both of these functions are no-ops if the target is
3402 * already allocated, so unconditionally calling both
3403 * is safe. Intentionally do NOT free allocations on
3404 * failure to avoid having to track which allocations
3405 * were made now versus when the memslot was created.
3406 * The metadata is guaranteed to be freed when the slot
3407 * is freed, and will be kept/used if userspace retries
3408 * KVM_RUN instead of killing the VM.
3410 r = memslot_rmap_alloc(slot, slot->npages);
3413 r = kvm_page_track_write_tracking_alloc(slot);
3420 * Ensure that shadow_root_allocated becomes true strictly after
3421 * all the related pointers are set.
3424 smp_store_release(&kvm->arch.shadow_root_allocated, true);
3427 mutex_unlock(&kvm->slots_arch_lock);
3431 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3433 struct kvm_mmu *mmu = vcpu->arch.mmu;
3434 u64 pdptrs[4], pm_mask;
3435 gfn_t root_gfn, root_pgd;
3440 root_pgd = mmu->get_guest_pgd(vcpu);
3441 root_gfn = root_pgd >> PAGE_SHIFT;
3443 if (mmu_check_root(vcpu, root_gfn))
3447 * On SVM, reading PDPTRs might access guest memory, which might fault
3448 * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
3450 if (mmu->root_level == PT32E_ROOT_LEVEL) {
3451 for (i = 0; i < 4; ++i) {
3452 pdptrs[i] = mmu->get_pdptr(vcpu, i);
3453 if (!(pdptrs[i] & PT_PRESENT_MASK))
3456 if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
3461 r = mmu_first_shadow_root_alloc(vcpu->kvm);
3465 write_lock(&vcpu->kvm->mmu_lock);
3466 r = make_mmu_pages_available(vcpu);
3471 * Do we shadow a long mode page table? If so we need to
3472 * write-protect the guests page table root.
3474 if (mmu->root_level >= PT64_ROOT_4LEVEL) {
3475 root = mmu_alloc_root(vcpu, root_gfn, 0,
3476 mmu->shadow_root_level, false);
3477 mmu->root.hpa = root;
3481 if (WARN_ON_ONCE(!mmu->pae_root)) {
3487 * We shadow a 32 bit page table. This may be a legacy 2-level
3488 * or a PAE 3-level page table. In either case we need to be aware that
3489 * the shadow page table may be a PAE or a long mode page table.
3491 pm_mask = PT_PRESENT_MASK | shadow_me_mask;
3492 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3493 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3495 if (WARN_ON_ONCE(!mmu->pml4_root)) {
3499 mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
3501 if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
3502 if (WARN_ON_ONCE(!mmu->pml5_root)) {
3506 mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
3510 for (i = 0; i < 4; ++i) {
3511 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3513 if (mmu->root_level == PT32E_ROOT_LEVEL) {
3514 if (!(pdptrs[i] & PT_PRESENT_MASK)) {
3515 mmu->pae_root[i] = INVALID_PAE_ROOT;
3518 root_gfn = pdptrs[i] >> PAGE_SHIFT;
3521 root = mmu_alloc_root(vcpu, root_gfn, i << 30,
3522 PT32_ROOT_LEVEL, false);
3523 mmu->pae_root[i] = root | pm_mask;
3526 if (mmu->shadow_root_level == PT64_ROOT_5LEVEL)
3527 mmu->root.hpa = __pa(mmu->pml5_root);
3528 else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3529 mmu->root.hpa = __pa(mmu->pml4_root);
3531 mmu->root.hpa = __pa(mmu->pae_root);
3534 mmu->root.pgd = root_pgd;
3536 write_unlock(&vcpu->kvm->mmu_lock);
3541 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
3543 struct kvm_mmu *mmu = vcpu->arch.mmu;
3544 bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL;
3545 u64 *pml5_root = NULL;
3546 u64 *pml4_root = NULL;
3550 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
3551 * tables are allocated and initialized at root creation as there is no
3552 * equivalent level in the guest's NPT to shadow. Allocate the tables
3553 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
3555 if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
3556 mmu->shadow_root_level < PT64_ROOT_4LEVEL)
3560 * NPT, the only paging mode that uses this horror, uses a fixed number
3561 * of levels for the shadow page tables, e.g. all MMUs are 4-level or
3562 * all MMUs are 5-level. Thus, this can safely require that pml5_root
3563 * is allocated if the other roots are valid and pml5 is needed, as any
3564 * prior MMU would also have required pml5.
3566 if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
3570 * The special roots should always be allocated in concert. Yell and
3571 * bail if KVM ends up in a state where only one of the roots is valid.
3573 if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
3574 (need_pml5 && mmu->pml5_root)))
3578 * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
3579 * doesn't need to be decrypted.
3581 pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3585 #ifdef CONFIG_X86_64
3586 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3591 pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3597 mmu->pae_root = pae_root;
3598 mmu->pml4_root = pml4_root;
3599 mmu->pml5_root = pml5_root;
3603 #ifdef CONFIG_X86_64
3605 free_page((unsigned long)pml4_root);
3607 free_page((unsigned long)pae_root);
3612 static bool is_unsync_root(hpa_t root)
3614 struct kvm_mmu_page *sp;
3616 if (!VALID_PAGE(root))
3620 * The read barrier orders the CPU's read of SPTE.W during the page table
3621 * walk before the reads of sp->unsync/sp->unsync_children here.
3623 * Even if another CPU was marking the SP as unsync-ed simultaneously,
3624 * any guest page table changes are not guaranteed to be visible anyway
3625 * until this VCPU issues a TLB flush strictly after those changes are
3626 * made. We only need to ensure that the other CPU sets these flags
3627 * before any actual changes to the page tables are made. The comments
3628 * in mmu_try_to_unsync_pages() describe what could go wrong if this
3629 * requirement isn't satisfied.
3632 sp = to_shadow_page(root);
3633 if (sp->unsync || sp->unsync_children)
3639 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3642 struct kvm_mmu_page *sp;
3644 if (vcpu->arch.mmu->direct_map)
3647 if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
3650 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3652 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3653 hpa_t root = vcpu->arch.mmu->root.hpa;
3654 sp = to_shadow_page(root);
3656 if (!is_unsync_root(root))
3659 write_lock(&vcpu->kvm->mmu_lock);
3660 mmu_sync_children(vcpu, sp, true);
3661 write_unlock(&vcpu->kvm->mmu_lock);
3665 write_lock(&vcpu->kvm->mmu_lock);
3667 for (i = 0; i < 4; ++i) {
3668 hpa_t root = vcpu->arch.mmu->pae_root[i];
3670 if (IS_VALID_PAE_ROOT(root)) {
3671 root &= PT64_BASE_ADDR_MASK;
3672 sp = to_shadow_page(root);
3673 mmu_sync_children(vcpu, sp, true);
3677 write_unlock(&vcpu->kvm->mmu_lock);
3680 void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
3682 unsigned long roots_to_free = 0;
3685 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3686 if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
3687 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3689 /* sync prev_roots by simply freeing them */
3690 kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
3693 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3694 gpa_t vaddr, u32 access,
3695 struct x86_exception *exception)
3698 exception->error_code = 0;
3699 return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
3702 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3705 * A nested guest cannot use the MMIO cache if it is using nested
3706 * page tables, because cr2 is an nGPA while the cache stores GPAs.
3708 if (mmu_is_nested(vcpu))
3712 return vcpu_match_mmio_gpa(vcpu, addr);
3714 return vcpu_match_mmio_gva(vcpu, addr);
3718 * Return the level of the lowest level SPTE added to sptes.
3719 * That SPTE may be non-present.
3721 * Must be called between walk_shadow_page_lockless_{begin,end}.
3723 static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
3725 struct kvm_shadow_walk_iterator iterator;
3729 for (shadow_walk_init(&iterator, vcpu, addr),
3730 *root_level = iterator.level;
3731 shadow_walk_okay(&iterator);
3732 __shadow_walk_next(&iterator, spte)) {
3733 leaf = iterator.level;
3734 spte = mmu_spte_get_lockless(iterator.sptep);
3742 /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
3743 static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3745 u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
3746 struct rsvd_bits_validate *rsvd_check;
3747 int root, leaf, level;
3748 bool reserved = false;
3750 walk_shadow_page_lockless_begin(vcpu);
3752 if (is_tdp_mmu(vcpu->arch.mmu))
3753 leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
3755 leaf = get_walk(vcpu, addr, sptes, &root);
3757 walk_shadow_page_lockless_end(vcpu);
3759 if (unlikely(leaf < 0)) {
3764 *sptep = sptes[leaf];
3767 * Skip reserved bits checks on the terminal leaf if it's not a valid
3768 * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by
3769 * design, always have reserved bits set. The purpose of the checks is
3770 * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
3772 if (!is_shadow_present_pte(sptes[leaf]))
3775 rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
3777 for (level = root; level >= leaf; level--)
3778 reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
3781 pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
3783 for (level = root; level >= leaf; level--)
3784 pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
3785 sptes[level], level,
3786 get_rsvd_bits(rsvd_check, sptes[level], level));
3792 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3797 if (mmio_info_in_cache(vcpu, addr, direct))
3798 return RET_PF_EMULATE;
3800 reserved = get_mmio_spte(vcpu, addr, &spte);
3801 if (WARN_ON(reserved))
3804 if (is_mmio_spte(spte)) {
3805 gfn_t gfn = get_mmio_spte_gfn(spte);
3806 unsigned int access = get_mmio_spte_access(spte);
3808 if (!check_mmio_spte(vcpu, spte))
3809 return RET_PF_INVALID;
3814 trace_handle_mmio_page_fault(addr, gfn, access);
3815 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3816 return RET_PF_EMULATE;
3820 * If the page table is zapped by other cpus, let the CPU fault again on the address.
3823 return RET_PF_RETRY;
3826 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
3827 struct kvm_page_fault *fault)
3829 if (unlikely(fault->rsvd))
3832 if (!fault->present || !fault->write)
3836 * The guest is writing a page that is write-tracked, which cannot
3837 * be fixed by the page fault handler.
3839 if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
3845 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
3847 struct kvm_shadow_walk_iterator iterator;
3850 walk_shadow_page_lockless_begin(vcpu);
3851 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
3852 clear_sp_write_flooding_count(iterator.sptep);
3853 walk_shadow_page_lockless_end(vcpu);
3856 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
3859 struct kvm_arch_async_pf arch;
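/*
 * The token below packs the incrementing per-vCPU async #PF id in the upper
 * bits and the vCPU id in the low 12 bits, so the "page ready" notification
 * can later be matched back to the originating vCPU and request.
 */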
3861 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
3863 arch.direct_map = vcpu->arch.mmu->direct_map;
3864 arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
3866 return kvm_setup_async_pf(vcpu, cr2_or_gpa,
3867 kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
3870 static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r)
3872 struct kvm_memory_slot *slot = fault->slot;
3876 * Retry the page fault if the gfn hit a memslot that is being deleted
3877 * or moved. This ensures any existing SPTEs for the old memslot will
3878 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
3880 if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
3883 if (!kvm_is_visible_memslot(slot)) {
3884 /* Don't expose private memslots to L2. */
3885 if (is_guest_mode(vcpu)) {
3887 fault->pfn = KVM_PFN_NOSLOT;
3888 fault->map_writable = false;
3892 * If the APIC access page exists but is disabled, go directly
3893 * to emulation without caching the MMIO access or creating a
3894 * MMIO SPTE. That way the cache doesn't need to be purged
3895 * when the AVIC is re-enabled.
3897 if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
3898 !kvm_apicv_activated(vcpu->kvm)) {
3899 *r = RET_PF_EMULATE;
3905 fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
3906 fault->write, &fault->map_writable,
3909 return false; /* *pfn has correct page already */
3911 if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
3912 trace_kvm_try_async_get_page(fault->addr, fault->gfn);
3913 if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
3914 trace_kvm_async_pf_doublefault(fault->addr, fault->gfn);
3915 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
3917 } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn))
3921 fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
3922 fault->write, &fault->map_writable,
3932 * Returns true if the page fault is stale and needs to be retried, i.e. if the
3933 * root was invalidated by a memslot update or a relevant mmu_notifier fired.
3935 static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
3936 struct kvm_page_fault *fault, int mmu_seq)
3938 struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
3940 /* Special roots, e.g. pae_root, are not backed by shadow pages. */
3941 if (sp && is_obsolete_sp(vcpu->kvm, sp))
3945 * Roots without an associated shadow page are considered invalid if
3946 * there is a pending request to free obsolete roots. The request is
3947 * only a hint that the current root _may_ be obsolete and needs to be
3948 * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
3949 * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
3950 * to reload even if no vCPU is actively using the root.
3952 if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu))
3955 return fault->slot &&
3956 mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
3959 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3961 bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
3963 unsigned long mmu_seq;
3966 fault->gfn = fault->addr >> PAGE_SHIFT;
3967 fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
3969 if (page_fault_handle_page_track(vcpu, fault))
3970 return RET_PF_EMULATE;
3972 r = fast_page_fault(vcpu, fault);
3973 if (r != RET_PF_INVALID)
3976 r = mmu_topup_memory_caches(vcpu, false);
3980 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3983 if (kvm_faultin_pfn(vcpu, fault, &r))
3986 if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r))
3991 if (is_tdp_mmu_fault)
3992 read_lock(&vcpu->kvm->mmu_lock);
3994 write_lock(&vcpu->kvm->mmu_lock);
3996 if (is_page_fault_stale(vcpu, fault, mmu_seq))
3999 r = make_mmu_pages_available(vcpu);
4003 if (is_tdp_mmu_fault)
4004 r = kvm_tdp_mmu_map(vcpu, fault);
4006 r = __direct_map(vcpu, fault);
4009 if (is_tdp_mmu_fault)
4010 read_unlock(&vcpu->kvm->mmu_lock);
4012 write_unlock(&vcpu->kvm->mmu_lock);
4013 kvm_release_pfn_clean(fault->pfn);
4017 static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
4018 struct kvm_page_fault *fault)
4020 pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
4022 /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
4023 fault->max_level = PG_LEVEL_2M;
4024 return direct_page_fault(vcpu, fault);
4027 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4028 u64 fault_address, char *insn, int insn_len)
4031 u32 flags = vcpu->arch.apf.host_apf_flags;
4033 #ifndef CONFIG_X86_64
4034 /* A 64-bit CR2 should be impossible on 32-bit KVM. */
4035 if (WARN_ON_ONCE(fault_address >> 32))
4039 vcpu->arch.l1tf_flush_l1d = true;
4041 trace_kvm_page_fault(fault_address, error_code);
4043 if (kvm_event_needs_reinjection(vcpu))
4044 kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4045 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4047 } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
4048 vcpu->arch.apf.host_apf_flags = 0;
4049 local_irq_disable();
4050 kvm_async_pf_task_wait_schedule(fault_address);
4053 WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
4058 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4060 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4062 while (fault->max_level > PG_LEVEL_4K) {
4063 int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
4064 gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
4066 if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
4072 return direct_page_fault(vcpu, fault);
4075 static void nonpaging_init_context(struct kvm_mmu *context)
4077 context->page_fault = nonpaging_page_fault;
4078 context->gva_to_gpa = nonpaging_gva_to_gpa;
4079 context->sync_page = nonpaging_sync_page;
4080 context->invlpg = NULL;
4081 context->direct_map = true;
4084 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
4085 union kvm_mmu_page_role role)
4087 return (role.direct || pgd == root->pgd) &&
4088 VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) &&
4089 role.word == to_shadow_page(root->hpa)->role.word;
4093 * Find out if a previously cached root matching the new pgd/role is available.
4094 * The current root is also inserted into the cache.
4095 * If a matching root was found, it is assigned to kvm_mmu->root.hpa and true is returned.
4097 * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root.hpa and
4098 * false is returned. This root should now be freed by the caller.
4100 static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd,
4101 union kvm_mmu_page_role new_role)
4104 struct kvm_mmu *mmu = vcpu->arch.mmu;
4106 if (is_root_usable(&mmu->root, new_pgd, new_role))
4109 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4110 swap(mmu->root, mmu->prev_roots[i]);
4112 if (is_root_usable(&mmu->root, new_pgd, new_role))
4116 return i < KVM_MMU_NUM_PREV_ROOTS;
4119 static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
4120 union kvm_mmu_page_role new_role)
4122 struct kvm_mmu *mmu = vcpu->arch.mmu;
4125 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
4126 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4127 * later if necessary.
4129 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4130 mmu->root_level >= PT64_ROOT_4LEVEL)
4131 return cached_root_available(vcpu, new_pgd, new_role);
4136 static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
4137 union kvm_mmu_page_role new_role)
4139 struct kvm_mmu *mmu = vcpu->arch.mmu;
4141 if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
4142 kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
4147 * It's possible that the cached previous root page is obsolete because
4148 * of a change in the MMU generation number. However, changing the
4149 * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will
4150 * free the root set here and allocate a new one.
4152 kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
4154 if (force_flush_and_sync_on_reuse) {
4155 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4156 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
4160 * The last MMIO access's GVA and GPA are cached in the VCPU. When
4161 * switching to a new CR3, that GVA->GPA mapping may no longer be
4162 * valid. So clear any cached MMIO info even when we don't need to sync
4163 * the shadow page tables.
4165 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4168 * If this is a direct root page, it doesn't have a write flooding
4169 * count. Otherwise, clear the write flooding count.
4171 if (!new_role.direct)
4172 __clear_sp_write_flooding_count(
4173 to_shadow_page(vcpu->arch.mmu->root.hpa));
4176 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
4178 __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu));
4180 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
4182 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4184 return kvm_read_cr3(vcpu);
4187 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4188 unsigned int access)
4190 if (unlikely(is_mmio_spte(*sptep))) {
4191 if (gfn != get_mmio_spte_gfn(*sptep)) {
4192 mmu_spte_clear_no_track(sptep);
4196 mark_mmio_spte(vcpu, sptep, gfn, access);
4203 #define PTTYPE_EPT 18 /* arbitrary */
4204 #define PTTYPE PTTYPE_EPT
4205 #include "paging_tmpl.h"
4209 #include "paging_tmpl.h"
4213 #include "paging_tmpl.h"
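/*
 * The three includes above instantiate paging_tmpl.h once per guest paging
 * flavor: with PTTYPE set to PTTYPE_EPT, 64 and 32 in turn, the template
 * expands into the ept_*, paging64_* and paging32_* page-fault, gva_to_gpa
 * and sync helpers referenced by the context initialization code below.
 */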
4217 __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
4218 u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
4221 u64 gbpages_bit_rsvd = 0;
4222 u64 nonleaf_bit8_rsvd = 0;
4225 rsvd_check->bad_mt_xwr = 0;
4228 gbpages_bit_rsvd = rsvd_bits(7, 7);
4230 if (level == PT32E_ROOT_LEVEL)
4231 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
4233 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4235 /* Note, NX doesn't exist in PDPTEs, this is handled below. */
4237 high_bits_rsvd |= rsvd_bits(63, 63);
4240 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4241 * leaf entries) on AMD CPUs only.
4244 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4247 case PT32_ROOT_LEVEL:
4248 /* no rsvd bits for 2 level 4K page table entries */
4249 rsvd_check->rsvd_bits_mask[0][1] = 0;
4250 rsvd_check->rsvd_bits_mask[0][0] = 0;
4251 rsvd_check->rsvd_bits_mask[1][0] =
4252 rsvd_check->rsvd_bits_mask[0][0];
4255 rsvd_check->rsvd_bits_mask[1][1] = 0;
4259 if (is_cpuid_PSE36())
4260 /* 36bits PSE 4MB page */
4261 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4263 /* 32 bits PSE 4MB page */
4264 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4266 case PT32E_ROOT_LEVEL:
4267 rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
4270 rsvd_bits(1, 2); /* PDPTE */
4271 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */
4272 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */
4273 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4274 rsvd_bits(13, 20); /* large page */
4275 rsvd_check->rsvd_bits_mask[1][0] =
4276 rsvd_check->rsvd_bits_mask[0][0];
4278 case PT64_ROOT_5LEVEL:
4279 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
4282 rsvd_check->rsvd_bits_mask[1][4] =
4283 rsvd_check->rsvd_bits_mask[0][4];
4285 case PT64_ROOT_4LEVEL:
4286 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
4289 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
4291 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
4292 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4293 rsvd_check->rsvd_bits_mask[1][3] =
4294 rsvd_check->rsvd_bits_mask[0][3];
4295 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
4298 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4299 rsvd_bits(13, 20); /* large page */
4300 rsvd_check->rsvd_bits_mask[1][0] =
4301 rsvd_check->rsvd_bits_mask[0][0];
4306 static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
4309 * If TDP is enabled, let the guest use GBPAGES if they're supported in
4310 * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
4311 * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
4312 * walk for performance and complexity reasons. Not to mention KVM
4313 * _can't_ solve the problem because GVA->GPA walks aren't visible to
4314 * KVM once a TDP translation is installed. Mimic hardware behavior so
4315 * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
4317 return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
4318 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
4321 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4322 struct kvm_mmu *context)
4324 __reset_rsvds_bits_mask(&context->guest_rsvd_check,
4325 vcpu->arch.reserved_gpa_bits,
4326 context->root_level, is_efer_nx(context),
4327 guest_can_use_gbpages(vcpu),
4328 is_cr4_pse(context),
4329 guest_cpuid_is_amd_or_hygon(vcpu));
4333 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4334 u64 pa_bits_rsvd, bool execonly, int huge_page_level)
4336 u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4337 u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
4340 if (huge_page_level < PG_LEVEL_1G)
4341 large_1g_rsvd = rsvd_bits(7, 7);
4342 if (huge_page_level < PG_LEVEL_2M)
4343 large_2m_rsvd = rsvd_bits(7, 7);
4345 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
4346 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
4347 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
4348 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
4349 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
4352 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4353 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4354 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
4355 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
4356 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4358 bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
4359 bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
4360 bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
4361 bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
4362 bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
4364 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4365 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4367 rsvd_check->bad_mt_xwr = bad_mt_xwr;
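/*
 * Illustrative note (the consumer is assumed to be __is_bad_mt_xwr() in
 * spte.h): bad_mt_xwr is a 64-entry bitmap indexed by the low 6 bits of an
 * EPT SPTE, i.e. XWR in bits 2:0 and the memory type in bits 5:3. A lookup
 * is then a single bit test, roughly:
 *
 *	bad = bad_mt_xwr & BIT_ULL(spte & 0x3f);
 */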
4370 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4371 struct kvm_mmu *context, bool execonly, int huge_page_level)
4373 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4374 vcpu->arch.reserved_gpa_bits, execonly,
4378 static inline u64 reserved_hpa_bits(void)
4380 return rsvd_bits(shadow_phys_bits, 63);
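/*
 * Example (illustrative): with shadow_phys_bits == 46, the returned mask
 * covers host physical-address bits 46..63, so any SPTE with one of those
 * bits set is treated as having reserved bits.
 */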
4384 * The page table on the host is the shadow page table for the page
4385 * table in the guest or an AMD nested guest; its mmu features completely
4386 * follow the features of the guest.
4388 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4389 struct kvm_mmu *context)
4392 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
4393 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
4394 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
4395 * The iTLB multi-hit workaround can be toggled at any time, so assume
4396 * NX can be used by any non-nested shadow MMU to avoid having to reset
4397 * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled.
4399 bool uses_nx = is_efer_nx(context) || !tdp_enabled;
4401 /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
4403 /* KVM doesn't use 2-level page tables for the shadow MMU. */
4404 bool is_pse = false;
4405 struct rsvd_bits_validate *shadow_zero_check;
4408 WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL);
4410 shadow_zero_check = &context->shadow_zero_check;
4411 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4412 context->shadow_root_level, uses_nx,
4413 guest_can_use_gbpages(vcpu), is_pse, is_amd);
4415 if (!shadow_me_mask)
4418 for (i = context->shadow_root_level; --i >= 0;) {
4419 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4420 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4425 static inline bool boot_cpu_is_amd(void)
4427 WARN_ON_ONCE(!tdp_enabled);
4428 return shadow_x_mask == 0;
4432 * The direct page table on the host uses as many mmu features as
4433 * possible; however, kvm currently does not do execution-protection.
4436 reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
4438 struct rsvd_bits_validate *shadow_zero_check;
4441 shadow_zero_check = &context->shadow_zero_check;
4443 if (boot_cpu_is_amd())
4444 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
4445 context->shadow_root_level, false,
4446 boot_cpu_has(X86_FEATURE_GBPAGES),
4449 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4450 reserved_hpa_bits(), false,
4451 max_huge_page_level);
4453 if (!shadow_me_mask)
4456 for (i = context->shadow_root_level; --i >= 0;) {
4457 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4458 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4463 * Same as the comments in reset_shadow_zero_bits_mask(), except this
4464 * is the shadow page table for an Intel nested guest.
4467 reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
4469 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4470 reserved_hpa_bits(), execonly,
4471 max_huge_page_level);
4474 #define BYTE_MASK(access) \
4475 ((1 & (access) ? 2 : 0) | \
4476 (2 & (access) ? 4 : 0) | \
4477 (3 & (access) ? 8 : 0) | \
4478 (4 & (access) ? 16 : 0) | \
4479 (5 & (access) ? 32 : 0) | \
4480 (6 & (access) ? 64 : 0) | \
4481 (7 & (access) ? 128 : 0))
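/*
 * Worked expansion (illustrative, assuming ACC_EXEC_MASK == 1,
 * ACC_WRITE_MASK == 2 and ACC_USER_MASK == 4): bit i (1 <= i <= 7) of
 * BYTE_MASK(access) is set when UWX combination i includes the given
 * permission, so
 *
 *	x = BYTE_MASK(ACC_EXEC_MASK)  == 0xaa
 *	w = BYTE_MASK(ACC_WRITE_MASK) == 0xcc
 *	u = BYTE_MASK(ACC_USER_MASK)  == 0xf0
 */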
4484 static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
4488 const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4489 const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4490 const u8 u = BYTE_MASK(ACC_USER_MASK);
4492 bool cr4_smep = is_cr4_smep(mmu);
4493 bool cr4_smap = is_cr4_smap(mmu);
4494 bool cr0_wp = is_cr0_wp(mmu);
4495 bool efer_nx = is_efer_nx(mmu);
4497 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4498 unsigned pfec = byte << 1;
4501 * Each "*f" variable has a 1 bit for each UWX value
4502 * that causes a fault with the given PFEC.
4505 /* Faults from writes to non-writable pages */
4506 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4507 /* Faults from user mode accesses to supervisor pages */
4508 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4509 /* Faults from fetches of non-executable pages*/
4510 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4511 /* Faults from kernel mode fetches of user pages */
4513 /* Faults from kernel mode accesses of user pages */
4517 /* Faults from kernel mode accesses to user pages */
4518 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4520 /* Not really needed: !nx will cause pte.nx to fault */
4524 /* Allow supervisor writes if !cr0.wp */
4526 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4528 /* Disallow supervisor fetches of user code if cr4.smep */
4530 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4533 * SMAP: kernel-mode data accesses to user-mode
4534 * mappings should fault. A fault is considered
4535 * a SMAP violation if all of the following
4536 * conditions are true:
4537 * - X86_CR4_SMAP is set in CR4
4538 * - A user page is accessed
4539 * - The access is not a fetch
4540 * - The page fault occurs in kernel mode
4541 * - CPL = 3 or X86_EFLAGS_AC is clear
4543 * Here, we cover the first four conditions.
4544 * The fifth is computed dynamically in permission_fault();
4545 * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4546 * *not* subject to SMAP restrictions.
4549 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4552 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
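/*
 * Rough sketch of the consumer side: permission_fault() indexes this table
 * with the error code minus the P bit (roughly index = pfec >> 1) and
 * reports a fault iff bit 'pte_access' (the page's UWX bits, 0-7) is set
 * in mmu->permissions[index]. For example, a user-mode write
 * (PFERR_USER_MASK | PFERR_WRITE_MASK) to a supervisor, read-only page is
 * caught by both the uf and wf contributions computed above.
 */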
4557 * PKU is an additional mechanism by which the paging controls access to
4558 * user-mode addresses based on the value in the PKRU register. Protection
4559 * key violations are reported through a bit in the page fault error code.
4560 * Unlike other bits of the error code, the PK bit is not known at the
4561 * call site of e.g. gva_to_gpa; it must be computed directly in
4562 * permission_fault based on two bits of PKRU, on some machine state (CR4,
4563 * CR0, EFER, CPL), and on other bits of the error code and the page tables.
4565 * In particular the following conditions come from the error code, the
4566 * page tables and the machine state:
4567 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4568 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4569 * - PK is always zero if U=0 in the page tables
4570 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4572 * The PKRU bitmask caches the result of these four conditions. The error
4573 * code (minus the P bit) and the page table's U bit form an index into the
4574 * PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
4575 * with the two bits of the PKRU register corresponding to the protection key.
4576 * For the first three conditions above the bits will be 00, thus masking
4577 * away both AD and WD. For all reads or if the last condition holds, WD
4578 * only will be masked away.
4580 static void update_pkru_bitmask(struct kvm_mmu *mmu)
4587 if (!is_cr4_pke(mmu))
4590 wp = is_cr0_wp(mmu);
4592 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4593 unsigned pfec, pkey_bits;
4594 bool check_pkey, check_write, ff, uf, wf, pte_user;
4597 ff = pfec & PFERR_FETCH_MASK;
4598 uf = pfec & PFERR_USER_MASK;
4599 wf = pfec & PFERR_WRITE_MASK;
4601 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4602 pte_user = pfec & PFERR_RSVD_MASK;
4605 * Only accesses that are not instruction fetches
4606 * and that target a user page need to be checked.
4608 check_pkey = (!ff && pte_user);
4610 * write access is controlled by PKRU if it is a
4611 * user access or CR0.WP = 1.
4613 check_write = check_pkey && wf && (uf || wp);
4615 /* PKRU.AD stops both read and write access. */
4616 pkey_bits = !!check_pkey;
4617 /* PKRU.WD stops write access. */
4618 pkey_bits |= (!!check_write) << 1;
4620 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
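/*
 * Worked example: a supervisor-mode data write to a user page with
 * CR0.WP = 1 has ff = 0, uf = 0, wf = 1 and pte_user = 1 (PFERR_RSVD_MASK
 * standing in for ACC_USER_MASK), so check_pkey and check_write are both
 * true and pkey_bits == 3: both PKRU.AD and PKRU.WD of the page's
 * protection key are honored for that fault-code/U-bit combination.
 */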
4624 static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
4625 struct kvm_mmu *mmu)
4627 if (!is_cr0_pg(mmu))
4630 reset_rsvds_bits_mask(vcpu, mmu);
4631 update_permission_bitmask(mmu, false);
4632 update_pkru_bitmask(mmu);
4635 static void paging64_init_context(struct kvm_mmu *context)
4637 context->page_fault = paging64_page_fault;
4638 context->gva_to_gpa = paging64_gva_to_gpa;
4639 context->sync_page = paging64_sync_page;
4640 context->invlpg = paging64_invlpg;
4641 context->direct_map = false;
4644 static void paging32_init_context(struct kvm_mmu *context)
4646 context->page_fault = paging32_page_fault;
4647 context->gva_to_gpa = paging32_gva_to_gpa;
4648 context->sync_page = paging32_sync_page;
4649 context->invlpg = paging32_invlpg;
4650 context->direct_map = false;
4653 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
4654 struct kvm_mmu_role_regs *regs)
4656 union kvm_mmu_extended_role ext = {0};
4658 if (____is_cr0_pg(regs)) {
4660 ext.cr4_pae = ____is_cr4_pae(regs);
4661 ext.cr4_smep = ____is_cr4_smep(regs);
4662 ext.cr4_smap = ____is_cr4_smap(regs);
4663 ext.cr4_pse = ____is_cr4_pse(regs);
4665 /* PKEY and LA57 are active iff long mode is active. */
4666 ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
4667 ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
4668 ext.efer_lma = ____is_efer_lma(regs);
4676 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4677 struct kvm_mmu_role_regs *regs,
4680 union kvm_mmu_role role = {0};
4682 role.base.access = ACC_ALL;
4683 if (____is_cr0_pg(regs)) {
4684 role.base.efer_nx = ____is_efer_nx(regs);
4685 role.base.cr0_wp = ____is_cr0_wp(regs);
4687 role.base.smm = is_smm(vcpu);
4688 role.base.guest_mode = is_guest_mode(vcpu);
4693 role.ext = kvm_calc_mmu_role_ext(vcpu, regs);
4698 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
4700 /* tdp_root_level is the architecturally forced level; use it if nonzero. */
4702 return tdp_root_level;
4704 /* Use 5-level TDP if and only if it's useful/necessary. */
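/*
 * E.g. a guest with MAXPHYADDR <= 48 can be served by 4-level TDP, which
 * already covers 48-bit guest-physical addresses; using 5-level TDP there
 * would only add an extra level to every page-table walk.
 */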
4705 if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
4708 return max_tdp_level;
4711 static union kvm_mmu_role
4712 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
4713 struct kvm_mmu_role_regs *regs, bool base_only)
4715 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
4717 role.base.ad_disabled = (shadow_accessed_mask == 0);
4718 role.base.level = kvm_mmu_get_tdp_level(vcpu);
4719 role.base.direct = true;
4720 role.base.has_4_byte_gpte = false;
4725 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
4727 struct kvm_mmu *context = &vcpu->arch.root_mmu;
4728 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
4729 union kvm_mmu_role new_role =
4730 kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, false);
4732 if (new_role.as_u64 == context->mmu_role.as_u64)
4735 context->mmu_role.as_u64 = new_role.as_u64;
4736 context->page_fault = kvm_tdp_page_fault;
4737 context->sync_page = nonpaging_sync_page;
4738 context->invlpg = NULL;
4739 context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
4740 context->direct_map = true;
4741 context->get_guest_pgd = get_cr3;
4742 context->get_pdptr = kvm_pdptr_read;
4743 context->inject_page_fault = kvm_inject_page_fault;
4744 context->root_level = role_regs_to_root_level(®s);
4746 if (!is_cr0_pg(context))
4747 context->gva_to_gpa = nonpaging_gva_to_gpa;
4748 else if (is_cr4_pae(context))
4749 context->gva_to_gpa = paging64_gva_to_gpa;
4751 context->gva_to_gpa = paging32_gva_to_gpa;
4753 reset_guest_paging_metadata(vcpu, context);
4754 reset_tdp_shadow_zero_bits_mask(context);
4757 static union kvm_mmu_role
4758 kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
4759 struct kvm_mmu_role_regs *regs, bool base_only)
4761 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
4763 role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
4764 role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
4765 role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs);
4770 static union kvm_mmu_role
4771 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
4772 struct kvm_mmu_role_regs *regs, bool base_only)
4774 union kvm_mmu_role role =
4775 kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only);
4777 role.base.direct = !____is_cr0_pg(regs);
4779 if (!____is_efer_lma(regs))
4780 role.base.level = PT32E_ROOT_LEVEL;
4781 else if (____is_cr4_la57(regs))
4782 role.base.level = PT64_ROOT_5LEVEL;
4784 role.base.level = PT64_ROOT_4LEVEL;
4789 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
4790 struct kvm_mmu_role_regs *regs,
4791 union kvm_mmu_role new_role)
4793 if (new_role.as_u64 == context->mmu_role.as_u64)
4796 context->mmu_role.as_u64 = new_role.as_u64;
4798 if (!is_cr0_pg(context))
4799 nonpaging_init_context(context);
4800 else if (is_cr4_pae(context))
4801 paging64_init_context(context);
4803 paging32_init_context(context);
4804 context->root_level = role_regs_to_root_level(regs);
4806 reset_guest_paging_metadata(vcpu, context);
4807 context->shadow_root_level = new_role.base.level;
4809 reset_shadow_zero_bits_mask(vcpu, context);
4812 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
4813 struct kvm_mmu_role_regs *regs)
4815 struct kvm_mmu *context = &vcpu->arch.root_mmu;
4816 union kvm_mmu_role new_role =
4817 kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false);
4819 shadow_mmu_init_context(vcpu, context, regs, new_role);
4822 static union kvm_mmu_role
4823 kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
4824 struct kvm_mmu_role_regs *regs)
4826 union kvm_mmu_role role =
4827 kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
4829 role.base.direct = false;
4830 role.base.level = kvm_mmu_get_tdp_level(vcpu);
4835 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
4836 unsigned long cr4, u64 efer, gpa_t nested_cr3)
4838 struct kvm_mmu *context = &vcpu->arch.guest_mmu;
4839 struct kvm_mmu_role_regs regs = {
4841 .cr4 = cr4 & ~X86_CR4_PKE,
4844 union kvm_mmu_role new_role;
4846 new_role = kvm_calc_shadow_npt_root_page_role(vcpu, ®s);
4848 __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base);
4850 shadow_mmu_init_context(vcpu, context, ®s, new_role);
4852 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
4854 static union kvm_mmu_role
4855 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
4856 bool execonly, u8 level)
4858 union kvm_mmu_role role = {0};
4860 /* SMM flag is inherited from root_mmu */
4861 role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
4863 role.base.level = level;
4864 role.base.has_4_byte_gpte = false;
4865 role.base.direct = false;
4866 role.base.ad_disabled = !accessed_dirty;
4867 role.base.guest_mode = true;
4868 role.base.access = ACC_ALL;
4870 /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
4872 role.ext.execonly = execonly;
4878 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
4879 int huge_page_level, bool accessed_dirty,
4882 struct kvm_mmu *context = &vcpu->arch.guest_mmu;
4883 u8 level = vmx_eptp_page_walk_level(new_eptp);
4884 union kvm_mmu_role new_role =
4885 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
4888 __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base);
4890 if (new_role.as_u64 == context->mmu_role.as_u64)
4893 context->mmu_role.as_u64 = new_role.as_u64;
4895 context->shadow_root_level = level;
4897 context->ept_ad = accessed_dirty;
4898 context->page_fault = ept_page_fault;
4899 context->gva_to_gpa = ept_gva_to_gpa;
4900 context->sync_page = ept_sync_page;
4901 context->invlpg = ept_invlpg;
4902 context->root_level = level;
4903 context->direct_map = false;
4905 update_permission_bitmask(context, true);
4906 context->pkru_mask = 0;
4907 reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
4908 reset_ept_shadow_zero_bits_mask(context, execonly);
4910 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
4912 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
4914 struct kvm_mmu *context = &vcpu->arch.root_mmu;
4915 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
4917 kvm_init_shadow_mmu(vcpu, ®s);
4919 context->get_guest_pgd = get_cr3;
4920 context->get_pdptr = kvm_pdptr_read;
4921 context->inject_page_fault = kvm_inject_page_fault;
4924 static union kvm_mmu_role
4925 kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs)
4927 union kvm_mmu_role role;
4929 role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
4932 * Nested MMUs are used only for walking L2's gva->gpa; they never have
4933 * shadow pages of their own, so "direct" has no meaning. Set it
4934 * to "true" to try to detect bogus usage of the nested MMU.
4936 role.base.direct = true;
4937 role.base.level = role_regs_to_root_level(regs);
4941 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
4943 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
4944 union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, ®s);
4945 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
4947 if (new_role.as_u64 == g_context->mmu_role.as_u64)
4950 g_context->mmu_role.as_u64 = new_role.as_u64;
4951 g_context->get_guest_pgd = get_cr3;
4952 g_context->get_pdptr = kvm_pdptr_read;
4953 g_context->inject_page_fault = kvm_inject_page_fault;
4954 g_context->root_level = new_role.base.level;
4957 * L2 page tables are never shadowed, so there is no need to sync SPTEs.
4960 g_context->invlpg = NULL;
4963 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
4964 * L1's nested page tables (e.g. EPT12). The nested translation
4965 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
4966 * L2's page tables as the first level of translation and L1's
4967 * nested page tables as the second level of translation. Basically
4968 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
4970 if (!is_paging(vcpu))
4971 g_context->gva_to_gpa = nonpaging_gva_to_gpa;
4972 else if (is_long_mode(vcpu))
4973 g_context->gva_to_gpa = paging64_gva_to_gpa;
4974 else if (is_pae(vcpu))
4975 g_context->gva_to_gpa = paging64_gva_to_gpa;
4977 g_context->gva_to_gpa = paging32_gva_to_gpa;
4979 reset_guest_paging_metadata(vcpu, g_context);
4982 void kvm_init_mmu(struct kvm_vcpu *vcpu)
4984 if (mmu_is_nested(vcpu))
4985 init_kvm_nested_mmu(vcpu);
4986 else if (tdp_enabled)
4987 init_kvm_tdp_mmu(vcpu);
4989 init_kvm_softmmu(vcpu);
4991 EXPORT_SYMBOL_GPL(kvm_init_mmu);
4993 static union kvm_mmu_page_role
4994 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
4996 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
4997 union kvm_mmu_role role;
5000 role = kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, true);
5002 role = kvm_calc_shadow_mmu_root_page_role(vcpu, ®s, true);
5007 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
5010 * Invalidate all MMU roles to force them to reinitialize as CPUID
5011 * information is factored into reserved bit calculations.
5013 * Correctly handling multiple vCPU models with respect to paging and
5014 * physical address properties in a single VM would require tracking
5015 * all relevant CPUID information in kvm_mmu_page_role. That is very
5016 * undesirable as it would increase the memory requirements for
5017 * gfn_track (see struct kvm_mmu_page_role comments). For now that
5018 * problem is swept under the rug; KVM's CPUID API is horrific and
5019 * it's all but impossible to solve it without introducing a new API.
5021 vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
5022 vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
5023 vcpu->arch.nested_mmu.mmu_role.ext.valid = 0;
5024 kvm_mmu_reset_context(vcpu);
5027 * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
5028 * kvm_arch_vcpu_ioctl().
5030 KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
5033 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5035 kvm_mmu_unload(vcpu);
5038 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5040 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5044 r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
5047 r = mmu_alloc_special_roots(vcpu);
5050 if (vcpu->arch.mmu->direct_map)
5051 r = mmu_alloc_direct_roots(vcpu);
5053 r = mmu_alloc_shadow_roots(vcpu);
5057 kvm_mmu_sync_roots(vcpu);
5059 kvm_mmu_load_pgd(vcpu);
5060 static_call(kvm_x86_flush_tlb_current)(vcpu);
5065 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5067 struct kvm *kvm = vcpu->kvm;
5069 kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5070 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
5071 kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5072 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
5075 static bool need_remote_flush(u64 old, u64 new)
5077 if (!is_shadow_present_pte(old))
5079 if (!is_shadow_present_pte(new))
5081 if ((old ^ new) & PT64_BASE_ADDR_MASK)
5083 old ^= shadow_nx_mask;
5084 new ^= shadow_nx_mask;
5085 return (old & ~new & PT64_PERM_MASK) != 0;
5088 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5095 * Assume that the pte write is on a page table of the same type
5096 * as the current vcpu paging mode, since we update the sptes only
5097 * when they have the same mode.
5099 if (is_pae(vcpu) && *bytes == 4) {
5100 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5105 if (*bytes == 4 || *bytes == 8) {
5106 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5115 * If we're seeing too many writes to a page, it may no longer be a page table,
5116 * or we may be forking, in which case it is better to unmap the page.
5118 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5121 * Skip write-flooding detection for an sp whose level is 1 (4K), because
5122 * it can become unsync, in which case the guest page is not write-protected.
5124 if (sp->role.level == PG_LEVEL_4K)
5127 atomic_inc(&sp->write_flooding_count);
5128 return atomic_read(&sp->write_flooding_count) >= 3;
5132 * Misaligned accesses are too much trouble to fix up; also, they usually
5133 * indicate a page is not used as a page table.
5135 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5138 unsigned offset, pte_size, misaligned;
5140 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5141 gpa, bytes, sp->role.word);
5143 offset = offset_in_page(gpa);
5144 pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
5147 * Sometimes the OS only writes the last byte to update status
5148 * bits; for example, Linux uses the andb instruction in clear_bit().
5150 if (!(offset & (pte_size - 1)) && bytes == 1)
5153 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5154 misaligned |= bytes < 4;
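/*
 * Example: with 8-byte gptes, a 4-byte write at offset 4 stays within a
 * single gpte ((4 ^ 7) & ~7 == 0) and is not flagged, whereas a 4-byte
 * write at offset 6 straddles two gptes ((6 ^ 9) & ~7 == 8) and is
 * treated as misaligned.
 */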
5159 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5161 unsigned page_offset, quadrant;
5165 page_offset = offset_in_page(gpa);
5166 level = sp->role.level;
5168 if (sp->role.has_4_byte_gpte) {
5169 page_offset <<= 1; /* 32->64 */
5171 * A 32-bit pde maps 4MB while the shadow pdes map
5172 * only 2MB. So we need to double the offset again
5173 * and zap two pdes instead of one.
5175 if (level == PT32_ROOT_LEVEL) {
5176 page_offset &= ~7; /* kill rounding error */
5180 quadrant = page_offset >> PAGE_SHIFT;
5181 page_offset &= ~PAGE_MASK;
5182 if (quadrant != sp->role.quadrant)
5186 spte = &sp->spt[page_offset / sizeof(*spte)];
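/*
 * Example: for 4-byte gptes, a write at page offset 0x800 is doubled to
 * 0x1000, giving quadrant 1; only the shadow page with role.quadrant == 1
 * maps that half of the guest page, so other quadrants yield no sptes.
 */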
5190 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5191 const u8 *new, int bytes,
5192 struct kvm_page_track_notifier_node *node)
5194 gfn_t gfn = gpa >> PAGE_SHIFT;
5195 struct kvm_mmu_page *sp;
5196 LIST_HEAD(invalid_list);
5197 u64 entry, gentry, *spte;
5202 * If we don't have indirect shadow pages, it means no page is
5203 * write-protected, so we can simply return.
5205 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5208 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5211 * No need to care whether the memory allocation succeeded
5212 * or not, since pte prefetch is skipped if the cache does not
5213 * have enough objects.
5215 mmu_topup_memory_caches(vcpu, true);
5217 write_lock(&vcpu->kvm->mmu_lock);
5219 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5221 ++vcpu->kvm->stat.mmu_pte_write;
5223 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5224 if (detect_write_misaligned(sp, gpa, bytes) ||
5225 detect_write_flooding(sp)) {
5226 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5227 ++vcpu->kvm->stat.mmu_flooded;
5231 spte = get_written_sptes(sp, gpa, &npte);
5237 mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
5238 if (gentry && sp->role.level != PG_LEVEL_4K)
5239 ++vcpu->kvm->stat.mmu_pde_zapped;
5240 if (need_remote_flush(entry, *spte))
5245 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
5246 write_unlock(&vcpu->kvm->mmu_lock);
5249 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5250 void *insn, int insn_len)
5252 int r, emulation_type = EMULTYPE_PF;
5253 bool direct = vcpu->arch.mmu->direct_map;
5255 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
5256 return RET_PF_RETRY;
5259 if (unlikely(error_code & PFERR_RSVD_MASK)) {
5260 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5261 if (r == RET_PF_EMULATE)
5265 if (r == RET_PF_INVALID) {
5266 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5267 lower_32_bits(error_code), false);
5268 if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
5274 if (r != RET_PF_EMULATE)
5278 * Before emulating the instruction, check if the error code
5279 * was due to a RO violation while translating the guest page.
5280 * This can occur when using nested virtualization with nested
5281 * paging in both guests. If true, we simply unprotect the page
5282 * and resume the guest.
5284 if (vcpu->arch.mmu->direct_map &&
5285 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5286 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5291 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5292 * optimistically try to just unprotect the page and let the processor
5293 * re-execute the instruction that caused the page fault. Do not allow
5294 * retrying MMIO emulation, as it's not only pointless but could also
5295 * cause us to enter an infinite loop because the processor will keep
5296 * faulting on the non-existent MMIO address. Retrying an instruction
5297 * from a nested guest is also pointless and dangerous as we are only
5298 * explicitly shadowing L1's page tables, i.e. unprotecting something
5299 * for L1 isn't going to magically fix whatever issue caused L2 to fail.
5301 if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5302 emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
5304 return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5307 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5309 void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5310 gva_t gva, hpa_t root_hpa)
5314 /* It's actually a GPA for vcpu->arch.guest_mmu. */
5315 if (mmu != &vcpu->arch.guest_mmu) {
5316 /* INVLPG on a non-canonical address is a NOP according to the SDM. */
5317 if (is_noncanonical_address(gva, vcpu))
5320 static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5326 if (root_hpa == INVALID_PAGE) {
5327 mmu->invlpg(vcpu, gva, mmu->root.hpa);
5330 * INVLPG is required to invalidate any global mappings for the VA,
5331 * irrespective of PCID. Since it would take us a roughly similar amount
5332 * of work to determine whether any of the prev_root mappings of the VA
5333 * is marked global as it would to just sync it blindly, we might as well
5334 * just always sync it.
5336 * Mappings not reachable via the current cr3 or the prev_roots will be
5337 * synced when switching to that cr3, so nothing needs to be done here
5340 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5341 if (VALID_PAGE(mmu->prev_roots[i].hpa))
5342 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5344 mmu->invlpg(vcpu, gva, root_hpa);
5348 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5350 kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
5351 ++vcpu->stat.invlpg;
5353 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5356 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5358 struct kvm_mmu *mmu = vcpu->arch.mmu;
5359 bool tlb_flush = false;
5362 if (pcid == kvm_get_active_pcid(vcpu)) {
5363 mmu->invlpg(vcpu, gva, mmu->root.hpa);
5367 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5368 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5369 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
5370 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5376 static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5378 ++vcpu->stat.invlpg;
5381 * Mappings not reachable via the current cr3 or the prev_roots will be
5382 * synced when switching to that cr3, so nothing needs to be done here for them.
5387 void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
5388 int tdp_max_root_level, int tdp_huge_page_level)
5390 tdp_enabled = enable_tdp;
5391 tdp_root_level = tdp_forced_root_level;
5392 max_tdp_level = tdp_max_root_level;
5395 * max_huge_page_level reflects KVM's MMU capabilities irrespective
5396 * of kernel support, e.g. KVM may be capable of using 1GB pages when
5397 * the kernel is not. But, KVM never creates a page size greater than
5398 * what is used by the kernel for any given HVA, i.e. the kernel's
5399 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
5402 max_huge_page_level = tdp_huge_page_level;
5403 else if (boot_cpu_has(X86_FEATURE_GBPAGES))
5404 max_huge_page_level = PG_LEVEL_1G;
5406 max_huge_page_level = PG_LEVEL_2M;
5408 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
5410 /* The return value indicates if tlb flush on all vcpus is needed. */
5411 typedef bool (*slot_level_handler) (struct kvm *kvm,
5412 struct kvm_rmap_head *rmap_head,
5413 const struct kvm_memory_slot *slot);
5415 /* The caller should hold mmu-lock before calling this function. */
5416 static __always_inline bool
5417 slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5418 slot_level_handler fn, int start_level, int end_level,
5419 gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
5422 struct slot_rmap_walk_iterator iterator;
5424 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5425 end_gfn, &iterator) {
5427 flush |= fn(kvm, iterator.rmap, memslot);
5429 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
5430 if (flush && flush_on_yield) {
5431 kvm_flush_remote_tlbs_with_address(kvm,
5433 iterator.gfn - start_gfn + 1);
5436 cond_resched_rwlock_write(&kvm->mmu_lock);
5443 static __always_inline bool
5444 slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5445 slot_level_handler fn, int start_level, int end_level,
5446 bool flush_on_yield)
5448 return slot_handle_level_range(kvm, memslot, fn, start_level,
5449 end_level, memslot->base_gfn,
5450 memslot->base_gfn + memslot->npages - 1,
5451 flush_on_yield, false);
5454 static __always_inline bool
5455 slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5456 slot_level_handler fn, bool flush_on_yield)
5458 return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
5459 PG_LEVEL_4K, flush_on_yield);
5462 static void free_mmu_pages(struct kvm_mmu *mmu)
5464 if (!tdp_enabled && mmu->pae_root)
5465 set_memory_encrypted((unsigned long)mmu->pae_root, 1);
5466 free_page((unsigned long)mmu->pae_root);
5467 free_page((unsigned long)mmu->pml4_root);
5468 free_page((unsigned long)mmu->pml5_root);
5471 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5476 mmu->root.hpa = INVALID_PAGE;
5478 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5479 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5481 /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
5482 if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
5486 * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5487 * while the PDP table is a per-vCPU construct that's allocated at MMU
5488 * creation. When emulating 32-bit mode, cr3 is only 32 bits even on
5489 * x86_64. Therefore we need to allocate the PDP table in the first
5490 * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
5491 * generally doesn't use PAE paging and can skip allocating the PDP
5492 * table. The main exception, handled here, is SVM's 32-bit NPT. The
5493 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
5494 * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
5496 if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5499 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5503 mmu->pae_root = page_address(page);
5506 * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
5507 * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
5508 * that KVM's writes and the CPU's reads get along. Note, this is
5509 * only necessary when using shadow paging, as 64-bit NPT can get at
5510 * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
5511 * by 32-bit kernels (when KVM itself uses 32-bit NPT).
5514 set_memory_decrypted((unsigned long)mmu->pae_root, 1);
5516 WARN_ON_ONCE(shadow_me_mask);
5518 for (i = 0; i < 4; ++i)
5519 mmu->pae_root[i] = INVALID_PAE_ROOT;
5524 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5528 vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
5529 vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
5531 vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
5532 vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
5534 vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
5536 vcpu->arch.mmu = &vcpu->arch.root_mmu;
5537 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5539 ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
5543 ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
5545 goto fail_allocate_root;
5549 free_mmu_pages(&vcpu->arch.guest_mmu);
5553 #define BATCH_ZAP_PAGES 10
5554 static void kvm_zap_obsolete_pages(struct kvm *kvm)
5556 struct kvm_mmu_page *sp, *node;
5557 int nr_zapped, batch = 0;
5560 list_for_each_entry_safe_reverse(sp, node,
5561 &kvm->arch.active_mmu_pages, link) {
5563 * No obsolete valid page exists before a newly created page
5564 * since active_mmu_pages is a FIFO list.
5566 if (!is_obsolete_sp(kvm, sp))
5570 * Invalid pages should never land back on the list of active
5571 * pages. Skip the bogus page, otherwise we'll get stuck in an
5572 * infinite loop if the page gets put back on the list (again).
5574 if (WARN_ON(sp->role.invalid))
5578 * No need to flush the TLB since we're only zapping shadow
5579 * pages with an obsolete generation number and all vCPUS have
5580 * loaded a new root, i.e. the shadow pages being zapped cannot
5581 * be in active use by the guest.
5583 if (batch >= BATCH_ZAP_PAGES &&
5584 cond_resched_rwlock_write(&kvm->mmu_lock)) {
5589 if (__kvm_mmu_prepare_zap_page(kvm, sp,
5590 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
5597 * Trigger a remote TLB flush before freeing the page tables to ensure
5598 * KVM is not in the middle of a lockless shadow page table walk, which
5599 * may reference the pages.
5601 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5605 * Fast-invalidate all shadow pages, using a lock-break technique
5606 * to zap obsolete pages.
5608 * This is required when a memslot is being deleted or the VM is being
5609 * destroyed; in those cases, we must ensure that after this function
5610 * returns, the KVM MMU does not use any resource of the slot being
5611 * deleted (or of any slot, for VM destruction).
5613 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5615 lockdep_assert_held(&kvm->slots_lock);
5617 write_lock(&kvm->mmu_lock);
5618 trace_kvm_mmu_zap_all_fast(kvm);
5621 * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is
5622 * held for the entire duration of zapping obsolete pages, it's
5623 * impossible for there to be multiple invalid generations associated
5624 * with *valid* shadow pages at any given time, i.e. there is exactly
5625 * one valid generation and (at most) one invalid generation.
5627 kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5629 /* In order to ensure all threads see this change when
5630 * handling the MMU reload signal, this must happen in the
5631 * same critical section as kvm_reload_remote_mmus, and
5632 * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages
5633 * could drop the MMU lock and yield.
5635 if (is_tdp_mmu_enabled(kvm))
5636 kvm_tdp_mmu_invalidate_all_roots(kvm);
5639 * Notify all vcpus to reload their shadow page tables and flush the TLB.
5640 * Then all vcpus will switch to the new shadow page table with the new mmu_valid_gen.
5643 * Note: we need to do this under the protection of mmu_lock,
5644 * otherwise, vcpu would purge shadow page but miss tlb flush.
5646 kvm_reload_remote_mmus(kvm);
5648 kvm_zap_obsolete_pages(kvm);
5650 write_unlock(&kvm->mmu_lock);
5652 if (is_tdp_mmu_enabled(kvm)) {
5653 read_lock(&kvm->mmu_lock);
5654 kvm_tdp_mmu_zap_invalidated_roots(kvm);
5655 read_unlock(&kvm->mmu_lock);
5659 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5661 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5664 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5665 struct kvm_memory_slot *slot,
5666 struct kvm_page_track_notifier_node *node)
5668 kvm_mmu_zap_all_fast(kvm);
5671 void kvm_mmu_init_vm(struct kvm *kvm)
5673 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5675 spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
5677 kvm_mmu_init_tdp_mmu(kvm);
5679 node->track_write = kvm_mmu_pte_write;
5680 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5681 kvm_page_track_register_notifier(kvm, node);
5684 void kvm_mmu_uninit_vm(struct kvm *kvm)
5686 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5688 kvm_page_track_unregister_notifier(kvm, node);
5690 kvm_mmu_uninit_tdp_mmu(kvm);
5693 static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5695 const struct kvm_memory_slot *memslot;
5696 struct kvm_memslots *slots;
5697 struct kvm_memslot_iter iter;
5702 if (!kvm_memslots_have_rmaps(kvm))
5705 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5706 slots = __kvm_memslots(kvm, i);
5708 kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
5709 memslot = iter.slot;
5710 start = max(gfn_start, memslot->base_gfn);
5711 end = min(gfn_end, memslot->base_gfn + memslot->npages);
5712 if (WARN_ON_ONCE(start >= end))
5715 flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5717 PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
5718 start, end - 1, true, flush);
5726 * Invalidate (zap) SPTEs that cover GFNs from gfn_start up to gfn_end
5727 * (not including gfn_end itself).
5729 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5734 if (WARN_ON_ONCE(gfn_end <= gfn_start))
5737 write_lock(&kvm->mmu_lock);
5739 kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
5741 flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end);
5743 if (is_tdp_mmu_enabled(kvm)) {
5744 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
5745 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
5750 kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
5751 gfn_end - gfn_start);
5753 kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
5755 write_unlock(&kvm->mmu_lock);
5758 static bool slot_rmap_write_protect(struct kvm *kvm,
5759 struct kvm_rmap_head *rmap_head,
5760 const struct kvm_memory_slot *slot)
5762 return rmap_write_protect(rmap_head, false);
5765 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5766 const struct kvm_memory_slot *memslot,
5771 if (kvm_memslots_have_rmaps(kvm)) {
5772 write_lock(&kvm->mmu_lock);
5773 flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
5774 start_level, KVM_MAX_HUGEPAGE_LEVEL,
5776 write_unlock(&kvm->mmu_lock);
5779 if (is_tdp_mmu_enabled(kvm)) {
5780 read_lock(&kvm->mmu_lock);
5781 flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
5782 read_unlock(&kvm->mmu_lock);
5786 * Flush TLBs if any SPTEs had to be write-protected to ensure that
5787 * guest writes are reflected in the dirty bitmap before the memslot
5788 * update completes, i.e. before enabling dirty logging is visible to userspace.
5791 * Perform the TLB flush outside the mmu_lock to reduce the amount of
5792 * time the lock is held. However, this does mean that another CPU can
5793 * now grab mmu_lock and encounter a write-protected SPTE while CPUs
5794 * still have a writable mapping for the associated GFN in their TLB.
5796 * This is safe but requires KVM to be careful when making decisions
5797 * based on the write-protection status of an SPTE. Specifically, KVM
5798 * also write-protects SPTEs to monitor changes to guest page tables
5799 * during shadow paging, and must guarantee no CPUs can write to those
5800 * pages before the lock is dropped. As mentioned in the previous
5801 * paragraph, a write-protected SPTE is no guarantee that a CPU cannot
5802 * perform writes. So to determine if a TLB flush is truly required, KVM
5803 * will clear a separate software-only bit (MMU-writable) and skip the
5804 * flush if-and-only-if this bit was already clear.
5806 * See is_writable_pte() for more details.
5809 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
5812 /* Must be called with the mmu_lock held in write-mode. */
5813 void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
5814 const struct kvm_memory_slot *memslot,
5818 if (is_tdp_mmu_enabled(kvm))
5819 kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
5820 target_level, false);
5823 * A TLB flush is unnecessary at this point for the same reasons as in
5824 * kvm_mmu_slot_try_split_huge_pages().
5828 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
5829 const struct kvm_memory_slot *memslot,
5832 u64 start = memslot->base_gfn;
5833 u64 end = start + memslot->npages;
5835 if (is_tdp_mmu_enabled(kvm)) {
5836 read_lock(&kvm->mmu_lock);
5837 kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
5838 read_unlock(&kvm->mmu_lock);
5842 * No TLB flush is necessary here. KVM will flush TLBs after
5843 * write-protecting and/or clearing dirty on the newly split SPTEs to
5844 * ensure that guest writes are reflected in the dirty log before the
5845 * ioctl to enable dirty logging on this memslot completes. Since the
5846 * split SPTEs retain the write and dirty bits of the huge SPTE, it is
5847 * safe for KVM to decide if a TLB flush is necessary based on the split
5852 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
5853 struct kvm_rmap_head *rmap_head,
5854 const struct kvm_memory_slot *slot)
5857 struct rmap_iterator iter;
5858 int need_tlb_flush = 0;
5860 struct kvm_mmu_page *sp;
5863 for_each_rmap_spte(rmap_head, &iter, sptep) {
5864 sp = sptep_to_sp(sptep);
5865 pfn = spte_to_pfn(*sptep);
5868 * We cannot do huge page mapping for indirect shadow pages,
5869 * which are found on the last rmap (level = 1) when not using
5870 * tdp; such shadow pages are synced with the page table in
5871 * the guest, and the guest page table is using 4K page size
5872 * mapping if the indirect sp has level = 1.
5874 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
5875 sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
5876 pfn, PG_LEVEL_NUM)) {
5877 pte_list_remove(kvm, rmap_head, sptep);
5879 if (kvm_available_flush_tlb_with_range())
5880 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
5881 KVM_PAGES_PER_HPAGE(sp->role.level));
5889 return need_tlb_flush;
5892 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
5893 const struct kvm_memory_slot *slot)
5895 if (kvm_memslots_have_rmaps(kvm)) {
5896 write_lock(&kvm->mmu_lock);
5898 * Zap only 4k SPTEs since the legacy MMU only supports dirty
5899 * logging at a 4k granularity and never creates collapsible
5900 * 2m SPTEs during dirty logging.
5902 if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
5903 kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
5904 write_unlock(&kvm->mmu_lock);
5907 if (is_tdp_mmu_enabled(kvm)) {
5908 read_lock(&kvm->mmu_lock);
5909 kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
5910 read_unlock(&kvm->mmu_lock);
5914 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
5915 const struct kvm_memory_slot *memslot)
5918 * All current use cases for flushing the TLBs for a specific memslot are
5919 * related to dirty logging, and many do the TLB flush out of mmu_lock.
5920 * The interaction between the various operations on a memslot must be
5921 * serialized by slots_lock to ensure the TLB flush from one operation
5922 * is observed by any other operation on the same memslot.
5924 lockdep_assert_held(&kvm->slots_lock);
5925 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5929 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
5930 const struct kvm_memory_slot *memslot)
5934 if (kvm_memslots_have_rmaps(kvm)) {
5935 write_lock(&kvm->mmu_lock);
5937 * Clear dirty bits only on 4k SPTEs since the legacy MMU only
5938 * supports dirty logging at a 4k granularity.
5940 flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
5941 write_unlock(&kvm->mmu_lock);
5944 if (is_tdp_mmu_enabled(kvm)) {
5945 read_lock(&kvm->mmu_lock);
5946 flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
5947 read_unlock(&kvm->mmu_lock);
5951 * It's also safe to flush TLBs out of mmu lock here as currently this
5952 * function is only used for dirty logging, in which case flushing the TLB
5953 * out of mmu lock also guarantees no dirty pages will be lost in dirty_bitmap.
5957 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
5960 void kvm_mmu_zap_all(struct kvm *kvm)
5962 struct kvm_mmu_page *sp, *node;
5963 LIST_HEAD(invalid_list);
5966 write_lock(&kvm->mmu_lock);
5968 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
5969 if (WARN_ON(sp->role.invalid))
5971 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
5973 if (cond_resched_rwlock_write(&kvm->mmu_lock))
5977 kvm_mmu_commit_zap_page(kvm, &invalid_list);
5979 if (is_tdp_mmu_enabled(kvm))
5980 kvm_tdp_mmu_zap_all(kvm);
5982 write_unlock(&kvm->mmu_lock);
5985 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
5987 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
5989 gen &= MMIO_SPTE_GEN_MASK;
5992 * Generation numbers are incremented in multiples of the number of
5993 * address spaces in order to provide unique generations across all
5994 * address spaces. Strip what is effectively the address space
5995 * modifier prior to checking for a wrap of the MMIO generation so
5996 * that a wrap in any address space is detected.
5998 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6001 * The very rare case: if the MMIO generation number has wrapped,
6002 * zap all shadow pages.
6004 if (unlikely(gen == 0)) {
6005 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
6006 kvm_mmu_zap_all_fast(kvm);
6010 static unsigned long
6011 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
6014 int nr_to_scan = sc->nr_to_scan;
6015 unsigned long freed = 0;
6017 mutex_lock(&kvm_lock);
6019 list_for_each_entry(kvm, &vm_list, vm_list) {
6021 LIST_HEAD(invalid_list);
6024 * Never scan more than sc->nr_to_scan VM instances.
6025 * Will not hit this condition practically since we do not try
6026 * to shrink more than one VM and it is very unlikely to see
6027 * !n_used_mmu_pages so many times.
6032 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
6033 * here. We may skip a VM instance erroneously, but we do not
6034 * want to shrink a VM that has only started to populate its MMU anyway.
6037 if (!kvm->arch.n_used_mmu_pages &&
6038 !kvm_has_zapped_obsolete_pages(kvm))
6041 idx = srcu_read_lock(&kvm->srcu);
6042 write_lock(&kvm->mmu_lock);
6044 if (kvm_has_zapped_obsolete_pages(kvm)) {
6045 kvm_mmu_commit_zap_page(kvm,
6046 &kvm->arch.zapped_obsolete_pages);
6050 freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
6053 write_unlock(&kvm->mmu_lock);
6054 srcu_read_unlock(&kvm->srcu, idx);
6057 * unfair on small ones
6058 * per-vm shrinkers cry out
6059 * sadness comes quickly
6061 list_move_tail(&kvm->vm_list, &vm_list);
6065 mutex_unlock(&kvm_lock);
6069 static unsigned long
6070 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6072 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6075 static struct shrinker mmu_shrinker = {
6076 .count_objects = mmu_shrink_count,
6077 .scan_objects = mmu_shrink_scan,
6078 .seeks = DEFAULT_SEEKS * 10,
6081 static void mmu_destroy_caches(void)
6083 kmem_cache_destroy(pte_list_desc_cache);
6084 kmem_cache_destroy(mmu_page_header_cache);
6087 static bool get_nx_auto_mode(void)
6089 /* Return true when CPU has the bug, and mitigations are ON */
6090 return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6093 static void __set_nx_huge_pages(bool val)
6095 nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6098 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6100 bool old_val = nx_huge_pages;
6103 /* In "auto" mode deploy workaround only if CPU has the bug. */
6104 if (sysfs_streq(val, "off"))
6106 else if (sysfs_streq(val, "force"))
6108 else if (sysfs_streq(val, "auto"))
6109 new_val = get_nx_auto_mode();
6110 else if (strtobool(val, &new_val) < 0)
6113 __set_nx_huge_pages(new_val);
6115 if (new_val != old_val) {
6118 mutex_lock(&kvm_lock);
6120 list_for_each_entry(kvm, &vm_list, vm_list) {
6121 mutex_lock(&kvm->slots_lock);
6122 kvm_mmu_zap_all_fast(kvm);
6123 mutex_unlock(&kvm->slots_lock);
6125 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6127 mutex_unlock(&kvm_lock);
6133 int kvm_mmu_module_init(void)
6137 if (nx_huge_pages == -1)
6138 __set_nx_huge_pages(get_nx_auto_mode());
6141 * MMU roles use union aliasing which is, generally speaking,
6142 * undefined behavior. However, we supposedly know how compilers behave
6143 * and the current status quo is unlikely to change. Guardians below are
6144 * supposed to let us know if the assumption becomes false.
6146 BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6147 BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6148 BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
6150 kvm_mmu_reset_all_pte_masks();
6152 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6153 sizeof(struct pte_list_desc),
6154 0, SLAB_ACCOUNT, NULL);
6155 if (!pte_list_desc_cache)
6158 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6159 sizeof(struct kvm_mmu_page),
6160 0, SLAB_ACCOUNT, NULL);
6161 if (!mmu_page_header_cache)
6164 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6167 ret = register_shrinker(&mmu_shrinker);
6174 mmu_destroy_caches();
6178 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6180 kvm_mmu_unload(vcpu);
6181 free_mmu_pages(&vcpu->arch.root_mmu);
6182 free_mmu_pages(&vcpu->arch.guest_mmu);
6183 mmu_free_memory_caches(vcpu);
6186 void kvm_mmu_module_exit(void)
6188 mmu_destroy_caches();
6189 percpu_counter_destroy(&kvm_total_used_mmu_pages);
6190 unregister_shrinker(&mmu_shrinker);
6194 * Calculate the effective recovery period, accounting for '0' meaning "let KVM
6195 * select a halving time of 1 hour". Returns true if recovery is enabled.
6197 static bool calc_nx_huge_pages_recovery_period(uint *period)
6200 * Use READ_ONCE to get the params; this may be called outside of the
6201 * param setters, e.g. by the kthread to compute its next timeout.
6203 bool enabled = READ_ONCE(nx_huge_pages);
6204 uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6206 if (!enabled || !ratio)
6209 *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
6211 /* Make sure the period is not less than one second. */
6212 ratio = min(ratio, 3600u);
6213 *period = 60 * 60 * 1000 / ratio;
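/*
 * E.g. with a recovery ratio of 60 (the default on non-PREEMPT_RT kernels)
 * this yields a period of 60 * 60 * 1000 / 60 = 60000 ms, i.e. 1/60th of
 * the disallowed huge pages are reclaimed every minute and a full pass
 * takes about an hour.
 */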
6218 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
6220 bool was_recovery_enabled, is_recovery_enabled;
6221 uint old_period, new_period;
6224 was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
6226 err = param_set_uint(val, kp);
6230 is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
6232 if (is_recovery_enabled &&
6233 (!was_recovery_enabled || old_period > new_period)) {
6236 mutex_lock(&kvm_lock);
6238 list_for_each_entry(kvm, &vm_list, vm_list)
6239 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6241 mutex_unlock(&kvm_lock);
6247 static void kvm_recover_nx_lpages(struct kvm *kvm)
6249 unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
6251 struct kvm_mmu_page *sp;
6253 LIST_HEAD(invalid_list);
6257 rcu_idx = srcu_read_lock(&kvm->srcu);
6258 write_lock(&kvm->mmu_lock);
6260 ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6261 to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
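/*
 * E.g. with ratio == 60 and 600 NX-disallowed huge pages currently split,
 * a single recovery pass zaps DIV_ROUND_UP(600, 60) == 10 shadow pages.
 */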
6262 for ( ; to_zap; --to_zap) {
6263 if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
6267 * We use a separate list instead of just using active_mmu_pages
6268 * because the number of lpage_disallowed pages is expected to
6269 * be relatively small compared to the total.
6271 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6272 struct kvm_mmu_page,
6273 lpage_disallowed_link);
6274 WARN_ON_ONCE(!sp->lpage_disallowed);
6275 if (is_tdp_mmu_page(sp)) {
6276 flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
6278 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6279 WARN_ON_ONCE(sp->lpage_disallowed);
6282 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
6283 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6284 cond_resched_rwlock_write(&kvm->mmu_lock);
6288 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
6290 write_unlock(&kvm->mmu_lock);
6291 srcu_read_unlock(&kvm->srcu, rcu_idx);
6294 static long get_nx_lpage_recovery_timeout(u64 start_time)
6299 enabled = calc_nx_huge_pages_recovery_period(&period);
6301 return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
6302 : MAX_SCHEDULE_TIMEOUT;
6305 static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6308 long remaining_time;
6311 start_time = get_jiffies_64();
6312 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6314 set_current_state(TASK_INTERRUPTIBLE);
6315 while (!kthread_should_stop() && remaining_time > 0) {
6316 schedule_timeout(remaining_time);
6317 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6318 set_current_state(TASK_INTERRUPTIBLE);
6321 set_current_state(TASK_RUNNING);
6323 if (kthread_should_stop())
6326 kvm_recover_nx_lpages(kvm);
6330 int kvm_mmu_post_init_vm(struct kvm *kvm)
6334 err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6335 "kvm-nx-lpage-recovery",
6336 &kvm->arch.nx_lpage_recovery_thread);
6338 kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6343 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6345 if (kvm->arch.nx_lpage_recovery_thread)
6346 kthread_stop(kvm->arch.nx_lpage_recovery_thread);