1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * MMU support
8  *
9  * Copyright (C) 2006 Qumranet, Inc.
10  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11  *
12  * Authors:
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  *   Avi Kivity   <avi@qumranet.com>
15  *
16  * This work is licensed under the terms of the GNU GPL, version 2.  See
17  * the COPYING file in the top-level directory.
18  *
19  */
20
21 #include "irq.h"
22 #include "mmu.h"
23 #include "x86.h"
24 #include "kvm_cache_regs.h"
25 #include "cpuid.h"
26
27 #include <linux/kvm_host.h>
28 #include <linux/types.h>
29 #include <linux/string.h>
30 #include <linux/mm.h>
31 #include <linux/highmem.h>
32 #include <linux/moduleparam.h>
33 #include <linux/export.h>
34 #include <linux/swap.h>
35 #include <linux/hugetlb.h>
36 #include <linux/compiler.h>
37 #include <linux/srcu.h>
38 #include <linux/slab.h>
39 #include <linux/sched/signal.h>
40 #include <linux/uaccess.h>
41 #include <linux/hash.h>
42 #include <linux/kern_levels.h>
43
44 #include <asm/page.h>
45 #include <asm/pat.h>
46 #include <asm/cmpxchg.h>
47 #include <asm/e820/api.h>
48 #include <asm/io.h>
49 #include <asm/vmx.h>
50 #include <asm/kvm_page_track.h>
51 #include "trace.h"
52
53 /*
54  * When this variable is set to true it enables Two-Dimensional-Paging
55  * (TDP), where the hardware walks two page tables:
56  * 1. the guest-virtual to guest-physical translation
57  * 2. while doing 1., the guest-physical to host-physical translation
58  * If the hardware supports this, we don't need to do shadow paging.
59  */
60 bool tdp_enabled = false;
61
62 enum {
63         AUDIT_PRE_PAGE_FAULT,
64         AUDIT_POST_PAGE_FAULT,
65         AUDIT_PRE_PTE_WRITE,
66         AUDIT_POST_PTE_WRITE,
67         AUDIT_PRE_SYNC,
68         AUDIT_POST_SYNC
69 };
70
71 #undef MMU_DEBUG
72
73 #ifdef MMU_DEBUG
74 static bool dbg = 0;
75 module_param(dbg, bool, 0644);
76
77 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
78 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
79 #define MMU_WARN_ON(x) WARN_ON(x)
80 #else
81 #define pgprintk(x...) do { } while (0)
82 #define rmap_printk(x...) do { } while (0)
83 #define MMU_WARN_ON(x) do { } while (0)
84 #endif
85
86 #define PTE_PREFETCH_NUM                8
87
88 #define PT_FIRST_AVAIL_BITS_SHIFT 10
89 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
90
91 #define PT64_LEVEL_BITS 9
92
93 #define PT64_LEVEL_SHIFT(level) \
94                 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
95
96 #define PT64_INDEX(address, level)\
97         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
98
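/*
 * Illustrative sketch, not part of the original file: how the macros above
 * decompose a 48-bit canonical address, assuming PAGE_SHIFT == 12 and the
 * PT64_LEVEL_BITS == 9 defined here, so PT64_LEVEL_SHIFT(level) is 12, 21,
 * 30 and 39 for levels 1..4.  For addr == 0x00007f1234567000:
 *
 *	PT64_INDEX(addr, 4) == 0x0fe	(PML4 entry)
 *	PT64_INDEX(addr, 3) == 0x048	(PDPT entry)
 *	PT64_INDEX(addr, 2) == 0x1a2	(PD entry)
 *	PT64_INDEX(addr, 1) == 0x167	(PT entry)
 *
 * i.e. each level simply selects the next lower 9-bit slice of the address.
 */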
99
100 #define PT32_LEVEL_BITS 10
101
102 #define PT32_LEVEL_SHIFT(level) \
103                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
104
105 #define PT32_LVL_OFFSET_MASK(level) \
106         (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
107                                                 * PT32_LEVEL_BITS))) - 1))
108
109 #define PT32_INDEX(address, level)\
110         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
111
112
113 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
114 #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
115 #else
116 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
117 #endif
118 #define PT64_LVL_ADDR_MASK(level) \
119         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
120                                                 * PT64_LEVEL_BITS))) - 1))
121 #define PT64_LVL_OFFSET_MASK(level) \
122         (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
123                                                 * PT64_LEVEL_BITS))) - 1))
124
125 #define PT32_BASE_ADDR_MASK PAGE_MASK
126 #define PT32_DIR_BASE_ADDR_MASK \
127         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
128 #define PT32_LVL_ADDR_MASK(level) \
129         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
130                                             * PT32_LEVEL_BITS))) - 1))
131
132 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
133                         | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
134
135 #define ACC_EXEC_MASK    1
136 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
137 #define ACC_USER_MASK    PT_USER_MASK
138 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
139
140 /* The mask for the R/X bits in EPT PTEs */
141 #define PT64_EPT_READABLE_MASK                  0x1ull
142 #define PT64_EPT_EXECUTABLE_MASK                0x4ull
143
144 #include <trace/events/kvm.h>
145
146 #define CREATE_TRACE_POINTS
147 #include "mmutrace.h"
148
149 #define SPTE_HOST_WRITEABLE     (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
150 #define SPTE_MMU_WRITEABLE      (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
151
152 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
153
154 /* make pte_list_desc fit well in a cache line */
155 #define PTE_LIST_EXT 3
156
157 /*
158  * Return values of handle_mmio_page_fault and mmu.page_fault:
159  * RET_PF_RETRY: let CPU fault again on the address.
160  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
161  *
162  * For handle_mmio_page_fault only:
163  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
164  */
165 enum {
166         RET_PF_RETRY = 0,
167         RET_PF_EMULATE = 1,
168         RET_PF_INVALID = 2,
169 };
170
171 struct pte_list_desc {
172         u64 *sptes[PTE_LIST_EXT];
173         struct pte_list_desc *more;
174 };
175
176 struct kvm_shadow_walk_iterator {
177         u64 addr;
178         hpa_t shadow_addr;
179         u64 *sptep;
180         int level;
181         unsigned index;
182 };
183
184 static const union kvm_mmu_page_role mmu_base_role_mask = {
185         .cr0_wp = 1,
186         .gpte_is_8_bytes = 1,
187         .nxe = 1,
188         .smep_andnot_wp = 1,
189         .smap_andnot_wp = 1,
190         .smm = 1,
191         .guest_mode = 1,
192         .ad_disabled = 1,
193 };
194
195 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
196         for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
197                                          (_root), (_addr));                \
198              shadow_walk_okay(&(_walker));                                 \
199              shadow_walk_next(&(_walker)))
200
201 #define for_each_shadow_entry(_vcpu, _addr, _walker)            \
202         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
203              shadow_walk_okay(&(_walker));                      \
204              shadow_walk_next(&(_walker)))
205
206 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)     \
207         for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
208              shadow_walk_okay(&(_walker)) &&                            \
209                 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
210              __shadow_walk_next(&(_walker), spte))
211
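/*
 * Illustrative usage sketch, not part of the original file: a typical
 * caller of the iteration macros above walks the shadow page table for a
 * faulting address roughly as
 *
 *	struct kvm_shadow_walk_iterator iterator;
 *	u64 spte;
 *
 *	for_each_shadow_entry(vcpu, addr, iterator) {
 *		spte = *iterator.sptep;
 *		if (!is_shadow_present_pte(spte))
 *			break;
 *	}
 *
 * which is the pattern used by the page fault and MMIO probing paths later
 * in this file (the lockless variant additionally brackets the walk with
 * walk_shadow_page_lockless_begin/end).
 */
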
212 static struct kmem_cache *pte_list_desc_cache;
213 static struct kmem_cache *mmu_page_header_cache;
214 static struct percpu_counter kvm_total_used_mmu_pages;
215
216 static u64 __read_mostly shadow_nx_mask;
217 static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
218 static u64 __read_mostly shadow_user_mask;
219 static u64 __read_mostly shadow_accessed_mask;
220 static u64 __read_mostly shadow_dirty_mask;
221 static u64 __read_mostly shadow_mmio_mask;
222 static u64 __read_mostly shadow_mmio_value;
223 static u64 __read_mostly shadow_present_mask;
224 static u64 __read_mostly shadow_me_mask;
225
226 /*
227  * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
228  * Non-present SPTEs with shadow_acc_track_value set are in place for access
229  * tracking.
230  */
231 static u64 __read_mostly shadow_acc_track_mask;
232 static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
233
234 /*
235  * The mask/shift to use for saving the original R/X bits when marking the PTE
236  * as not-present for access tracking purposes. We do not save the W bit as the
237  * PTEs being access tracked also need to be dirty tracked, so the W bit will be
238  * restored only when a write is attempted to the page.
239  */
240 static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
241                                                     PT64_EPT_EXECUTABLE_MASK;
242 static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
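
/*
 * Illustrative sketch, not part of the original file: with the constants
 * above, shadow_acc_track_saved_bits_mask is 0x5 (the EPT R and X bits) and
 * the shift is 52, so mark_spte_for_access_track() below copies bit 0 (R)
 * into bit 52 and bit 2 (X) into bit 54, e.g. for an spte with R and X set:
 *
 *	(spte & shadow_acc_track_saved_bits_mask) << 52 == 0x0050000000000000
 *
 * restore_acc_track_spte() later shifts those bits back down; the W bit is
 * deliberately never saved and comes back only when a write to the page is
 * attempted.
 */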
243
244 /*
245  * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
246  * to guard against L1TF attacks.
247  */
248 static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
249
250 /*
251  * The number of high-order 1 bits to use in the mask above.
252  */
253 static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
254
255 /*
256  * In some cases, we need to preserve the GFN of a non-present or reserved
257  * SPTE when we usurp the upper five bits of the physical address space to
258  * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
259  * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
260  * left into the reserved bits, i.e. the GFN in the SPTE will be split into
261  * high and low parts.  This mask covers the lower bits of the GFN.
262  */
263 static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
264
265
266 static void mmu_spte_set(u64 *sptep, u64 spte);
267 static union kvm_mmu_page_role
268 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
269
270
271 static inline bool kvm_available_flush_tlb_with_range(void)
272 {
273         return kvm_x86_ops->tlb_remote_flush_with_range;
274 }
275
276 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
277                 struct kvm_tlb_range *range)
278 {
279         int ret = -ENOTSUPP;
280
281         if (range && kvm_x86_ops->tlb_remote_flush_with_range)
282                 ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range);
283
284         if (ret)
285                 kvm_flush_remote_tlbs(kvm);
286 }
287
288 static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
289                 u64 start_gfn, u64 pages)
290 {
291         struct kvm_tlb_range range;
292
293         range.start_gfn = start_gfn;
294         range.pages = pages;
295
296         kvm_flush_remote_tlbs_with_range(kvm, &range);
297 }
298
299 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
300 {
301         BUG_ON((mmio_mask & mmio_value) != mmio_value);
302         shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
303         shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
304 }
305 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
306
307 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
308 {
309         return sp->role.ad_disabled;
310 }
311
312 static inline bool spte_ad_enabled(u64 spte)
313 {
314         MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
315         return !(spte & shadow_acc_track_value);
316 }
317
318 static inline u64 spte_shadow_accessed_mask(u64 spte)
319 {
320         MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
321         return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
322 }
323
324 static inline u64 spte_shadow_dirty_mask(u64 spte)
325 {
326         MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
327         return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
328 }
329
330 static inline bool is_access_track_spte(u64 spte)
331 {
332         return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
333 }
334
335 /*
336  * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
337  * the memslots generation and is derived as follows:
338  *
339  * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
340  * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
341  *
342  * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
343  * the MMIO generation number, as doing so would require stealing a bit from
344  * the "real" generation number and thus effectively halve the maximum number
345  * of MMIO generations that can be handled before encountering a wrap (which
346  * requires a full MMU zap).  The flag is instead explicitly queried when
347  * checking for MMIO spte cache hits.
348  */
349 #define MMIO_SPTE_GEN_MASK              GENMASK_ULL(18, 0)
350
351 #define MMIO_SPTE_GEN_LOW_START         3
352 #define MMIO_SPTE_GEN_LOW_END           11
353 #define MMIO_SPTE_GEN_LOW_MASK          GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
354                                                     MMIO_SPTE_GEN_LOW_START)
355
356 #define MMIO_SPTE_GEN_HIGH_START        52
357 #define MMIO_SPTE_GEN_HIGH_END          61
358 #define MMIO_SPTE_GEN_HIGH_MASK         GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
359                                                     MMIO_SPTE_GEN_HIGH_START)
360 static u64 generation_mmio_spte_mask(u64 gen)
361 {
362         u64 mask;
363
364         WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
365
366         mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
367         mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
368         return mask;
369 }
370
371 static u64 get_mmio_spte_generation(u64 spte)
372 {
373         u64 gen;
374
375         spte &= ~shadow_mmio_mask;
376
377         gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
378         gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
379         return gen;
380 }
381
382 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
383                            unsigned access)
384 {
385         u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
386         u64 mask = generation_mmio_spte_mask(gen);
387         u64 gpa = gfn << PAGE_SHIFT;
388
389         access &= ACC_WRITE_MASK | ACC_USER_MASK;
390         mask |= shadow_mmio_value | access;
391         mask |= gpa | shadow_nonpresent_or_rsvd_mask;
392         mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
393                 << shadow_nonpresent_or_rsvd_mask_len;
394
395         page_header(__pa(sptep))->mmio_cached = true;
396
397         trace_mark_mmio_spte(sptep, gfn, access, gen);
398         mmu_spte_set(sptep, mask);
399 }
400
401 static bool is_mmio_spte(u64 spte)
402 {
403         return (spte & shadow_mmio_mask) == shadow_mmio_value;
404 }
405
406 static gfn_t get_mmio_spte_gfn(u64 spte)
407 {
408         u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
409
410         gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
411                & shadow_nonpresent_or_rsvd_mask;
412
413         return gpa >> PAGE_SHIFT;
414 }
415
416 static unsigned get_mmio_spte_access(u64 spte)
417 {
418         u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask;
419         return (spte & ~mask) & ~PAGE_MASK;
420 }
421
422 static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
423                           kvm_pfn_t pfn, unsigned access)
424 {
425         if (unlikely(is_noslot_pfn(pfn))) {
426                 mark_mmio_spte(vcpu, sptep, gfn, access);
427                 return true;
428         }
429
430         return false;
431 }
432
433 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
434 {
435         u64 kvm_gen, spte_gen, gen;
436
437         gen = kvm_vcpu_memslots(vcpu)->generation;
438         if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
439                 return false;
440
441         kvm_gen = gen & MMIO_SPTE_GEN_MASK;
442         spte_gen = get_mmio_spte_generation(spte);
443
444         trace_check_mmio_spte(spte, kvm_gen, spte_gen);
445         return likely(kvm_gen == spte_gen);
446 }
447
448 /*
449  * Sets the shadow PTE masks used by the MMU.
450  *
451  * Assumptions:
452  *  - Setting either @accessed_mask or @dirty_mask requires setting both
453  *  - At least one of @accessed_mask or @acc_track_mask must be set
454  */
455 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
456                 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
457                 u64 acc_track_mask, u64 me_mask)
458 {
459         BUG_ON(!dirty_mask != !accessed_mask);
460         BUG_ON(!accessed_mask && !acc_track_mask);
461         BUG_ON(acc_track_mask & shadow_acc_track_value);
462
463         shadow_user_mask = user_mask;
464         shadow_accessed_mask = accessed_mask;
465         shadow_dirty_mask = dirty_mask;
466         shadow_nx_mask = nx_mask;
467         shadow_x_mask = x_mask;
468         shadow_present_mask = p_mask;
469         shadow_acc_track_mask = acc_track_mask;
470         shadow_me_mask = me_mask;
471 }
472 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
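
/*
 * Illustrative sketch, not part of the original file: a hypothetical vendor
 * module call honouring the contract documented above.  The bit positions
 * are invented for the example (they resemble the legacy x86 PTE layout);
 * the real values are chosen by svm.c/vmx.c:
 *
 *	kvm_mmu_set_mask_ptes(1ull << 2,	// user
 *			      1ull << 5,	// accessed
 *			      1ull << 6,	// dirty
 *			      1ull << 63,	// nx
 *			      0ull,		// x
 *			      1ull << 0,	// present
 *			      0ull,		// acc_track (unused: A/D available)
 *			      0ull);		// me
 */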
473
474 static void kvm_mmu_reset_all_pte_masks(void)
475 {
476         u8 low_phys_bits;
477
478         shadow_user_mask = 0;
479         shadow_accessed_mask = 0;
480         shadow_dirty_mask = 0;
481         shadow_nx_mask = 0;
482         shadow_x_mask = 0;
483         shadow_mmio_mask = 0;
484         shadow_present_mask = 0;
485         shadow_acc_track_mask = 0;
486
487         /*
488          * If the CPU has 46 or less physical address bits, then set an
489          * appropriate mask to guard against L1TF attacks. Otherwise, it is
490          * assumed that the CPU is not vulnerable to L1TF.
491          *
492          * Some Intel CPUs address the L1 cache using more PA bits than are
493          * reported by CPUID. Use the PA width of the L1 cache when possible
494          * to achieve more effective mitigation, e.g. if system RAM overlaps
495          * the most significant bits of legal physical address space.
496          */
497         shadow_nonpresent_or_rsvd_mask = 0;
498         low_phys_bits = boot_cpu_data.x86_cache_bits;
499         if (boot_cpu_data.x86_cache_bits <
500             52 - shadow_nonpresent_or_rsvd_mask_len) {
501                 shadow_nonpresent_or_rsvd_mask =
502                         rsvd_bits(boot_cpu_data.x86_cache_bits -
503                                   shadow_nonpresent_or_rsvd_mask_len,
504                                   boot_cpu_data.x86_cache_bits - 1);
505                 low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
506         } else
507                 WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF));
508
509         shadow_nonpresent_or_rsvd_lower_gfn_mask =
510                 GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
511 }
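
/*
 * Illustrative worked example, not part of the original file: on a CPU
 * reporting boot_cpu_data.x86_cache_bits == 44 (less than 52 - 5 == 47),
 * the code above sets shadow_nonpresent_or_rsvd_mask to cover physical
 * address bits 39-43 and reduces low_phys_bits to 39, so
 * shadow_nonpresent_or_rsvd_lower_gfn_mask becomes GENMASK_ULL(38, 12),
 * i.e. the part of the GFN that can stay in place in an MMIO spte.
 */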
512
513 static int is_cpuid_PSE36(void)
514 {
515         return 1;
516 }
517
518 static int is_nx(struct kvm_vcpu *vcpu)
519 {
520         return vcpu->arch.efer & EFER_NX;
521 }
522
523 static int is_shadow_present_pte(u64 pte)
524 {
525         return (pte != 0) && !is_mmio_spte(pte);
526 }
527
528 static int is_large_pte(u64 pte)
529 {
530         return pte & PT_PAGE_SIZE_MASK;
531 }
532
533 static int is_last_spte(u64 pte, int level)
534 {
535         if (level == PT_PAGE_TABLE_LEVEL)
536                 return 1;
537         if (is_large_pte(pte))
538                 return 1;
539         return 0;
540 }
541
542 static bool is_executable_pte(u64 spte)
543 {
544         return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
545 }
546
547 static kvm_pfn_t spte_to_pfn(u64 pte)
548 {
549         return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
550 }
551
552 static gfn_t pse36_gfn_delta(u32 gpte)
553 {
554         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
555
556         return (gpte & PT32_DIR_PSE36_MASK) << shift;
557 }
558
559 #ifdef CONFIG_X86_64
560 static void __set_spte(u64 *sptep, u64 spte)
561 {
562         WRITE_ONCE(*sptep, spte);
563 }
564
565 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
566 {
567         WRITE_ONCE(*sptep, spte);
568 }
569
570 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
571 {
572         return xchg(sptep, spte);
573 }
574
575 static u64 __get_spte_lockless(u64 *sptep)
576 {
577         return READ_ONCE(*sptep);
578 }
579 #else
580 union split_spte {
581         struct {
582                 u32 spte_low;
583                 u32 spte_high;
584         };
585         u64 spte;
586 };
587
588 static void count_spte_clear(u64 *sptep, u64 spte)
589 {
590         struct kvm_mmu_page *sp =  page_header(__pa(sptep));
591
592         if (is_shadow_present_pte(spte))
593                 return;
594
595         /* Ensure the spte is completely set before we increase the count */
596         smp_wmb();
597         sp->clear_spte_count++;
598 }
599
600 static void __set_spte(u64 *sptep, u64 spte)
601 {
602         union split_spte *ssptep, sspte;
603
604         ssptep = (union split_spte *)sptep;
605         sspte = (union split_spte)spte;
606
607         ssptep->spte_high = sspte.spte_high;
608
609         /*
610          * If we map the spte from nonpresent to present, we should store
611          * the high bits first and only then set the present bit, so the CPU
612          * cannot fetch this spte while we are still setting it.
613          */
614         smp_wmb();
615
616         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
617 }
618
619 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
620 {
621         union split_spte *ssptep, sspte;
622
623         ssptep = (union split_spte *)sptep;
624         sspte = (union split_spte)spte;
625
626         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
627
628         /*
629          * If we map the spte from present to nonpresent, we should clear
630          * the present bit first so that a vcpu cannot fetch the stale high bits.
631          */
632         smp_wmb();
633
634         ssptep->spte_high = sspte.spte_high;
635         count_spte_clear(sptep, spte);
636 }
637
638 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
639 {
640         union split_spte *ssptep, sspte, orig;
641
642         ssptep = (union split_spte *)sptep;
643         sspte = (union split_spte)spte;
644
645         /* xchg acts as a barrier before the setting of the high bits */
646         orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
647         orig.spte_high = ssptep->spte_high;
648         ssptep->spte_high = sspte.spte_high;
649         count_spte_clear(sptep, spte);
650
651         return orig.spte;
652 }
653
654 /*
655  * The idea of using a lightweight way to read the spte on x86_32 guests
656  * comes from gup_get_pte(arch/x86/mm/gup.c).
657  *
658  * An spte TLB flush may be pending, because kvm_set_pte_rmapp
659  * coalesces them and we are running outside of the MMU lock.  Therefore
660  * we need to protect against in-progress updates of the spte.
661  *
662  * Reading the spte while an update is in progress may get the old value
663  * for the high part of the spte.  The race is fine for a present->non-present
664  * change (because the high part of the spte is ignored for non-present spte),
665  * but for a present->present change we must reread the spte.
666  *
667  * All such changes are done in two steps (present->non-present and
668  * non-present->present), hence it is enough to count the number of
669  * present->non-present updates: if it changed while reading the spte,
670  * we might have hit the race.  This is done using clear_spte_count.
671  */
672 static u64 __get_spte_lockless(u64 *sptep)
673 {
674         struct kvm_mmu_page *sp =  page_header(__pa(sptep));
675         union split_spte spte, *orig = (union split_spte *)sptep;
676         int count;
677
678 retry:
679         count = sp->clear_spte_count;
680         smp_rmb();
681
682         spte.spte_low = orig->spte_low;
683         smp_rmb();
684
685         spte.spte_high = orig->spte_high;
686         smp_rmb();
687
688         if (unlikely(spte.spte_low != orig->spte_low ||
689               count != sp->clear_spte_count))
690                 goto retry;
691
692         return spte.spte;
693 }
694 #endif
695
696 static bool spte_can_locklessly_be_made_writable(u64 spte)
697 {
698         return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
699                 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
700 }
701
702 static bool spte_has_volatile_bits(u64 spte)
703 {
704         if (!is_shadow_present_pte(spte))
705                 return false;
706
707         /*
708          * Always atomically update the spte if it can be updated
709          * outside of the mmu-lock: this ensures the dirty bit is not lost
710          * and also gives us a stable is_writable_pte(), so that a needed
711          * TLB flush is not missed.
712          */
713         if (spte_can_locklessly_be_made_writable(spte) ||
714             is_access_track_spte(spte))
715                 return true;
716
717         if (spte_ad_enabled(spte)) {
718                 if ((spte & shadow_accessed_mask) == 0 ||
719                     (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
720                         return true;
721         }
722
723         return false;
724 }
725
726 static bool is_accessed_spte(u64 spte)
727 {
728         u64 accessed_mask = spte_shadow_accessed_mask(spte);
729
730         return accessed_mask ? spte & accessed_mask
731                              : !is_access_track_spte(spte);
732 }
733
734 static bool is_dirty_spte(u64 spte)
735 {
736         u64 dirty_mask = spte_shadow_dirty_mask(spte);
737
738         return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
739 }
740
741 /* Rules for using mmu_spte_set:
742  * Set the sptep from nonpresent to present.
743  * Note: the sptep being assigned *must* be either not present
744  * or in a state where the hardware will not attempt to update
745  * the spte.
746  */
747 static void mmu_spte_set(u64 *sptep, u64 new_spte)
748 {
749         WARN_ON(is_shadow_present_pte(*sptep));
750         __set_spte(sptep, new_spte);
751 }
752
753 /*
754  * Update the SPTE (excluding the PFN), but do not track changes in its
755  * accessed/dirty status.
756  */
757 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
758 {
759         u64 old_spte = *sptep;
760
761         WARN_ON(!is_shadow_present_pte(new_spte));
762
763         if (!is_shadow_present_pte(old_spte)) {
764                 mmu_spte_set(sptep, new_spte);
765                 return old_spte;
766         }
767
768         if (!spte_has_volatile_bits(old_spte))
769                 __update_clear_spte_fast(sptep, new_spte);
770         else
771                 old_spte = __update_clear_spte_slow(sptep, new_spte);
772
773         WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
774
775         return old_spte;
776 }
777
778 /* Rules for using mmu_spte_update:
779  * Update the state bits only; the mapped pfn must not change.
780  *
781  * Whenever we overwrite a writable spte with a read-only one we
782  * should flush remote TLBs.  Otherwise rmap_write_protect
783  * will find a read-only spte, even though the writable spte
784  * might still be cached in a CPU's TLB; the return value indicates
785  * this case.
786  *
787  * Returns true if the TLB needs to be flushed.
788  */
789 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
790 {
791         bool flush = false;
792         u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
793
794         if (!is_shadow_present_pte(old_spte))
795                 return false;
796
797         /*
798          * Updating the spte outside of the mmu-lock is safe, since
799          * we always update it atomically; see the comments in
800          * spte_has_volatile_bits().
801          */
802         if (spte_can_locklessly_be_made_writable(old_spte) &&
803               !is_writable_pte(new_spte))
804                 flush = true;
805
806         /*
807          * Flush TLB when accessed/dirty states are changed in the page tables,
808          * to guarantee consistency between TLB and page tables.
809          */
810
811         if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
812                 flush = true;
813                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
814         }
815
816         if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
817                 flush = true;
818                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
819         }
820
821         return flush;
822 }
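
/*
 * Illustrative usage sketch, not part of the original file: callers that
 * downgrade permissions are expected to honour the return value, e.g.
 *
 *	if (mmu_spte_update(sptep, new_spte))
 *		kvm_flush_remote_tlbs(kvm);
 *
 * The write-protection and dirty-clearing helpers later in this file do
 * exactly this, accumulating the result into a "flush" boolean.
 */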
823
824 /*
825  * Rules for using mmu_spte_clear_track_bits:
826  * It sets the sptep from present to nonpresent and tracks the
827  * state bits; it is used to clear a last-level sptep.
828  * Returns non-zero if the PTE was previously valid.
829  */
830 static int mmu_spte_clear_track_bits(u64 *sptep)
831 {
832         kvm_pfn_t pfn;
833         u64 old_spte = *sptep;
834
835         if (!spte_has_volatile_bits(old_spte))
836                 __update_clear_spte_fast(sptep, 0ull);
837         else
838                 old_spte = __update_clear_spte_slow(sptep, 0ull);
839
840         if (!is_shadow_present_pte(old_spte))
841                 return 0;
842
843         pfn = spte_to_pfn(old_spte);
844
845         /*
846          * KVM does not hold a refcount on the pages used by the
847          * kvm mmu, so before such a page is reclaimed it must be
848          * unmapped from the mmu first.
849          */
850         WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
851
852         if (is_accessed_spte(old_spte))
853                 kvm_set_pfn_accessed(pfn);
854
855         if (is_dirty_spte(old_spte))
856                 kvm_set_pfn_dirty(pfn);
857
858         return 1;
859 }
860
861 /*
862  * Rules for using mmu_spte_clear_no_track:
863  * Directly clear the spte without caring about the state bits of the sptep;
864  * it is used to set an upper-level spte.
865  */
866 static void mmu_spte_clear_no_track(u64 *sptep)
867 {
868         __update_clear_spte_fast(sptep, 0ull);
869 }
870
871 static u64 mmu_spte_get_lockless(u64 *sptep)
872 {
873         return __get_spte_lockless(sptep);
874 }
875
876 static u64 mark_spte_for_access_track(u64 spte)
877 {
878         if (spte_ad_enabled(spte))
879                 return spte & ~shadow_accessed_mask;
880
881         if (is_access_track_spte(spte))
882                 return spte;
883
884         /*
885          * Making an Access Tracking PTE will result in removal of write access
886          * from the PTE. So, verify that we will be able to restore the write
887          * access in the fast page fault path later on.
888          */
889         WARN_ONCE((spte & PT_WRITABLE_MASK) &&
890                   !spte_can_locklessly_be_made_writable(spte),
891                   "kvm: Writable SPTE is not locklessly dirty-trackable\n");
892
893         WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
894                           shadow_acc_track_saved_bits_shift),
895                   "kvm: Access Tracking saved bit locations are not zero\n");
896
897         spte |= (spte & shadow_acc_track_saved_bits_mask) <<
898                 shadow_acc_track_saved_bits_shift;
899         spte &= ~shadow_acc_track_mask;
900
901         return spte;
902 }
903
904 /* Restore an acc-track PTE back to a regular PTE */
905 static u64 restore_acc_track_spte(u64 spte)
906 {
907         u64 new_spte = spte;
908         u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
909                          & shadow_acc_track_saved_bits_mask;
910
911         WARN_ON_ONCE(spte_ad_enabled(spte));
912         WARN_ON_ONCE(!is_access_track_spte(spte));
913
914         new_spte &= ~shadow_acc_track_mask;
915         new_spte &= ~(shadow_acc_track_saved_bits_mask <<
916                       shadow_acc_track_saved_bits_shift);
917         new_spte |= saved_bits;
918
919         return new_spte;
920 }
921
922 /* Returns the Accessed status of the PTE and resets it at the same time. */
923 static bool mmu_spte_age(u64 *sptep)
924 {
925         u64 spte = mmu_spte_get_lockless(sptep);
926
927         if (!is_accessed_spte(spte))
928                 return false;
929
930         if (spte_ad_enabled(spte)) {
931                 clear_bit((ffs(shadow_accessed_mask) - 1),
932                           (unsigned long *)sptep);
933         } else {
934                 /*
935                  * Capture the dirty status of the page, so that it doesn't get
936                  * lost when the SPTE is marked for access tracking.
937                  */
938                 if (is_writable_pte(spte))
939                         kvm_set_pfn_dirty(spte_to_pfn(spte));
940
941                 spte = mark_spte_for_access_track(spte);
942                 mmu_spte_update_no_track(sptep, spte);
943         }
944
945         return true;
946 }
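
/*
 * Illustrative usage sketch, not part of the original file: the MMU
 * notifier aging path walks a gfn's rmap chain and ORs the results,
 * roughly
 *
 *	young |= mmu_spte_age(sptep);
 *
 * for each spte, so a page is reported young if any spte mapping it was
 * recently accessed.
 */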
947
948 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
949 {
950         /*
951          * Prevent page table teardown by making any free-er wait during
952          * kvm_flush_remote_tlbs() IPI to all active vcpus.
953          */
954         local_irq_disable();
955
956         /*
957          * Make sure a following spte read is not reordered ahead of the write
958          * to vcpu->mode.
959          */
960         smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
961 }
962
963 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
964 {
965         /*
966          * Make sure the write to vcpu->mode is not reordered in front of
967          * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
968          * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
969          */
970         smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
971         local_irq_enable();
972 }
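
/*
 * Illustrative usage sketch, not part of the original file: lockless
 * walkers bracket the shadow walk with the two helpers above, e.g.
 *
 *	walk_shadow_page_lockless_begin(vcpu);
 *	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
 *		if (!is_shadow_present_pte(spte))
 *			break;
 *	walk_shadow_page_lockless_end(vcpu);
 *
 * which is the pattern the MMIO probe and fast page fault paths use later
 * in this file.
 */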
973
974 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
975                                   struct kmem_cache *base_cache, int min)
976 {
977         void *obj;
978
979         if (cache->nobjs >= min)
980                 return 0;
981         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
982                 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
983                 if (!obj)
984                         return cache->nobjs >= min ? 0 : -ENOMEM;
985                 cache->objects[cache->nobjs++] = obj;
986         }
987         return 0;
988 }
989
990 static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
991 {
992         return cache->nobjs;
993 }
994
995 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
996                                   struct kmem_cache *cache)
997 {
998         while (mc->nobjs)
999                 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
1000 }
1001
1002 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
1003                                        int min)
1004 {
1005         void *page;
1006
1007         if (cache->nobjs >= min)
1008                 return 0;
1009         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1010                 page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
1011                 if (!page)
1012                         return cache->nobjs >= min ? 0 : -ENOMEM;
1013                 cache->objects[cache->nobjs++] = page;
1014         }
1015         return 0;
1016 }
1017
1018 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
1019 {
1020         while (mc->nobjs)
1021                 free_page((unsigned long)mc->objects[--mc->nobjs]);
1022 }
1023
1024 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
1025 {
1026         int r;
1027
1028         r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1029                                    pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
1030         if (r)
1031                 goto out;
1032         r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
1033         if (r)
1034                 goto out;
1035         r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
1036                                    mmu_page_header_cache, 4);
1037 out:
1038         return r;
1039 }
1040
1041 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1042 {
1043         mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1044                                 pte_list_desc_cache);
1045         mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
1046         mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
1047                                 mmu_page_header_cache);
1048 }
1049
1050 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
1051 {
1052         void *p;
1053
1054         BUG_ON(!mc->nobjs);
1055         p = mc->objects[--mc->nobjs];
1056         return p;
1057 }
1058
1059 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
1060 {
1061         return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
1062 }
1063
1064 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
1065 {
1066         kmem_cache_free(pte_list_desc_cache, pte_list_desc);
1067 }
1068
1069 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
1070 {
1071         if (!sp->role.direct)
1072                 return sp->gfns[index];
1073
1074         return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
1075 }
1076
1077 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
1078 {
1079         if (sp->role.direct)
1080                 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
1081         else
1082                 sp->gfns[index] = gfn;
1083 }
1084
1085 /*
1086  * Return the pointer to the large page information for a given gfn,
1087  * handling slots that are not large page aligned.
1088  */
1089 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
1090                                               struct kvm_memory_slot *slot,
1091                                               int level)
1092 {
1093         unsigned long idx;
1094
1095         idx = gfn_to_index(gfn, slot->base_gfn, level);
1096         return &slot->arch.lpage_info[level - 2][idx];
1097 }
1098
1099 static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
1100                                             gfn_t gfn, int count)
1101 {
1102         struct kvm_lpage_info *linfo;
1103         int i;
1104
1105         for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1106                 linfo = lpage_info_slot(gfn, slot, i);
1107                 linfo->disallow_lpage += count;
1108                 WARN_ON(linfo->disallow_lpage < 0);
1109         }
1110 }
1111
1112 void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1113 {
1114         update_gfn_disallow_lpage_count(slot, gfn, 1);
1115 }
1116
1117 void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1118 {
1119         update_gfn_disallow_lpage_count(slot, gfn, -1);
1120 }
1121
1122 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1123 {
1124         struct kvm_memslots *slots;
1125         struct kvm_memory_slot *slot;
1126         gfn_t gfn;
1127
1128         kvm->arch.indirect_shadow_pages++;
1129         gfn = sp->gfn;
1130         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1131         slot = __gfn_to_memslot(slots, gfn);
1132
1133         /* Non-leaf shadow pages are kept read-only. */
1134         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1135                 return kvm_slot_page_track_add_page(kvm, slot, gfn,
1136                                                     KVM_PAGE_TRACK_WRITE);
1137
1138         kvm_mmu_gfn_disallow_lpage(slot, gfn);
1139 }
1140
1141 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1142 {
1143         struct kvm_memslots *slots;
1144         struct kvm_memory_slot *slot;
1145         gfn_t gfn;
1146
1147         kvm->arch.indirect_shadow_pages--;
1148         gfn = sp->gfn;
1149         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1150         slot = __gfn_to_memslot(slots, gfn);
1151         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1152                 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
1153                                                        KVM_PAGE_TRACK_WRITE);
1154
1155         kvm_mmu_gfn_allow_lpage(slot, gfn);
1156 }
1157
1158 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
1159                                           struct kvm_memory_slot *slot)
1160 {
1161         struct kvm_lpage_info *linfo;
1162
1163         if (slot) {
1164                 linfo = lpage_info_slot(gfn, slot, level);
1165                 return !!linfo->disallow_lpage;
1166         }
1167
1168         return true;
1169 }
1170
1171 static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
1172                                         int level)
1173 {
1174         struct kvm_memory_slot *slot;
1175
1176         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1177         return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
1178 }
1179
1180 static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
1181 {
1182         unsigned long page_size;
1183         int i, ret = 0;
1184
1185         page_size = kvm_host_page_size(kvm, gfn);
1186
1187         for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1188                 if (page_size >= KVM_HPAGE_SIZE(i))
1189                         ret = i;
1190                 else
1191                         break;
1192         }
1193
1194         return ret;
1195 }
1196
1197 static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
1198                                           bool no_dirty_log)
1199 {
1200         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1201                 return false;
1202         if (no_dirty_log && slot->dirty_bitmap)
1203                 return false;
1204
1205         return true;
1206 }
1207
1208 static struct kvm_memory_slot *
1209 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
1210                             bool no_dirty_log)
1211 {
1212         struct kvm_memory_slot *slot;
1213
1214         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1215         if (!memslot_valid_for_gpte(slot, no_dirty_log))
1216                 slot = NULL;
1217
1218         return slot;
1219 }
1220
1221 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
1222                          bool *force_pt_level)
1223 {
1224         int host_level, level, max_level;
1225         struct kvm_memory_slot *slot;
1226
1227         if (unlikely(*force_pt_level))
1228                 return PT_PAGE_TABLE_LEVEL;
1229
1230         slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
1231         *force_pt_level = !memslot_valid_for_gpte(slot, true);
1232         if (unlikely(*force_pt_level))
1233                 return PT_PAGE_TABLE_LEVEL;
1234
1235         host_level = host_mapping_level(vcpu->kvm, large_gfn);
1236
1237         if (host_level == PT_PAGE_TABLE_LEVEL)
1238                 return host_level;
1239
1240         max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
1241
1242         for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
1243                 if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
1244                         break;
1245
1246         return level - 1;
1247 }
1248
1249 /*
1250  * About rmap_head encoding:
1251  *
1252  * If the bit zero of rmap_head->val is clear, then it points to the only spte
1253  * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
1254  * pte_list_desc containing more mappings.
1255  */
1256
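/*
 * Illustrative sketch, not part of the original file: a hypothetical helper
 * that counts the sptes in an rmap chain, included only to demonstrate the
 * bit-zero encoding described above.
 */
static inline int pte_list_count_example(struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	int i, count = 0;

	if (!rmap_head->val)			/* empty chain */
		return 0;

	if (!(rmap_head->val & 1))		/* bit 0 clear: val is the lone sptep */
		return 1;

	/* bit 0 set: val (minus the tag bit) points to a pte_list_desc chain */
	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	while (desc) {
		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
			++count;
		desc = desc->more;
	}
	return count;
}
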
1257 /*
1258  * Returns the number of pointers in the rmap chain, not counting the new one.
1259  */
1260 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
1261                         struct kvm_rmap_head *rmap_head)
1262 {
1263         struct pte_list_desc *desc;
1264         int i, count = 0;
1265
1266         if (!rmap_head->val) {
1267                 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
1268                 rmap_head->val = (unsigned long)spte;
1269         } else if (!(rmap_head->val & 1)) {
1270                 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
1271                 desc = mmu_alloc_pte_list_desc(vcpu);
1272                 desc->sptes[0] = (u64 *)rmap_head->val;
1273                 desc->sptes[1] = spte;
1274                 rmap_head->val = (unsigned long)desc | 1;
1275                 ++count;
1276         } else {
1277                 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
1278                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1279                 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
1280                         desc = desc->more;
1281                         count += PTE_LIST_EXT;
1282                 }
1283                 if (desc->sptes[PTE_LIST_EXT-1]) {
1284                         desc->more = mmu_alloc_pte_list_desc(vcpu);
1285                         desc = desc->more;
1286                 }
1287                 for (i = 0; desc->sptes[i]; ++i)
1288                         ++count;
1289                 desc->sptes[i] = spte;
1290         }
1291         return count;
1292 }
1293
1294 static void
1295 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
1296                            struct pte_list_desc *desc, int i,
1297                            struct pte_list_desc *prev_desc)
1298 {
1299         int j;
1300
1301         for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
1302                 ;
1303         desc->sptes[i] = desc->sptes[j];
1304         desc->sptes[j] = NULL;
1305         if (j != 0)
1306                 return;
1307         if (!prev_desc && !desc->more)
1308                 rmap_head->val = (unsigned long)desc->sptes[0];
1309         else
1310                 if (prev_desc)
1311                         prev_desc->more = desc->more;
1312                 else
1313                         rmap_head->val = (unsigned long)desc->more | 1;
1314         mmu_free_pte_list_desc(desc);
1315 }
1316
1317 static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
1318 {
1319         struct pte_list_desc *desc;
1320         struct pte_list_desc *prev_desc;
1321         int i;
1322
1323         if (!rmap_head->val) {
1324                 pr_err("%s: %p 0->BUG\n", __func__, spte);
1325                 BUG();
1326         } else if (!(rmap_head->val & 1)) {
1327                 rmap_printk("%s:  %p 1->0\n", __func__, spte);
1328                 if ((u64 *)rmap_head->val != spte) {
1329                         pr_err("%s:  %p 1->BUG\n", __func__, spte);
1330                         BUG();
1331                 }
1332                 rmap_head->val = 0;
1333         } else {
1334                 rmap_printk("%s:  %p many->many\n", __func__, spte);
1335                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1336                 prev_desc = NULL;
1337                 while (desc) {
1338                         for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
1339                                 if (desc->sptes[i] == spte) {
1340                                         pte_list_desc_remove_entry(rmap_head,
1341                                                         desc, i, prev_desc);
1342                                         return;
1343                                 }
1344                         }
1345                         prev_desc = desc;
1346                         desc = desc->more;
1347                 }
1348                 pr_err("%s: %p many->many\n", __func__, spte);
1349                 BUG();
1350         }
1351 }
1352
1353 static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
1354 {
1355         mmu_spte_clear_track_bits(sptep);
1356         __pte_list_remove(sptep, rmap_head);
1357 }
1358
1359 static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
1360                                            struct kvm_memory_slot *slot)
1361 {
1362         unsigned long idx;
1363
1364         idx = gfn_to_index(gfn, slot->base_gfn, level);
1365         return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
1366 }
1367
1368 static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
1369                                          struct kvm_mmu_page *sp)
1370 {
1371         struct kvm_memslots *slots;
1372         struct kvm_memory_slot *slot;
1373
1374         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1375         slot = __gfn_to_memslot(slots, gfn);
1376         return __gfn_to_rmap(gfn, sp->role.level, slot);
1377 }
1378
1379 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1380 {
1381         struct kvm_mmu_memory_cache *cache;
1382
1383         cache = &vcpu->arch.mmu_pte_list_desc_cache;
1384         return mmu_memory_cache_free_objects(cache);
1385 }
1386
1387 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1388 {
1389         struct kvm_mmu_page *sp;
1390         struct kvm_rmap_head *rmap_head;
1391
1392         sp = page_header(__pa(spte));
1393         kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1394         rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1395         return pte_list_add(vcpu, spte, rmap_head);
1396 }
1397
1398 static void rmap_remove(struct kvm *kvm, u64 *spte)
1399 {
1400         struct kvm_mmu_page *sp;
1401         gfn_t gfn;
1402         struct kvm_rmap_head *rmap_head;
1403
1404         sp = page_header(__pa(spte));
1405         gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1406         rmap_head = gfn_to_rmap(kvm, gfn, sp);
1407         __pte_list_remove(spte, rmap_head);
1408 }
1409
1410 /*
1411  * Used by the following functions to iterate through the sptes linked by an
1412  * rmap.  All fields are private and should not be used outside.
1413  */
1414 struct rmap_iterator {
1415         /* private fields */
1416         struct pte_list_desc *desc;     /* holds the sptep if not NULL */
1417         int pos;                        /* index of the sptep */
1418 };
1419
1420 /*
1421  * Iteration must be started by this function.  This should also be used after
1422  * removing/dropping sptes from the rmap link because in such cases the
1423  * information in the iterator may not be valid.
1424  *
1425  * Returns sptep if found, NULL otherwise.
1426  */
1427 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1428                            struct rmap_iterator *iter)
1429 {
1430         u64 *sptep;
1431
1432         if (!rmap_head->val)
1433                 return NULL;
1434
1435         if (!(rmap_head->val & 1)) {
1436                 iter->desc = NULL;
1437                 sptep = (u64 *)rmap_head->val;
1438                 goto out;
1439         }
1440
1441         iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1442         iter->pos = 0;
1443         sptep = iter->desc->sptes[iter->pos];
1444 out:
1445         BUG_ON(!is_shadow_present_pte(*sptep));
1446         return sptep;
1447 }
1448
1449 /*
1450  * Must be used with a valid iterator: e.g. after rmap_get_first().
1451  *
1452  * Returns sptep if found, NULL otherwise.
1453  */
1454 static u64 *rmap_get_next(struct rmap_iterator *iter)
1455 {
1456         u64 *sptep;
1457
1458         if (iter->desc) {
1459                 if (iter->pos < PTE_LIST_EXT - 1) {
1460                         ++iter->pos;
1461                         sptep = iter->desc->sptes[iter->pos];
1462                         if (sptep)
1463                                 goto out;
1464                 }
1465
1466                 iter->desc = iter->desc->more;
1467
1468                 if (iter->desc) {
1469                         iter->pos = 0;
1470                         /* desc->sptes[0] cannot be NULL */
1471                         sptep = iter->desc->sptes[iter->pos];
1472                         goto out;
1473                 }
1474         }
1475
1476         return NULL;
1477 out:
1478         BUG_ON(!is_shadow_present_pte(*sptep));
1479         return sptep;
1480 }
1481
1482 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)                 \
1483         for (_spte_ = rmap_get_first(_rmap_head_, _iter_);              \
1484              _spte_; _spte_ = rmap_get_next(_iter_))
1485
1486 static void drop_spte(struct kvm *kvm, u64 *sptep)
1487 {
1488         if (mmu_spte_clear_track_bits(sptep))
1489                 rmap_remove(kvm, sptep);
1490 }
1491
1492
1493 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1494 {
1495         if (is_large_pte(*sptep)) {
1496                 WARN_ON(page_header(__pa(sptep))->role.level ==
1497                         PT_PAGE_TABLE_LEVEL);
1498                 drop_spte(kvm, sptep);
1499                 --kvm->stat.lpages;
1500                 return true;
1501         }
1502
1503         return false;
1504 }
1505
1506 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1507 {
1508         if (__drop_large_spte(vcpu->kvm, sptep)) {
1509                 struct kvm_mmu_page *sp = page_header(__pa(sptep));
1510
1511                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1512                         KVM_PAGES_PER_HPAGE(sp->role.level));
1513         }
1514 }
1515
1516 /*
1517  * Write-protect the specified @sptep; @pt_protect indicates whether the
1518  * spte write-protection is caused by protecting the shadow page table.
1519  *
1520  * Note: write protection differs between dirty logging and spte
1521  * protection:
1522  * - for dirty logging, the spte can be made writable at any time if
1523  *   its dirty bitmap is properly set.
1524  * - for spte protection, the spte can be made writable only after
1525  *   unsync-ing the shadow page.
1526  *
1527  * Return true if the TLB needs to be flushed.
1528  */
1529 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1530 {
1531         u64 spte = *sptep;
1532
1533         if (!is_writable_pte(spte) &&
1534               !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1535                 return false;
1536
1537         rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1538
1539         if (pt_protect)
1540                 spte &= ~SPTE_MMU_WRITEABLE;
1541         spte = spte & ~PT_WRITABLE_MASK;
1542
1543         return mmu_spte_update(sptep, spte);
1544 }
1545
1546 static bool __rmap_write_protect(struct kvm *kvm,
1547                                  struct kvm_rmap_head *rmap_head,
1548                                  bool pt_protect)
1549 {
1550         u64 *sptep;
1551         struct rmap_iterator iter;
1552         bool flush = false;
1553
1554         for_each_rmap_spte(rmap_head, &iter, sptep)
1555                 flush |= spte_write_protect(sptep, pt_protect);
1556
1557         return flush;
1558 }
1559
1560 static bool spte_clear_dirty(u64 *sptep)
1561 {
1562         u64 spte = *sptep;
1563
1564         rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1565
1566         spte &= ~shadow_dirty_mask;
1567
1568         return mmu_spte_update(sptep, spte);
1569 }
1570
1571 static bool wrprot_ad_disabled_spte(u64 *sptep)
1572 {
1573         bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1574                                                (unsigned long *)sptep);
1575         if (was_writable)
1576                 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1577
1578         return was_writable;
1579 }
1580
1581 /*
1582  * Gets the GFN ready for another round of dirty logging by clearing the
1583  *      - D bit on ad-enabled SPTEs, and
1584  *      - W bit on ad-disabled SPTEs.
1585  * Returns true iff any D or W bits were cleared.
1586  */
1587 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1588 {
1589         u64 *sptep;
1590         struct rmap_iterator iter;
1591         bool flush = false;
1592
1593         for_each_rmap_spte(rmap_head, &iter, sptep)
1594                 if (spte_ad_enabled(*sptep))
1595                         flush |= spte_clear_dirty(sptep);
1596                 else
1597                         flush |= wrprot_ad_disabled_spte(sptep);
1598
1599         return flush;
1600 }
1601
1602 static bool spte_set_dirty(u64 *sptep)
1603 {
1604         u64 spte = *sptep;
1605
1606         rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1607
1608         spte |= shadow_dirty_mask;
1609
1610         return mmu_spte_update(sptep, spte);
1611 }
1612
1613 static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1614 {
1615         u64 *sptep;
1616         struct rmap_iterator iter;
1617         bool flush = false;
1618
1619         for_each_rmap_spte(rmap_head, &iter, sptep)
1620                 if (spte_ad_enabled(*sptep))
1621                         flush |= spte_set_dirty(sptep);
1622
1623         return flush;
1624 }
1625
1626 /**
1627  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1628  * @kvm: kvm instance
1629  * @slot: slot to protect
1630  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1631  * @mask: indicates which pages we should protect
1632  *
1633  * Used when we do not need to care about huge page mappings: e.g. during dirty
1634  * logging we do not have any such mappings.
1635  */
1636 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1637                                      struct kvm_memory_slot *slot,
1638                                      gfn_t gfn_offset, unsigned long mask)
1639 {
1640         struct kvm_rmap_head *rmap_head;
1641
1642         while (mask) {
1643                 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1644                                           PT_PAGE_TABLE_LEVEL, slot);
1645                 __rmap_write_protect(kvm, rmap_head, false);
1646
1647                 /* clear the first set bit */
1648                 mask &= mask - 1;
1649         }
1650 }
1651
1652 /**
1653  * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1654  * protect the page if the D-bit isn't supported.
1655  * @kvm: kvm instance
1656  * @slot: slot to clear the D-bit for
1657  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1658  * @mask: indicates which pages we should clear the D-bit for
1659  *
1660  * Used for PML to re-log the dirty GPAs after userspace queries the dirty_bitmap.
1661  */
1662 void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1663                                      struct kvm_memory_slot *slot,
1664                                      gfn_t gfn_offset, unsigned long mask)
1665 {
1666         struct kvm_rmap_head *rmap_head;
1667
1668         while (mask) {
1669                 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1670                                           PT_PAGE_TABLE_LEVEL, slot);
1671                 __rmap_clear_dirty(kvm, rmap_head);
1672
1673                 /* clear the first set bit */
1674                 mask &= mask - 1;
1675         }
1676 }
1677 EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1678
1679 /**
1680  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1681  * PT level pages.
1682  *
1683  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1684  * enable dirty logging for them.
1685  *
1686  * Used when we do not need to care about huge page mappings: e.g. during dirty
1687  * logging we do not have any such mappings.
1688  */
1689 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1690                                 struct kvm_memory_slot *slot,
1691                                 gfn_t gfn_offset, unsigned long mask)
1692 {
1693         if (kvm_x86_ops->enable_log_dirty_pt_masked)
1694                 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1695                                 mask);
1696         else
1697                 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1698 }
1699
1700 /**
1701  * kvm_arch_write_log_dirty - emulate dirty page logging
1702  * @vcpu: Guest mode vcpu
1703  *
1704  * Emulate arch-specific page modification logging for the
1705  * nested hypervisor.
1706  */
1707 int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
1708 {
1709         if (kvm_x86_ops->write_log_dirty)
1710                 return kvm_x86_ops->write_log_dirty(vcpu);
1711
1712         return 0;
1713 }
1714
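/*
 * Write-protect every spte that maps @gfn in @slot, at all mapping levels.
 * Returns true if any spte was write-protected, i.e. the caller may need
 * to flush TLBs.
 */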
1715 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1716                                     struct kvm_memory_slot *slot, u64 gfn)
1717 {
1718         struct kvm_rmap_head *rmap_head;
1719         int i;
1720         bool write_protected = false;
1721
1722         for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1723                 rmap_head = __gfn_to_rmap(gfn, i, slot);
1724                 write_protected |= __rmap_write_protect(kvm, rmap_head, true);
1725         }
1726
1727         return write_protected;
1728 }
1729
1730 static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
1731 {
1732         struct kvm_memory_slot *slot;
1733
1734         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1735         return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
1736 }
1737
1738 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1739 {
1740         u64 *sptep;
1741         struct rmap_iterator iter;
1742         bool flush = false;
1743
1744         while ((sptep = rmap_get_first(rmap_head, &iter))) {
1745                 rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
1746
1747                 pte_list_remove(rmap_head, sptep);
1748                 flush = true;
1749         }
1750
1751         return flush;
1752 }
1753
1754 static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1755                            struct kvm_memory_slot *slot, gfn_t gfn, int level,
1756                            unsigned long data)
1757 {
1758         return kvm_zap_rmapp(kvm, rmap_head);
1759 }
1760
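/*
 * Handler used by kvm_set_spte_hva(): the host PTE backing this gfn has
 * changed.  If the new host PTE is writable, simply zap the existing sptes
 * and let the guest refault; otherwise re-point each spte at the new pfn
 * with write access and host-writability removed.  Returns nonzero if the
 * caller still needs to flush TLBs.
 */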
1761 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1762                              struct kvm_memory_slot *slot, gfn_t gfn, int level,
1763                              unsigned long data)
1764 {
1765         u64 *sptep;
1766         struct rmap_iterator iter;
1767         int need_flush = 0;
1768         u64 new_spte;
1769         pte_t *ptep = (pte_t *)data;
1770         kvm_pfn_t new_pfn;
1771
1772         WARN_ON(pte_huge(*ptep));
1773         new_pfn = pte_pfn(*ptep);
1774
1775 restart:
1776         for_each_rmap_spte(rmap_head, &iter, sptep) {
1777                 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
1778                             sptep, *sptep, gfn, level);
1779
1780                 need_flush = 1;
1781
1782                 if (pte_write(*ptep)) {
1783                         pte_list_remove(rmap_head, sptep);
1784                         goto restart;
1785                 } else {
1786                         new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1787                         new_spte |= (u64)new_pfn << PAGE_SHIFT;
1788
1789                         new_spte &= ~PT_WRITABLE_MASK;
1790                         new_spte &= ~SPTE_HOST_WRITEABLE;
1791
1792                         new_spte = mark_spte_for_access_track(new_spte);
1793
1794                         mmu_spte_clear_track_bits(sptep);
1795                         mmu_spte_set(sptep, new_spte);
1796                 }
1797         }
1798
1799         if (need_flush && kvm_available_flush_tlb_with_range()) {
1800                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1801                 return 0;
1802         }
1803
1804         return need_flush;
1805 }
1806
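/*
 * Iterator over a memslot's rmaps: for each level from start_level to
 * end_level it walks the rmap heads that cover the gfn range
 * [start_gfn, end_gfn].  Driven via for_each_slot_rmap_range(), as in
 * kvm_handle_hva_range() below.
 */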
1807 struct slot_rmap_walk_iterator {
1808         /* input fields. */
1809         struct kvm_memory_slot *slot;
1810         gfn_t start_gfn;
1811         gfn_t end_gfn;
1812         int start_level;
1813         int end_level;
1814
1815         /* output fields. */
1816         gfn_t gfn;
1817         struct kvm_rmap_head *rmap;
1818         int level;
1819
1820         /* private field. */
1821         struct kvm_rmap_head *end_rmap;
1822 };
1823
1824 static void
1825 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1826 {
1827         iterator->level = level;
1828         iterator->gfn = iterator->start_gfn;
1829         iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
1830         iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
1831                                            iterator->slot);
1832 }
1833
1834 static void
1835 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1836                     struct kvm_memory_slot *slot, int start_level,
1837                     int end_level, gfn_t start_gfn, gfn_t end_gfn)
1838 {
1839         iterator->slot = slot;
1840         iterator->start_level = start_level;
1841         iterator->end_level = end_level;
1842         iterator->start_gfn = start_gfn;
1843         iterator->end_gfn = end_gfn;
1844
1845         rmap_walk_init_level(iterator, iterator->start_level);
1846 }
1847
1848 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1849 {
1850         return !!iterator->rmap;
1851 }
1852
1853 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1854 {
1855         if (++iterator->rmap <= iterator->end_rmap) {
1856                 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1857                 return;
1858         }
1859
1860         if (++iterator->level > iterator->end_level) {
1861                 iterator->rmap = NULL;
1862                 return;
1863         }
1864
1865         rmap_walk_init_level(iterator, iterator->level);
1866 }
1867
1868 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,    \
1869            _start_gfn, _end_gfn, _iter_)                                \
1870         for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,         \
1871                                  _end_level_, _start_gfn, _end_gfn);    \
1872              slot_rmap_walk_okay(_iter_);                               \
1873              slot_rmap_walk_next(_iter_))
1874
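/*
 * Apply @handler to every rmap head mapping a page that intersects the host
 * virtual address range [start, end).  The range is clipped against each
 * memslot in every address space, converted to a gfn range, and then walked
 * at all page-table levels.  The return value is the bitwise OR of the
 * handler's return values.
 */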
1875 static int kvm_handle_hva_range(struct kvm *kvm,
1876                                 unsigned long start,
1877                                 unsigned long end,
1878                                 unsigned long data,
1879                                 int (*handler)(struct kvm *kvm,
1880                                                struct kvm_rmap_head *rmap_head,
1881                                                struct kvm_memory_slot *slot,
1882                                                gfn_t gfn,
1883                                                int level,
1884                                                unsigned long data))
1885 {
1886         struct kvm_memslots *slots;
1887         struct kvm_memory_slot *memslot;
1888         struct slot_rmap_walk_iterator iterator;
1889         int ret = 0;
1890         int i;
1891
1892         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1893                 slots = __kvm_memslots(kvm, i);
1894                 kvm_for_each_memslot(memslot, slots) {
1895                         unsigned long hva_start, hva_end;
1896                         gfn_t gfn_start, gfn_end;
1897
1898                         hva_start = max(start, memslot->userspace_addr);
1899                         hva_end = min(end, memslot->userspace_addr +
1900                                       (memslot->npages << PAGE_SHIFT));
1901                         if (hva_start >= hva_end)
1902                                 continue;
1903                         /*
1904                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
1905                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1906                          */
1907                         gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1908                         gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1909
1910                         for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
1911                                                  PT_MAX_HUGEPAGE_LEVEL,
1912                                                  gfn_start, gfn_end - 1,
1913                                                  &iterator)
1914                                 ret |= handler(kvm, iterator.rmap, memslot,
1915                                                iterator.gfn, iterator.level, data);
1916                 }
1917         }
1918
1919         return ret;
1920 }
1921
1922 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
1923                           unsigned long data,
1924                           int (*handler)(struct kvm *kvm,
1925                                          struct kvm_rmap_head *rmap_head,
1926                                          struct kvm_memory_slot *slot,
1927                                          gfn_t gfn, int level,
1928                                          unsigned long data))
1929 {
1930         return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
1931 }
1932
1933 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1934 {
1935         return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
1936 }
1937
1938 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1939 {
1940         return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
1941 }
1942
1943 static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1944                          struct kvm_memory_slot *slot, gfn_t gfn, int level,
1945                          unsigned long data)
1946 {
1947         u64 *sptep;
1948         struct rmap_iterator uninitialized_var(iter);
1949         int young = 0;
1950
1951         for_each_rmap_spte(rmap_head, &iter, sptep)
1952                 young |= mmu_spte_age(sptep);
1953
1954         trace_kvm_age_page(gfn, level, slot, young);
1955         return young;
1956 }
1957
1958 static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1959                               struct kvm_memory_slot *slot, gfn_t gfn,
1960                               int level, unsigned long data)
1961 {
1962         u64 *sptep;
1963         struct rmap_iterator iter;
1964
1965         for_each_rmap_spte(rmap_head, &iter, sptep)
1966                 if (is_accessed_spte(*sptep))
1967                         return 1;
1968         return 0;
1969 }
1970
1971 #define RMAP_RECYCLE_THRESHOLD 1000
1972
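/*
 * Once a single gfn accumulates more than RMAP_RECYCLE_THRESHOLD sptes
 * (see mmu_set_spte()), zap them all and flush the TLBs for the affected
 * range so that individual rmap chains stay bounded.
 */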
1973 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1974 {
1975         struct kvm_rmap_head *rmap_head;
1976         struct kvm_mmu_page *sp;
1977
1978         sp = page_header(__pa(spte));
1979
1980         rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1981
1982         kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
1983         kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1984                         KVM_PAGES_PER_HPAGE(sp->role.level));
1985 }
1986
1987 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1988 {
1989         return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
1990 }
1991
1992 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1993 {
1994         return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1995 }
1996
1997 #ifdef MMU_DEBUG
1998 static int is_empty_shadow_page(u64 *spt)
1999 {
2000         u64 *pos;
2001         u64 *end;
2002
2003         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
2004                 if (is_shadow_present_pte(*pos)) {
2005                         printk(KERN_ERR "%s: %p %llx\n", __func__,
2006                                pos, *pos);
2007                         return 0;
2008                 }
2009         return 1;
2010 }
2011 #endif
2012
2013 /*
2014  * This value is the sum of all of the kvm instances'
2015  * kvm->arch.n_used_mmu_pages values.  We need a global,
2016  * aggregate version in order to make the slab shrinker
2017  * faster.
2018  */
2019 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
2020 {
2021         kvm->arch.n_used_mmu_pages += nr;
2022         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
2023 }
2024
2025 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
2026 {
2027         MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
2028         hlist_del(&sp->hash_link);
2029         list_del(&sp->link);
2030         free_page((unsigned long)sp->spt);
2031         if (!sp->role.direct)
2032                 free_page((unsigned long)sp->gfns);
2033         kmem_cache_free(mmu_page_header_cache, sp);
2034 }
2035
2036 static unsigned kvm_page_table_hashfn(gfn_t gfn)
2037 {
2038         return hash_64(gfn, KVM_MMU_HASH_SHIFT);
2039 }
2040
2041 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
2042                                     struct kvm_mmu_page *sp, u64 *parent_pte)
2043 {
2044         if (!parent_pte)
2045                 return;
2046
2047         pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
2048 }
2049
2050 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
2051                                        u64 *parent_pte)
2052 {
2053         __pte_list_remove(parent_pte, &sp->parent_ptes);
2054 }
2055
2056 static void drop_parent_pte(struct kvm_mmu_page *sp,
2057                             u64 *parent_pte)
2058 {
2059         mmu_page_remove_parent_pte(sp, parent_pte);
2060         mmu_spte_clear_no_track(parent_pte);
2061 }
2062
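/*
 * Allocate a shadow page and its spt page from the pre-filled per-vcpu
 * memory caches.  Indirect pages also get a gfns array for mapping spt
 * entries back to guest gfns.  The new page is linked into
 * active_mmu_pages and counted in n_used_mmu_pages.
 */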
2063 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
2064 {
2065         struct kvm_mmu_page *sp;
2066
2067         sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
2068         sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2069         if (!direct)
2070                 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2071         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2072         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
2073         kvm_mod_used_mmu_pages(vcpu->kvm, +1);
2074         return sp;
2075 }
2076
2077 static void mark_unsync(u64 *spte);
2078 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
2079 {
2080         u64 *sptep;
2081         struct rmap_iterator iter;
2082
2083         for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
2084                 mark_unsync(sptep);
2085         }
2086 }
2087
2088 static void mark_unsync(u64 *spte)
2089 {
2090         struct kvm_mmu_page *sp;
2091         unsigned int index;
2092
2093         sp = page_header(__pa(spte));
2094         index = spte - sp->spt;
2095         if (__test_and_set_bit(index, sp->unsync_child_bitmap))
2096                 return;
2097         if (sp->unsync_children++)
2098                 return;
2099         kvm_mmu_mark_parents_unsync(sp);
2100 }
2101
2102 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
2103                                struct kvm_mmu_page *sp)
2104 {
2105         return 0;
2106 }
2107
2108 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
2109 {
2110 }
2111
2112 static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
2113                                  struct kvm_mmu_page *sp, u64 *spte,
2114                                  const void *pte)
2115 {
2116         WARN_ON(1);
2117 }
2118
2119 #define KVM_PAGE_ARRAY_NR 16
2120
2121 struct kvm_mmu_pages {
2122         struct mmu_page_and_offset {
2123                 struct kvm_mmu_page *sp;
2124                 unsigned int idx;
2125         } page[KVM_PAGE_ARRAY_NR];
2126         unsigned int nr;
2127 };
2128
2129 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
2130                          int idx)
2131 {
2132         int i;
2133
2134         if (sp->unsync)
2135                 for (i=0; i < pvec->nr; i++)
2136                         if (pvec->page[i].sp == sp)
2137                                 return 0;
2138
2139         pvec->page[pvec->nr].sp = sp;
2140         pvec->page[pvec->nr].idx = idx;
2141         pvec->nr++;
2142         return (pvec->nr == KVM_PAGE_ARRAY_NR);
2143 }
2144
2145 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
2146 {
2147         --sp->unsync_children;
2148         WARN_ON((int)sp->unsync_children < 0);
2149         __clear_bit(idx, sp->unsync_child_bitmap);
2150 }
2151
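/*
 * Depth-first walk of @sp's unsync children: collect into @pvec every
 * shadow page that is unsync or has unsync children, together with the
 * parent index that reaches it, so mmu_pages_clear_parents() can later
 * clear the corresponding unsync_child_bitmap bits.  Returns the number
 * of unsync leaf pages found, or -ENOSPC if @pvec filled up.
 */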
2152 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
2153                            struct kvm_mmu_pages *pvec)
2154 {
2155         int i, ret, nr_unsync_leaf = 0;
2156
2157         for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
2158                 struct kvm_mmu_page *child;
2159                 u64 ent = sp->spt[i];
2160
2161                 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
2162                         clear_unsync_child_bit(sp, i);
2163                         continue;
2164                 }
2165
2166                 child = page_header(ent & PT64_BASE_ADDR_MASK);
2167
2168                 if (child->unsync_children) {
2169                         if (mmu_pages_add(pvec, child, i))
2170                                 return -ENOSPC;
2171
2172                         ret = __mmu_unsync_walk(child, pvec);
2173                         if (!ret) {
2174                                 clear_unsync_child_bit(sp, i);
2175                                 continue;
2176                         } else if (ret > 0) {
2177                                 nr_unsync_leaf += ret;
2178                         } else
2179                                 return ret;
2180                 } else if (child->unsync) {
2181                         nr_unsync_leaf++;
2182                         if (mmu_pages_add(pvec, child, i))
2183                                 return -ENOSPC;
2184                 } else
2185                         clear_unsync_child_bit(sp, i);
2186         }
2187
2188         return nr_unsync_leaf;
2189 }
2190
2191 #define INVALID_INDEX (-1)
2192
2193 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
2194                            struct kvm_mmu_pages *pvec)
2195 {
2196         pvec->nr = 0;
2197         if (!sp->unsync_children)
2198                 return 0;
2199
2200         mmu_pages_add(pvec, sp, INVALID_INDEX);
2201         return __mmu_unsync_walk(sp, pvec);
2202 }
2203
2204 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2205 {
2206         WARN_ON(!sp->unsync);
2207         trace_kvm_mmu_sync_page(sp);
2208         sp->unsync = 0;
2209         --kvm->stat.mmu_unsync;
2210 }
2211
2212 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2213                                      struct list_head *invalid_list);
2214 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2215                                     struct list_head *invalid_list);
2216
2217
2218 #define for_each_valid_sp(_kvm, _sp, _gfn)                              \
2219         hlist_for_each_entry(_sp,                                       \
2220           &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
2221                 if ((_sp)->role.invalid) {    \
2222                 } else
2223
2224 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                 \
2225         for_each_valid_sp(_kvm, _sp, _gfn)                              \
2226                 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
2227
2228 static inline bool is_ept_sp(struct kvm_mmu_page *sp)
2229 {
2230         return sp->role.cr0_wp && sp->role.smap_andnot_wp;
2231 }
2232
2233 /* @sp->gfn should be write-protected at the call site */
2234 static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2235                             struct list_head *invalid_list)
2236 {
2237         if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
2238             vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
2239                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
2240                 return false;
2241         }
2242
2243         return true;
2244 }
2245
2246 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
2247                                         struct list_head *invalid_list,
2248                                         bool remote_flush)
2249 {
2250         if (!remote_flush && list_empty(invalid_list))
2251                 return false;
2252
2253         if (!list_empty(invalid_list))
2254                 kvm_mmu_commit_zap_page(kvm, invalid_list);
2255         else
2256                 kvm_flush_remote_tlbs(kvm);
2257         return true;
2258 }
2259
2260 static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
2261                                  struct list_head *invalid_list,
2262                                  bool remote_flush, bool local_flush)
2263 {
2264         if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
2265                 return;
2266
2267         if (local_flush)
2268                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2269 }
2270
2271 #ifdef CONFIG_KVM_MMU_AUDIT
2272 #include "mmu_audit.c"
2273 #else
2274 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
2275 static void mmu_audit_disable(void) { }
2276 #endif
2277
2278 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2279                          struct list_head *invalid_list)
2280 {
2281         kvm_unlink_unsync_page(vcpu->kvm, sp);
2282         return __kvm_sync_page(vcpu, sp, invalid_list);
2283 }
2284
2285 /* @gfn should be write-protected at the call site */
2286 static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2287                            struct list_head *invalid_list)
2288 {
2289         struct kvm_mmu_page *s;
2290         bool ret = false;
2291
2292         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
2293                 if (!s->unsync)
2294                         continue;
2295
2296                 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
2297                 ret |= kvm_sync_page(vcpu, s, invalid_list);
2298         }
2299
2300         return ret;
2301 }
2302
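/*
 * Path from the root of an unsync walk down to the page currently being
 * visited: parent[n] is a shadow page on the path and idx[n] is the index
 * of the child entry being visited inside it.  mmu_pages_clear_parents()
 * walks this path upwards, clearing unsync_child_bitmap bits until it
 * finds an ancestor that still has other unsync children.
 */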
2303 struct mmu_page_path {
2304         struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2305         unsigned int idx[PT64_ROOT_MAX_LEVEL];
2306 };
2307
2308 #define for_each_sp(pvec, sp, parents, i)                       \
2309                 for (i = mmu_pages_first(&pvec, &parents);      \
2310                         i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
2311                         i = mmu_pages_next(&pvec, &parents, i))
2312
2313 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2314                           struct mmu_page_path *parents,
2315                           int i)
2316 {
2317         int n;
2318
2319         for (n = i+1; n < pvec->nr; n++) {
2320                 struct kvm_mmu_page *sp = pvec->page[n].sp;
2321                 unsigned idx = pvec->page[n].idx;
2322                 int level = sp->role.level;
2323
2324                 parents->idx[level-1] = idx;
2325                 if (level == PT_PAGE_TABLE_LEVEL)
2326                         break;
2327
2328                 parents->parent[level-2] = sp;
2329         }
2330
2331         return n;
2332 }
2333
2334 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2335                            struct mmu_page_path *parents)
2336 {
2337         struct kvm_mmu_page *sp;
2338         int level;
2339
2340         if (pvec->nr == 0)
2341                 return 0;
2342
2343         WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2344
2345         sp = pvec->page[0].sp;
2346         level = sp->role.level;
2347         WARN_ON(level == PT_PAGE_TABLE_LEVEL);
2348
2349         parents->parent[level-2] = sp;
2350
2351         /* Also set up a sentinel.  Further entries in pvec are all
2352          * children of sp, so this element is never overwritten.
2353          */
2354         parents->parent[level-1] = NULL;
2355         return mmu_pages_next(pvec, parents, 0);
2356 }
2357
2358 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2359 {
2360         struct kvm_mmu_page *sp;
2361         unsigned int level = 0;
2362
2363         do {
2364                 unsigned int idx = parents->idx[level];
2365                 sp = parents->parent[level];
2366                 if (!sp)
2367                         return;
2368
2369                 WARN_ON(idx == INVALID_INDEX);
2370                 clear_unsync_child_bit(sp, idx);
2371                 level++;
2372         } while (!sp->unsync_children);
2373 }
2374
2375 static void mmu_sync_children(struct kvm_vcpu *vcpu,
2376                               struct kvm_mmu_page *parent)
2377 {
2378         int i;
2379         struct kvm_mmu_page *sp;
2380         struct mmu_page_path parents;
2381         struct kvm_mmu_pages pages;
2382         LIST_HEAD(invalid_list);
2383         bool flush = false;
2384
2385         while (mmu_unsync_walk(parent, &pages)) {
2386                 bool protected = false;
2387
2388                 for_each_sp(pages, sp, parents, i)
2389                         protected |= rmap_write_protect(vcpu, sp->gfn);
2390
2391                 if (protected) {
2392                         kvm_flush_remote_tlbs(vcpu->kvm);
2393                         flush = false;
2394                 }
2395
2396                 for_each_sp(pages, sp, parents, i) {
2397                         flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2398                         mmu_pages_clear_parents(&parents);
2399                 }
2400                 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
2401                         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2402                         cond_resched_lock(&vcpu->kvm->mmu_lock);
2403                         flush = false;
2404                 }
2405         }
2406
2407         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2408 }
2409
2410 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2411 {
2412         atomic_set(&sp->write_flooding_count,  0);
2413 }
2414
2415 static void clear_sp_write_flooding_count(u64 *spte)
2416 {
2417         struct kvm_mmu_page *sp =  page_header(__pa(spte));
2418
2419         __clear_sp_write_flooding_count(sp);
2420 }
2421
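/*
 * Look up a shadow page for (gfn, role) in the page hash and return it,
 * syncing it first if it is unsync; if no usable page exists, or an unsync
 * page had to be zapped, allocate and initialize a new one.
 */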
2422 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2423                                              gfn_t gfn,
2424                                              gva_t gaddr,
2425                                              unsigned level,
2426                                              int direct,
2427                                              unsigned access)
2428 {
2429         union kvm_mmu_page_role role;
2430         unsigned quadrant;
2431         struct kvm_mmu_page *sp;
2432         bool need_sync = false;
2433         bool flush = false;
2434         int collisions = 0;
2435         LIST_HEAD(invalid_list);
2436
2437         role = vcpu->arch.mmu->mmu_role.base;
2438         role.level = level;
2439         role.direct = direct;
2440         if (role.direct)
2441                 role.gpte_is_8_bytes = true;
2442         role.access = access;
2443         if (!vcpu->arch.mmu->direct_map
2444             && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
2445                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2446                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2447                 role.quadrant = quadrant;
2448         }
2449         for_each_valid_sp(vcpu->kvm, sp, gfn) {
2450                 if (sp->gfn != gfn) {
2451                         collisions++;
2452                         continue;
2453                 }
2454
2455                 if (!need_sync && sp->unsync)
2456                         need_sync = true;
2457
2458                 if (sp->role.word != role.word)
2459                         continue;
2460
2461                 if (sp->unsync) {
2462                         /* The page is good, but __kvm_sync_page might still end
2463                          * up zapping it.  If so, break in order to rebuild it.
2464                          */
2465                         if (!__kvm_sync_page(vcpu, sp, &invalid_list))
2466                                 break;
2467
2468                         WARN_ON(!list_empty(&invalid_list));
2469                         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2470                 }
2471
2472                 if (sp->unsync_children)
2473                         kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2474
2475                 __clear_sp_write_flooding_count(sp);
2476                 trace_kvm_mmu_get_page(sp, false);
2477                 goto out;
2478         }
2479
2480         ++vcpu->kvm->stat.mmu_cache_miss;
2481
2482         sp = kvm_mmu_alloc_page(vcpu, direct);
2483
2484         sp->gfn = gfn;
2485         sp->role = role;
2486         hlist_add_head(&sp->hash_link,
2487                 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
2488         if (!direct) {
2489                 /*
2490                  * We should do write protection before syncing pages,
2491                  * otherwise the content of the synced shadow page may
2492                  * be inconsistent with the guest page table.
2493                  */
2494                 account_shadowed(vcpu->kvm, sp);
2495                 if (level == PT_PAGE_TABLE_LEVEL &&
2496                       rmap_write_protect(vcpu, gfn))
2497                         kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2498
2499                 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2500                         flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2501         }
2502         clear_page(sp->spt);
2503         trace_kvm_mmu_get_page(sp, true);
2504
2505         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2506 out:
2507         if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2508                 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2509         return sp;
2510 }
2511
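/*
 * Initialize a shadow page-table walk starting at @root for @addr.  Each
 * shadow_walk_next() step moves one level down, exposing the current
 * level, index and sptep through the iterator; walks are normally driven
 * via for_each_shadow_entry(), as in __direct_map() below.  For a PAE
 * shadow (PT32E_ROOT_LEVEL) the walk starts from the pae_root entry
 * selected by bits 31:30 of @addr.
 */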
2512 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2513                                         struct kvm_vcpu *vcpu, hpa_t root,
2514                                         u64 addr)
2515 {
2516         iterator->addr = addr;
2517         iterator->shadow_addr = root;
2518         iterator->level = vcpu->arch.mmu->shadow_root_level;
2519
2520         if (iterator->level == PT64_ROOT_4LEVEL &&
2521             vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2522             !vcpu->arch.mmu->direct_map)
2523                 --iterator->level;
2524
2525         if (iterator->level == PT32E_ROOT_LEVEL) {
2526                 /*
2527                  * prev_root is currently only used for 64-bit hosts. So only
2528                  * the active root_hpa is valid here.
2529                  */
2530                 BUG_ON(root != vcpu->arch.mmu->root_hpa);
2531
2532                 iterator->shadow_addr
2533                         = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2534                 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2535                 --iterator->level;
2536                 if (!iterator->shadow_addr)
2537                         iterator->level = 0;
2538         }
2539 }
2540
2541 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2542                              struct kvm_vcpu *vcpu, u64 addr)
2543 {
2544         shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2545                                     addr);
2546 }
2547
2548 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2549 {
2550         if (iterator->level < PT_PAGE_TABLE_LEVEL)
2551                 return false;
2552
2553         iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2554         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2555         return true;
2556 }
2557
2558 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2559                                u64 spte)
2560 {
2561         if (is_last_spte(spte, iterator->level)) {
2562                 iterator->level = 0;
2563                 return;
2564         }
2565
2566         iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2567         --iterator->level;
2568 }
2569
2570 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2571 {
2572         __shadow_walk_next(iterator, *iterator->sptep);
2573 }
2574
2575 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2576                              struct kvm_mmu_page *sp)
2577 {
2578         u64 spte;
2579
2580         BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2581
2582         spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2583                shadow_user_mask | shadow_x_mask | shadow_me_mask;
2584
2585         if (sp_ad_disabled(sp))
2586                 spte |= shadow_acc_track_value;
2587         else
2588                 spte |= shadow_accessed_mask;
2589
2590         mmu_spte_set(sptep, spte);
2591
2592         mmu_page_add_parent_pte(vcpu, sp, sptep);
2593
2594         if (sp->unsync_children || sp->unsync)
2595                 mark_unsync(sptep);
2596 }
2597
2598 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2599                                    unsigned direct_access)
2600 {
2601         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2602                 struct kvm_mmu_page *child;
2603
2604                 /*
2605                  * For a direct sp, if the guest pte's dirty bit
2606                  * changed from clean to dirty, it would corrupt the
2607                  * sp's access: it would allow writes through a
2608                  * read-only sp, so we should update the spte at this
2609                  * point to get a new sp with the correct access.
2610                  */
2611                 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2612                 if (child->role.access == direct_access)
2613                         return;
2614
2615                 drop_parent_pte(child, sptep);
2616                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2617         }
2618 }
2619
2620 static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2621                              u64 *spte)
2622 {
2623         u64 pte;
2624         struct kvm_mmu_page *child;
2625
2626         pte = *spte;
2627         if (is_shadow_present_pte(pte)) {
2628                 if (is_last_spte(pte, sp->role.level)) {
2629                         drop_spte(kvm, spte);
2630                         if (is_large_pte(pte))
2631                                 --kvm->stat.lpages;
2632                 } else {
2633                         child = page_header(pte & PT64_BASE_ADDR_MASK);
2634                         drop_parent_pte(child, spte);
2635                 }
2636                 return true;
2637         }
2638
2639         if (is_mmio_spte(pte))
2640                 mmu_spte_clear_no_track(spte);
2641
2642         return false;
2643 }
2644
2645 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2646                                          struct kvm_mmu_page *sp)
2647 {
2648         unsigned i;
2649
2650         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2651                 mmu_page_zap_pte(kvm, sp, sp->spt + i);
2652 }
2653
2654 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2655 {
2656         u64 *sptep;
2657         struct rmap_iterator iter;
2658
2659         while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2660                 drop_parent_pte(sp, sptep);
2661 }
2662
2663 static int mmu_zap_unsync_children(struct kvm *kvm,
2664                                    struct kvm_mmu_page *parent,
2665                                    struct list_head *invalid_list)
2666 {
2667         int i, zapped = 0;
2668         struct mmu_page_path parents;
2669         struct kvm_mmu_pages pages;
2670
2671         if (parent->role.level == PT_PAGE_TABLE_LEVEL)
2672                 return 0;
2673
2674         while (mmu_unsync_walk(parent, &pages)) {
2675                 struct kvm_mmu_page *sp;
2676
2677                 for_each_sp(pages, sp, parents, i) {
2678                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2679                         mmu_pages_clear_parents(&parents);
2680                         zapped++;
2681                 }
2682         }
2683
2684         return zapped;
2685 }
2686
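/*
 * Unlink @sp from the MMU: zap its unsync children, clear its sptes and
 * parent ptes, and either move it to @invalid_list (if no root still
 * references it) or mark it invalid and force vcpus to reload their MMUs.
 * *nr_zapped is set to the number of pages zapped; the return value tells
 * the caller whether active_mmu_pages became unstable because children
 * were zapped.
 */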
2687 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2688                                        struct kvm_mmu_page *sp,
2689                                        struct list_head *invalid_list,
2690                                        int *nr_zapped)
2691 {
2692         bool list_unstable;
2693
2694         trace_kvm_mmu_prepare_zap_page(sp);
2695         ++kvm->stat.mmu_shadow_zapped;
2696         *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2697         kvm_mmu_page_unlink_children(kvm, sp);
2698         kvm_mmu_unlink_parents(kvm, sp);
2699
2700         /* Zapping children means active_mmu_pages has become unstable. */
2701         list_unstable = *nr_zapped;
2702
2703         if (!sp->role.invalid && !sp->role.direct)
2704                 unaccount_shadowed(kvm, sp);
2705
2706         if (sp->unsync)
2707                 kvm_unlink_unsync_page(kvm, sp);
2708         if (!sp->root_count) {
2709                 /* Count self */
2710                 (*nr_zapped)++;
2711                 list_move(&sp->link, invalid_list);
2712                 kvm_mod_used_mmu_pages(kvm, -1);
2713         } else {
2714                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
2715
2716                 if (!sp->role.invalid)
2717                         kvm_reload_remote_mmus(kvm);
2718         }
2719
2720         sp->role.invalid = 1;
2721         return list_unstable;
2722 }
2723
2724 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2725                                      struct list_head *invalid_list)
2726 {
2727         int nr_zapped;
2728
2729         __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2730         return nr_zapped;
2731 }
2732
2733 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2734                                     struct list_head *invalid_list)
2735 {
2736         struct kvm_mmu_page *sp, *nsp;
2737
2738         if (list_empty(invalid_list))
2739                 return;
2740
2741         /*
2742          * We need to make sure everyone sees our modifications to
2743          * the page tables and sees the changes to vcpu->mode here.
2744          * The barrier in kvm_flush_remote_tlbs() achieves this. This
2745          * pairs with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2746          *
2747          * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2748          * guest mode and/or lockless shadow page table walks.
2749          */
2750         kvm_flush_remote_tlbs(kvm);
2751
2752         list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2753                 WARN_ON(!sp->role.invalid || sp->root_count);
2754                 kvm_mmu_free_page(sp);
2755         }
2756 }
2757
2758 static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2759                                         struct list_head *invalid_list)
2760 {
2761         struct kvm_mmu_page *sp;
2762
2763         if (list_empty(&kvm->arch.active_mmu_pages))
2764                 return false;
2765
2766         sp = list_last_entry(&kvm->arch.active_mmu_pages,
2767                              struct kvm_mmu_page, link);
2768         return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2769 }
2770
2771 /*
2772  * Change the number of mmu pages allocated to the VM.
2773  * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
2774  */
2775 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2776 {
2777         LIST_HEAD(invalid_list);
2778
2779         spin_lock(&kvm->mmu_lock);
2780
2781         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2782                 /* Need to free some mmu pages to achieve the goal. */
2783                 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2784                         if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2785                                 break;
2786
2787                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2788                 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2789         }
2790
2791         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2792
2793         spin_unlock(&kvm->mmu_lock);
2794 }
2795
2796 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2797 {
2798         struct kvm_mmu_page *sp;
2799         LIST_HEAD(invalid_list);
2800         int r;
2801
2802         pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2803         r = 0;
2804         spin_lock(&kvm->mmu_lock);
2805         for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2806                 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2807                          sp->role.word);
2808                 r = 1;
2809                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2810         }
2811         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2812         spin_unlock(&kvm->mmu_lock);
2813
2814         return r;
2815 }
2816 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2817
2818 static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2819 {
2820         trace_kvm_mmu_unsync_page(sp);
2821         ++vcpu->kvm->stat.mmu_unsync;
2822         sp->unsync = 1;
2823
2824         kvm_mmu_mark_parents_unsync(sp);
2825 }
2826
2827 static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2828                                    bool can_unsync)
2829 {
2830         struct kvm_mmu_page *sp;
2831
2832         if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2833                 return true;
2834
2835         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
2836                 if (!can_unsync)
2837                         return true;
2838
2839                 if (sp->unsync)
2840                         continue;
2841
2842                 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
2843                 kvm_unsync_page(vcpu, sp);
2844         }
2845
2846         /*
2847          * We need to ensure that the marking of unsync pages is visible
2848          * before the SPTE is updated to allow writes because
2849          * kvm_mmu_sync_roots() checks the unsync flags without holding
2850          * the MMU lock and so can race with this. If the SPTE was updated
2851          * before the page had been marked as unsync-ed, something like the
2852          * following could happen:
2853          *
2854          * CPU 1                    CPU 2
2855          * ---------------------------------------------------------------------
2856          * 1.2 Host updates SPTE
2857          *     to be writable
2858          *                      2.1 Guest writes a GPTE for GVA X.
2859          *                          (GPTE being in the guest page table shadowed
2860          *                           by the SP from CPU 1.)
2861          *                          This reads SPTE during the page table walk.
2862          *                          Since SPTE.W is read as 1, there is no
2863          *                          fault.
2864          *
2865          *                      2.2 Guest issues TLB flush.
2866          *                          That causes a VM Exit.
2867          *
2868          *                      2.3 kvm_mmu_sync_pages() reads sp->unsync.
2869          *                          Since it is false, it just returns.
2870          *
2871          *                      2.4 Guest accesses GVA X.
2872          *                          Since the mapping in the SP was not updated,
2873          *                          the old mapping for GVA X incorrectly
2874          *                          gets used.
2875          * 1.1 Host marks SP
2876          *     as unsync
2877          *     (sp->unsync = true)
2878          *
2879          * The write barrier below ensures that 1.1 happens before 1.2 and thus
2880          * the situation in 2.4 does not arise. The implicit barrier in 2.2
2881          * pairs with this write barrier.
2882          */
2883         smp_wmb();
2884
2885         return false;
2886 }
2887
2888 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
2889 {
2890         if (pfn_valid(pfn))
2891                 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
2892                         /*
2893                          * Some reserved pages, such as those from NVDIMM
2894                          * DAX devices, are not for MMIO, and can be mapped
2895                          * with cached memory type for better performance.
2896                          * However, the above check misidentifies those pages
2897                          * as MMIO, and results in KVM mapping them with UC
2898                          * memory type, which would hurt performance.
2899                          * Therefore, we check the host memory type in addition
2900                          * and only treat UC/UC-/WC pages as MMIO.
2901                          */
2902                         (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
2903
2904         return !e820__mapped_raw_any(pfn_to_hpa(pfn),
2905                                      pfn_to_hpa(pfn + 1) - 1,
2906                                      E820_TYPE_RAM);
2907 }
2908
2909 /* Bits which may be returned by set_spte() */
2910 #define SET_SPTE_WRITE_PROTECTED_PT     BIT(0)
2911 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH  BIT(1)
2912
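/*
 * Build a new spte for (gfn, pfn) with the requested access and level and
 * install it via mmu_spte_update().  Returns a mask of the SET_SPTE_* bits
 * above: whether write access had to be dropped because the gfn must stay
 * write-protected, and whether a remote TLB flush is required.
 */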
2913 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2914                     unsigned pte_access, int level,
2915                     gfn_t gfn, kvm_pfn_t pfn, bool speculative,
2916                     bool can_unsync, bool host_writable)
2917 {
2918         u64 spte = 0;
2919         int ret = 0;
2920         struct kvm_mmu_page *sp;
2921
2922         if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
2923                 return 0;
2924
2925         sp = page_header(__pa(sptep));
2926         if (sp_ad_disabled(sp))
2927                 spte |= shadow_acc_track_value;
2928
2929         /*
2930          * For the EPT case, shadow_present_mask is 0 if hardware
2931          * supports exec-only page table entries.  In that case,
2932          * ACC_USER_MASK and shadow_user_mask are used to represent
2933          * read access.  See FNAME(gpte_access) in paging_tmpl.h.
2934          */
2935         spte |= shadow_present_mask;
2936         if (!speculative)
2937                 spte |= spte_shadow_accessed_mask(spte);
2938
2939         if (pte_access & ACC_EXEC_MASK)
2940                 spte |= shadow_x_mask;
2941         else
2942                 spte |= shadow_nx_mask;
2943
2944         if (pte_access & ACC_USER_MASK)
2945                 spte |= shadow_user_mask;
2946
2947         if (level > PT_PAGE_TABLE_LEVEL)
2948                 spte |= PT_PAGE_SIZE_MASK;
2949         if (tdp_enabled)
2950                 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
2951                         kvm_is_mmio_pfn(pfn));
2952
2953         if (host_writable)
2954                 spte |= SPTE_HOST_WRITEABLE;
2955         else
2956                 pte_access &= ~ACC_WRITE_MASK;
2957
2958         if (!kvm_is_mmio_pfn(pfn))
2959                 spte |= shadow_me_mask;
2960
2961         spte |= (u64)pfn << PAGE_SHIFT;
2962
2963         if (pte_access & ACC_WRITE_MASK) {
2964
2965                 /*
2966                  * Another vcpu may create a new sp in the window between
2967                  * mapping_level() and acquiring the mmu-lock. We can
2968                  * allow the guest to retry the access; the mapping can
2969                  * be fixed when the guest refaults.
2970                  */
2971                 if (level > PT_PAGE_TABLE_LEVEL &&
2972                     mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
2973                         goto done;
2974
2975                 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2976
2977                 /*
2978                  * Optimization: for pte sync, if spte was writable the hash
2979                  * lookup is unnecessary (and expensive). Write protection
2980                  * is the responsibility of mmu_get_page / kvm_sync_page.
2981                  * The same reasoning applies to dirty page accounting.
2982                  */
2983                 if (!can_unsync && is_writable_pte(*sptep))
2984                         goto set_pte;
2985
2986                 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
2987                         pgprintk("%s: found shadow page for %llx, marking ro\n",
2988                                  __func__, gfn);
2989                         ret |= SET_SPTE_WRITE_PROTECTED_PT;
2990                         pte_access &= ~ACC_WRITE_MASK;
2991                         spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
2992                 }
2993         }
2994
2995         if (pte_access & ACC_WRITE_MASK) {
2996                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
2997                 spte |= spte_shadow_dirty_mask(spte);
2998         }
2999
3000         if (speculative)
3001                 spte = mark_spte_for_access_track(spte);
3002
3003 set_pte:
3004         if (mmu_spte_update(sptep, spte))
3005                 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
3006 done:
3007         return ret;
3008 }
3009
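/*
 * Install a spte at @sptep, dropping any conflicting old mapping first
 * (a child page-table pointer being replaced by a large page, or a pfn
 * change), then update the rmap and lpage statistics and release the pfn
 * reference.  Returns RET_PF_RETRY normally, or RET_PF_EMULATE when the
 * access must be emulated (a write to a write-protected page table, or an
 * MMIO spte).
 */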
3010 static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
3011                         int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
3012                         bool speculative, bool host_writable)
3013 {
3014         int was_rmapped = 0;
3015         int rmap_count;
3016         int set_spte_ret;
3017         int ret = RET_PF_RETRY;
3018         bool flush = false;
3019
3020         pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
3021                  *sptep, write_fault, gfn);
3022
3023         if (is_shadow_present_pte(*sptep)) {
3024                 /*
3025                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
3026                  * the parent of the now unreachable PTE.
3027                  */
3028                 if (level > PT_PAGE_TABLE_LEVEL &&
3029                     !is_large_pte(*sptep)) {
3030                         struct kvm_mmu_page *child;
3031                         u64 pte = *sptep;
3032
3033                         child = page_header(pte & PT64_BASE_ADDR_MASK);
3034                         drop_parent_pte(child, sptep);
3035                         flush = true;
3036                 } else if (pfn != spte_to_pfn(*sptep)) {
3037                         pgprintk("hfn old %llx new %llx\n",
3038                                  spte_to_pfn(*sptep), pfn);
3039                         drop_spte(vcpu->kvm, sptep);
3040                         flush = true;
3041                 } else
3042                         was_rmapped = 1;
3043         }
3044
3045         set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
3046                                 speculative, true, host_writable);
3047         if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
3048                 if (write_fault)
3049                         ret = RET_PF_EMULATE;
3050                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3051         }
3052
3053         if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
3054                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3055                                 KVM_PAGES_PER_HPAGE(level));
3056
3057         if (unlikely(is_mmio_spte(*sptep)))
3058                 ret = RET_PF_EMULATE;
3059
3060         pgprintk("%s: setting spte %llx\n", __func__, *sptep);
3061         pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
3062                  is_large_pte(*sptep)? "2MB" : "4kB",
3063                  *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
3064                  *sptep, sptep);
3065         if (!was_rmapped && is_large_pte(*sptep))
3066                 ++vcpu->kvm->stat.lpages;
3067
3068         if (is_shadow_present_pte(*sptep)) {
3069                 if (!was_rmapped) {
3070                         rmap_count = rmap_add(vcpu, sptep, gfn);
3071                         if (rmap_count > RMAP_RECYCLE_THRESHOLD)
3072                                 rmap_recycle(vcpu, sptep, gfn);
3073                 }
3074         }
3075
3076         kvm_release_pfn_clean(pfn);
3077
3078         return ret;
3079 }
3080
3081 static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
3082                                      bool no_dirty_log)
3083 {
3084         struct kvm_memory_slot *slot;
3085
3086         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
3087         if (!slot)
3088                 return KVM_PFN_ERR_FAULT;
3089
3090         return gfn_to_pfn_memslot_atomic(slot, gfn);
3091 }
3092
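     /*
      * Prefetch the translations for the spte range [start, end): look up the
      * backing pages in one atomic batch and install them via mmu_set_spte().
      * Returns -1 when the batch cannot be resolved, telling the caller to
      * stop prefetching.
      */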
3093 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3094                                     struct kvm_mmu_page *sp,
3095                                     u64 *start, u64 *end)
3096 {
3097         struct page *pages[PTE_PREFETCH_NUM];
3098         struct kvm_memory_slot *slot;
3099         unsigned access = sp->role.access;
3100         int i, ret;
3101         gfn_t gfn;
3102
3103         gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
3104         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
3105         if (!slot)
3106                 return -1;
3107
3108         ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3109         if (ret <= 0)
3110                 return -1;
3111
3112         for (i = 0; i < ret; i++, gfn++, start++)
3113                 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3114                              page_to_pfn(pages[i]), true, true);
3115
3116         return 0;
3117 }
3118
3119 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3120                                   struct kvm_mmu_page *sp, u64 *sptep)
3121 {
3122         u64 *spte, *start = NULL;
3123         int i;
3124
3125         WARN_ON(!sp->role.direct);
3126
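             /*
              * Scan the PTE_PREFETCH_NUM-aligned window containing sptep and
              * prefetch each run of not-yet-present sptes around the faulting
              * entry.
              */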
3127         i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
3128         spte = sp->spt + i;
3129
3130         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3131                 if (is_shadow_present_pte(*spte) || spte == sptep) {
3132                         if (!start)
3133                                 continue;
3134                         if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3135                                 break;
3136                         start = NULL;
3137                 } else if (!start)
3138                         start = spte;
3139         }
3140 }
3141
3142 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3143 {
3144         struct kvm_mmu_page *sp;
3145
3146         sp = page_header(__pa(sptep));
3147
3148         /*
3149          * Without accessed bits, there's no way to distinguish between
3150          * actually accessed translations and prefetched ones, so disable pte
3151          * prefetch if accessed bits aren't available.
3152          */
3153         if (sp_ad_disabled(sp))
3154                 return;
3155
3156         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3157                 return;
3158
3159         __direct_pte_prefetch(vcpu, sp, sptep);
3160 }
3161
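     /*
      * Walk the shadow page table for gfn, allocating and linking intermediate
      * shadow pages as needed, and install the final translation at the
      * requested level via mmu_set_spte().
      */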
3162 static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
3163                         int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
3164 {
3165         struct kvm_shadow_walk_iterator iterator;
3166         struct kvm_mmu_page *sp;
3167         int emulate = 0;
3168         gfn_t pseudo_gfn;
3169
3170         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3171                 return 0;
3172
3173         for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
3174                 if (iterator.level == level) {
3175                         emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
3176                                                write, level, gfn, pfn, prefault,
3177                                                map_writable);
3178                         direct_pte_prefetch(vcpu, iterator.sptep);
3179                         ++vcpu->stat.pf_fixed;
3180                         break;
3181                 }
3182
3183                 drop_large_spte(vcpu, iterator.sptep);
3184                 if (!is_shadow_present_pte(*iterator.sptep)) {
3185                         u64 base_addr = iterator.addr;
3186
3187                         base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
3188                         pseudo_gfn = base_addr >> PAGE_SHIFT;
3189                         sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
3190                                               iterator.level - 1, 1, ACC_ALL);
3191
3192                         link_shadow_page(vcpu, iterator.sptep, sp);
3193                 }
3194         }
3195         return emulate;
3196 }
3197
3198 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3199 {
3200         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3201 }
3202
3203 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3204 {
3205         /*
3206          * Do not cache the mmio info caused by writing the readonly gfn
3207          * into the spte; otherwise a read access on the readonly gfn can
3208          * also cause an mmio page fault and be treated as an mmio access.
3209          */
3210         if (pfn == KVM_PFN_ERR_RO_FAULT)
3211                 return RET_PF_EMULATE;
3212
3213         if (pfn == KVM_PFN_ERR_HWPOISON) {
3214                 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3215                 return RET_PF_RETRY;
3216         }
3217
3218         return -EFAULT;
3219 }
3220
3221 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
3222                                         gfn_t *gfnp, kvm_pfn_t *pfnp,
3223                                         int *levelp)
3224 {
3225         kvm_pfn_t pfn = *pfnp;
3226         gfn_t gfn = *gfnp;
3227         int level = *levelp;
3228
3229         /*
3230          * Check if it's a transparent hugepage. If this would be a
3231          * hugetlbfs page, level wouldn't be set to
3232          * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
3233          * here.
3234          */
3235         if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
3236             level == PT_PAGE_TABLE_LEVEL &&
3237             PageTransCompoundMap(pfn_to_page(pfn)) &&
3238             !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
3239                 unsigned long mask;
3240                 /*
3241                  * mmu_notifier_retry was successful and we hold the
3242                  * mmu_lock here, so the pmd can't be split out
3243                  * from under us, and in turn
3244                  * __split_huge_page_refcount() can't run from under
3245                  * us and we can safely transfer the refcount from
3246                  * PG_tail to PG_head as we switch the pfn from tail
3247                  * to head.
3248                  */
3249                 *levelp = level = PT_DIRECTORY_LEVEL;
3250                 mask = KVM_PAGES_PER_HPAGE(level) - 1;
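                     /*
                      * With 4K base pages a PT_DIRECTORY_LEVEL mapping spans
                      * 512 pages, so mask is 0x1ff: gfn and pfn must be
                      * congruent modulo the huge page size, and the fixup
                      * below realigns both to the head of the compound page.
                      */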
3251                 VM_BUG_ON((gfn & mask) != (pfn & mask));
3252                 if (pfn & mask) {
3253                         gfn &= ~mask;
3254                         *gfnp = gfn;
3255                         kvm_release_pfn_clean(pfn);
3256                         pfn &= ~mask;
3257                         kvm_get_pfn(pfn);
3258                         *pfnp = pfn;
3259                 }
3260         }
3261 }
3262
3263 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3264                                 kvm_pfn_t pfn, unsigned access, int *ret_val)
3265 {
3266         /* The pfn is invalid, report the error! */
3267         if (unlikely(is_error_pfn(pfn))) {
3268                 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3269                 return true;
3270         }
3271
3272         if (unlikely(is_noslot_pfn(pfn)))
3273                 vcpu_cache_mmio_info(vcpu, gva, gfn, access);
3274
3275         return false;
3276 }
3277
3278 static bool page_fault_can_be_fast(u32 error_code)
3279 {
3280         /*
3281          * Do not fix the mmio spte with invalid generation number which
3282          * need to be updated by slow page fault path.
3283          */
3284         if (unlikely(error_code & PFERR_RSVD_MASK))
3285                 return false;
3286
3287         /* See if the page fault is due to an NX violation */
3288         if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3289                       == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3290                 return false;
3291
3292         /*
3293          * #PF can be fast if:
3294          * 1. The shadow page table entry is not present, which could mean that
3295          *    the fault is potentially caused by access tracking (if enabled).
3296          * 2. The shadow page table entry is present and the fault
3297          *    is caused by write-protect; in that case we just need to change
3298          *    the W bit of the spte, which can be done outside of the mmu-lock.
3299          *
3300          * However, if access tracking is disabled we know that a non-present
3301          * page must be a genuine page fault where we have to create a new SPTE.
3302          * So, if access tracking is disabled, we return true only for write
3303          * accesses to a present page.
3304          */
3305
3306         return shadow_acc_track_mask != 0 ||
3307                ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3308                 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
3309 }
3310
3311 /*
3312  * Returns true if the SPTE was fixed successfully. Otherwise,
3313  * someone else modified the SPTE from its original value.
3314  */
3315 static bool
3316 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
3317                         u64 *sptep, u64 old_spte, u64 new_spte)
3318 {
3319         gfn_t gfn;
3320
3321         WARN_ON(!sp->role.direct);
3322
3323         /*
3324          * Theoretically we could also set dirty bit (and flush TLB) here in
3325          * order to eliminate unnecessary PML logging. See comments in
3326          * set_spte. But fast_page_fault is very unlikely to happen with PML
3327          * enabled, so we do not do this. This might result in the same GPA
3328          * being logged in the PML buffer again when the write really happens,
3329          * and in mark_page_dirty eventually being called on it twice, but that
3330          * is harmless. This also avoids the TLB flush needed after setting the
3331          * dirty bit, so non-PML cases won't be impacted.
3332          *
3333          * Compare with set_spte where instead shadow_dirty_mask is set.
3334          */
3335         if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3336                 return false;
3337
3338         if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
3339                 /*
3340                  * The gfn of a direct spte is stable since it is
3341                  * calculated from sp->gfn.
3342                  */
3343                 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3344                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3345         }
3346
3347         return true;
3348 }
3349
3350 static bool is_access_allowed(u32 fault_err_code, u64 spte)
3351 {
3352         if (fault_err_code & PFERR_FETCH_MASK)
3353                 return is_executable_pte(spte);
3354
3355         if (fault_err_code & PFERR_WRITE_MASK)
3356                 return is_writable_pte(spte);
3357
3358         /* Fault was on Read access */
3359         return spte & PT_PRESENT_MASK;
3360 }
3361
3362 /*
3363  * Return value:
3364  * - true: let the vcpu access the same address again.
3365  * - false: let the real page fault path fix it.
3366  */
3367 static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
3368                             u32 error_code)
3369 {
3370         struct kvm_shadow_walk_iterator iterator;
3371         struct kvm_mmu_page *sp;
3372         bool fault_handled = false;
3373         u64 spte = 0ull;
3374         uint retry_count = 0;
3375
3376         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3377                 return false;
3378
3379         if (!page_fault_can_be_fast(error_code))
3380                 return false;
3381
3382         walk_shadow_page_lockless_begin(vcpu);
3383
3384         do {
3385                 u64 new_spte;
3386
3387                 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
3388                         if (!is_shadow_present_pte(spte) ||
3389                             iterator.level < level)
3390                                 break;
3391
3392                 sp = page_header(__pa(iterator.sptep));
3393                 if (!is_last_spte(spte, sp->role.level))
3394                         break;
3395
3396                 /*
3397                  * Check whether the memory access that caused the fault would
3398                  * still cause it if it were to be performed right now. If not,
3399                  * then this is a spurious fault caused by a lazily flushed TLB,
3400                  * or some other CPU has already fixed the PTE after the
3401                  * current CPU took the fault.
3402                  *
3403                  * There is no need to check the access of upper level table
3404                  * entries since they are always ACC_ALL.
3405                  */
3406                 if (is_access_allowed(error_code, spte)) {
3407                         fault_handled = true;
3408                         break;
3409                 }
3410
3411                 new_spte = spte;
3412
3413                 if (is_access_track_spte(spte))
3414                         new_spte = restore_acc_track_spte(new_spte);
3415
3416                 /*
3417                  * Currently, to simplify the code, write-protection can
3418                  * be removed in the fast path only if the SPTE was
3419                  * write-protected for dirty-logging or access tracking.
3420                  */
3421                 if ((error_code & PFERR_WRITE_MASK) &&
3422                     spte_can_locklessly_be_made_writable(spte)) {
3424                         new_spte |= PT_WRITABLE_MASK;
3425
3426                         /*
3427                          * Do not fix write permission on the large spte.  Since
3428                          * we only mark the first page dirty in the dirty-bitmap in
3429                          * fast_pf_fix_direct_spte(), the other pages would be missed
3430                          * if the slot has dirty logging enabled.
3431                          *
3432                          * Instead, we let the slow page fault path create a
3433                          * normal spte to fix the access.
3434                          *
3435                          * See the comments in kvm_arch_commit_memory_region().
3436                          */
3437                         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3438                                 break;
3439                 }
3440
3441                 /* Verify that the fault can be handled in the fast path */
3442                 if (new_spte == spte ||
3443                     !is_access_allowed(error_code, new_spte))
3444                         break;
3445
3446                 /*
3447                  * Currently, fast page fault only works for direct mappings
3448                  * since the gfn is not stable for indirect shadow pages. See
3449                  * Documentation/virtual/kvm/locking.txt for more detail.
3450                  */
3451                 fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
3452                                                         iterator.sptep, spte,
3453                                                         new_spte);
3454                 if (fault_handled)
3455                         break;
3456
3457                 if (++retry_count > 4) {
3458                         printk_once(KERN_WARNING
3459                                 "kvm: Fast #PF retrying more than 4 times.\n");
3460                         break;
3461                 }
3462
3463         } while (true);
3464
3465         trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
3466                               spte, fault_handled);
3467         walk_shadow_page_lockless_end(vcpu);
3468
3469         return fault_handled;
3470 }
3471
3472 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3473                          gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
3474 static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
3475
3476 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3477                          gfn_t gfn, bool prefault)
3478 {
3479         int r;
3480         int level;
3481         bool force_pt_level = false;
3482         kvm_pfn_t pfn;
3483         unsigned long mmu_seq;
3484         bool map_writable, write = error_code & PFERR_WRITE_MASK;
3485
3486         level = mapping_level(vcpu, gfn, &force_pt_level);
3487         if (likely(!force_pt_level)) {
3488                 /*
3489                  * This path builds a PAE pagetable, so we can map
3490                  * 2MB pages at most. Therefore, check whether the level
3491                  * is larger than that.
3492                  */
3493                 if (level > PT_DIRECTORY_LEVEL)
3494                         level = PT_DIRECTORY_LEVEL;
3495
3496                 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3497         }
3498
3499         if (fast_page_fault(vcpu, v, level, error_code))
3500                 return RET_PF_RETRY;
3501
3502         mmu_seq = vcpu->kvm->mmu_notifier_seq;
3503         smp_rmb();
3504
3505         if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
3506                 return RET_PF_RETRY;
3507
3508         if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
3509                 return r;
3510
3511         spin_lock(&vcpu->kvm->mmu_lock);
3512         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3513                 goto out_unlock;
3514         if (make_mmu_pages_available(vcpu) < 0)
3515                 goto out_unlock;
3516         if (likely(!force_pt_level))
3517                 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
3518         r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
3519         spin_unlock(&vcpu->kvm->mmu_lock);
3520
3521         return r;
3522
3523 out_unlock:
3524         spin_unlock(&vcpu->kvm->mmu_lock);
3525         kvm_release_pfn_clean(pfn);
3526         return RET_PF_RETRY;
3527 }
3528
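     /*
      * Drop the reference that *root_hpa holds on its shadow page; if the page
      * is now unreferenced and already marked invalid, queue it on invalid_list
      * for zapping.  *root_hpa is reset to INVALID_PAGE either way.
      */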
3529 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3530                                struct list_head *invalid_list)
3531 {
3532         struct kvm_mmu_page *sp;
3533
3534         if (!VALID_PAGE(*root_hpa))
3535                 return;
3536
3537         sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3538         --sp->root_count;
3539         if (!sp->root_count && sp->role.invalid)
3540                 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3541
3542         *root_hpa = INVALID_PAGE;
3543 }
3544
3545 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3546 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3547                         ulong roots_to_free)
3548 {
3549         int i;
3550         LIST_HEAD(invalid_list);
3551         bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3552
3553         BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3554
3555         /* Before acquiring the MMU lock, see if we need to do any real work. */
3556         if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
3557                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3558                         if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3559                             VALID_PAGE(mmu->prev_roots[i].hpa))
3560                                 break;
3561
3562                 if (i == KVM_MMU_NUM_PREV_ROOTS)
3563                         return;
3564         }
3565
3566         spin_lock(&vcpu->kvm->mmu_lock);
3567
3568         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3569                 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3570                         mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
3571                                            &invalid_list);
3572
3573         if (free_active_root) {
3574                 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3575                     (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3576                         mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
3577                                            &invalid_list);
3578                 } else {
3579                         for (i = 0; i < 4; ++i)
3580                                 if (mmu->pae_root[i] != 0)
3581                                         mmu_free_root_page(vcpu->kvm,
3582                                                            &mmu->pae_root[i],
3583                                                            &invalid_list);
3584                         mmu->root_hpa = INVALID_PAGE;
3585                 }
3586                 mmu->root_cr3 = 0;
3587         }
3588
3589         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3590         spin_unlock(&vcpu->kvm->mmu_lock);
3591 }
3592 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3593
3594 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3595 {
3596         int ret = 0;
3597
3598         if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
3599                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3600                 ret = 1;
3601         }
3602
3603         return ret;
3604 }
3605
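     /*
      * Allocate the root(s) for a direct MMU: a single root page for 4- or
      * 5-level paging, or four PAE roots, each covering 1GB of the guest
      * physical address space.
      */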
3606 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3607 {
3608         struct kvm_mmu_page *sp;
3609         unsigned i;
3610
3611         if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3612                 spin_lock(&vcpu->kvm->mmu_lock);
3613                 if (make_mmu_pages_available(vcpu) < 0) {
3614                         spin_unlock(&vcpu->kvm->mmu_lock);
3615                         return -ENOSPC;
3616                 }
3617                 sp = kvm_mmu_get_page(vcpu, 0, 0,
3618                                 vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
3619                 ++sp->root_count;
3620                 spin_unlock(&vcpu->kvm->mmu_lock);
3621                 vcpu->arch.mmu->root_hpa = __pa(sp->spt);
3622         } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
3623                 for (i = 0; i < 4; ++i) {
3624                         hpa_t root = vcpu->arch.mmu->pae_root[i];
3625
3626                         MMU_WARN_ON(VALID_PAGE(root));
3627                         spin_lock(&vcpu->kvm->mmu_lock);
3628                         if (make_mmu_pages_available(vcpu) < 0) {
3629                                 spin_unlock(&vcpu->kvm->mmu_lock);
3630                                 return -ENOSPC;
3631                         }
3632                         sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
3633                                         i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
3634                         root = __pa(sp->spt);
3635                         ++sp->root_count;
3636                         spin_unlock(&vcpu->kvm->mmu_lock);
3637                         vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
3638                 }
3639                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3640         } else
3641                 BUG();
3642         vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3643
3644         return 0;
3645 }
3646
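     /*
      * Allocate the root(s) that shadow the guest's own page table: one root
      * for a long mode guest, or four PAE roots built from the guest PDPTEs
      * (or the 32-bit page directory) otherwise.  A 32-bit guest on a 4-level
      * shadow additionally gets an lm_root page that points at pae_root.
      */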
3647 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3648 {
3649         struct kvm_mmu_page *sp;
3650         u64 pdptr, pm_mask;
3651         gfn_t root_gfn, root_cr3;
3652         int i;
3653
3654         root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3655         root_gfn = root_cr3 >> PAGE_SHIFT;
3656
3657         if (mmu_check_root(vcpu, root_gfn))
3658                 return 1;
3659
3660         /*
3661          * Do we shadow a long mode page table? If so we need to
3662          * write-protect the guest's page table root.
3663          */
3664         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3665                 hpa_t root = vcpu->arch.mmu->root_hpa;
3666
3667                 MMU_WARN_ON(VALID_PAGE(root));
3668
3669                 spin_lock(&vcpu->kvm->mmu_lock);
3670                 if (make_mmu_pages_available(vcpu) < 0) {
3671                         spin_unlock(&vcpu->kvm->mmu_lock);
3672                         return -ENOSPC;
3673                 }
3674                 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3675                                 vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
3676                 root = __pa(sp->spt);
3677                 ++sp->root_count;
3678                 spin_unlock(&vcpu->kvm->mmu_lock);
3679                 vcpu->arch.mmu->root_hpa = root;
3680                 goto set_root_cr3;
3681         }
3682
3683         /*
3684          * We shadow a 32 bit page table. This may be a legacy 2-level
3685          * or a PAE 3-level page table. In either case we need to be aware that
3686          * the shadow page table may be a PAE or a long mode page table.
3687          */
3688         pm_mask = PT_PRESENT_MASK;
3689         if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3690                 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3691
3692         for (i = 0; i < 4; ++i) {
3693                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3694
3695                 MMU_WARN_ON(VALID_PAGE(root));
3696                 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
3697                         pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
3698                         if (!(pdptr & PT_PRESENT_MASK)) {
3699                                 vcpu->arch.mmu->pae_root[i] = 0;
3700                                 continue;
3701                         }
3702                         root_gfn = pdptr >> PAGE_SHIFT;
3703                         if (mmu_check_root(vcpu, root_gfn))
3704                                 return 1;
3705                 }
3706                 spin_lock(&vcpu->kvm->mmu_lock);
3707                 if (make_mmu_pages_available(vcpu) < 0) {
3708                         spin_unlock(&vcpu->kvm->mmu_lock);
3709                         return -ENOSPC;
3710                 }
3711                 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3712                                       0, ACC_ALL);
3713                 root = __pa(sp->spt);
3714                 ++sp->root_count;
3715                 spin_unlock(&vcpu->kvm->mmu_lock);
3716
3717                 vcpu->arch.mmu->pae_root[i] = root | pm_mask;
3718         }
3719         vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3720
3721         /*
3722          * If we shadow a 32 bit page table with a long mode page
3723          * table we enter this path.
3724          */
3725         if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
3726                 if (vcpu->arch.mmu->lm_root == NULL) {
3727                         /*
3728                          * The additional page necessary for this is only
3729                          * allocated on demand.
3730                          */
3731
3732                         u64 *lm_root;
3733
3734                         lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3735                         if (lm_root == NULL)
3736                                 return 1;
3737
3738                         lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
3739
3740                         vcpu->arch.mmu->lm_root = lm_root;
3741                 }
3742
3743                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3744         }
3745
3746 set_root_cr3:
3747         vcpu->arch.mmu->root_cr3 = root_cr3;
3748
3749         return 0;
3750 }
3751
3752 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3753 {
3754         if (vcpu->arch.mmu->direct_map)
3755                 return mmu_alloc_direct_roots(vcpu);
3756         else
3757                 return mmu_alloc_shadow_roots(vcpu);
3758 }
3759
3760 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3761 {
3762         int i;
3763         struct kvm_mmu_page *sp;
3764
3765         if (vcpu->arch.mmu->direct_map)
3766                 return;
3767
3768         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3769                 return;
3770
3771         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3772
3773         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3774                 hpa_t root = vcpu->arch.mmu->root_hpa;
3775                 sp = page_header(root);
3776
3777                 /*
3778                  * Even if another CPU was marking the SP as unsync-ed
3779                  * simultaneously, any guest page table changes are not
3780                  * guaranteed to be visible anyway until this VCPU issues a TLB
3781                  * flush strictly after those changes are made. We only need to
3782                  * ensure that the other CPU sets these flags before any actual
3783                  * changes to the page tables are made. The comments in
3784                  * mmu_need_write_protect() describe what could go wrong if this
3785                  * requirement isn't satisfied.
3786                  */
3787                 if (!smp_load_acquire(&sp->unsync) &&
3788                     !smp_load_acquire(&sp->unsync_children))
3789                         return;
3790
3791                 spin_lock(&vcpu->kvm->mmu_lock);
3792                 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3793
3794                 mmu_sync_children(vcpu, sp);
3795
3796                 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3797                 spin_unlock(&vcpu->kvm->mmu_lock);
3798                 return;
3799         }
3800
3801         spin_lock(&vcpu->kvm->mmu_lock);
3802         kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3803
3804         for (i = 0; i < 4; ++i) {
3805                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3806
3807                 if (root && VALID_PAGE(root)) {
3808                         root &= PT64_BASE_ADDR_MASK;
3809                         sp = page_header(root);
3810                         mmu_sync_children(vcpu, sp);
3811                 }
3812         }
3813
3814         kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3815         spin_unlock(&vcpu->kvm->mmu_lock);
3816 }
3817 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3818
3819 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3820                                   u32 access, struct x86_exception *exception)
3821 {
3822         if (exception)
3823                 exception->error_code = 0;
3824         return vaddr;
3825 }
3826
3827 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
3828                                          u32 access,
3829                                          struct x86_exception *exception)
3830 {
3831         if (exception)
3832                 exception->error_code = 0;
3833         return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
3834 }
3835
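     /*
      * Check a PTE against the reserved-bit masks: bit 7 selects the
      * large-page variant of the mask for this level, and the low six bits
      * index bad_mt_xwr for the EPT memory-type/XWR check.
      */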
3836 static bool
3837 __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
3838 {
3839         int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
3840
3841         return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
3842                 ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
3843 }
3844
3845 static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3846 {
3847         return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
3848 }
3849
3850 static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
3851 {
3852         return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
3853 }
3854
3855 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3856 {
3857         /*
3858          * A nested guest cannot use the MMIO cache if it is using nested
3859          * page tables, because cr2 is a nGPA while the cache stores GPAs.
3860          */
3861         if (mmu_is_nested(vcpu))
3862                 return false;
3863
3864         if (direct)
3865                 return vcpu_match_mmio_gpa(vcpu, addr);
3866
3867         return vcpu_match_mmio_gva(vcpu, addr);
3868 }
3869
3870 /* return true if reserved bit is detected on spte. */
3871 static bool
3872 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3873 {
3874         struct kvm_shadow_walk_iterator iterator;
3875         u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
3876         int root, leaf;
3877         bool reserved = false;
3878
3879         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3880                 goto exit;
3881
3882         walk_shadow_page_lockless_begin(vcpu);
3883
3884         for (shadow_walk_init(&iterator, vcpu, addr),
3885                  leaf = root = iterator.level;
3886              shadow_walk_okay(&iterator);
3887              __shadow_walk_next(&iterator, spte)) {
3888                 spte = mmu_spte_get_lockless(iterator.sptep);
3889
3890                 sptes[leaf - 1] = spte;
3891                 leaf--;
3892
3893                 if (!is_shadow_present_pte(spte))
3894                         break;
3895
3896                 reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
3897                                                     iterator.level);
3898         }
3899
3900         walk_shadow_page_lockless_end(vcpu);
3901
3902         if (reserved) {
3903                 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
3904                        __func__, addr);
3905                 while (root > leaf) {
3906                         pr_err("------ spte 0x%llx level %d.\n",
3907                                sptes[root - 1], root);
3908                         root--;
3909                 }
3910         }
3911 exit:
3912         *sptep = spte;
3913         return reserved;
3914 }
3915
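     /*
      * Handle a fault on what should be an MMIO spte: recover the gfn and
      * access bits cached in the spte and hand the access to the emulator, or
      * let the guest retry if the spte has already been zapped.
      */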
3916 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3917 {
3918         u64 spte;
3919         bool reserved;
3920
3921         if (mmio_info_in_cache(vcpu, addr, direct))
3922                 return RET_PF_EMULATE;
3923
3924         reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
3925         if (WARN_ON(reserved))
3926                 return -EINVAL;
3927
3928         if (is_mmio_spte(spte)) {
3929                 gfn_t gfn = get_mmio_spte_gfn(spte);
3930                 unsigned access = get_mmio_spte_access(spte);
3931
3932                 if (!check_mmio_spte(vcpu, spte))
3933                         return RET_PF_INVALID;
3934
3935                 if (direct)
3936                         addr = 0;
3937
3938                 trace_handle_mmio_page_fault(addr, gfn, access);
3939                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3940                 return RET_PF_EMULATE;
3941         }
3942
3943         /*
3944          * If the page table is zapped by other CPUs, let the CPU fault again
3945          * on the address.
3946          */
3947         return RET_PF_RETRY;
3948 }
3949
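     /*
      * Returns true when the fault is a non-reserved, present write to a gfn
      * that is being write-tracked; such faults have to be emulated because
      * the fault handler is not allowed to make the gfn writable.
      */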
3950 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
3951                                          u32 error_code, gfn_t gfn)
3952 {
3953         if (unlikely(error_code & PFERR_RSVD_MASK))
3954                 return false;
3955
3956         if (!(error_code & PFERR_PRESENT_MASK) ||
3957               !(error_code & PFERR_WRITE_MASK))
3958                 return false;
3959
3960         /*
3961          * The guest is writing a page which is write-tracked, which cannot
3962          * be fixed by the page fault handler.
3963          */
3964         if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
3965                 return true;
3966
3967         return false;
3968 }
3969
3970 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
3971 {
3972         struct kvm_shadow_walk_iterator iterator;
3973         u64 spte;
3974
3975         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3976                 return;
3977
3978         walk_shadow_page_lockless_begin(vcpu);
3979         for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
3980                 clear_sp_write_flooding_count(iterator.sptep);
3981                 if (!is_shadow_present_pte(spte))
3982                         break;
3983         }
3984         walk_shadow_page_lockless_end(vcpu);
3985 }
3986
3987 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3988                                 u32 error_code, bool prefault)
3989 {
3990         gfn_t gfn = gva >> PAGE_SHIFT;
3991         int r;
3992
3993         pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
3994
3995         if (page_fault_handle_page_track(vcpu, error_code, gfn))
3996                 return RET_PF_EMULATE;
3997
3998         r = mmu_topup_memory_caches(vcpu);
3999         if (r)
4000                 return r;
4001
4002         MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4003
4005         return nonpaging_map(vcpu, gva & PAGE_MASK,
4006                              error_code, gfn, prefault);
4007 }
4008
4009 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
4010 {
4011         struct kvm_arch_async_pf arch;
4012
4013         arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4014         arch.gfn = gfn;
4015         arch.direct_map = vcpu->arch.mmu->direct_map;
4016         arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
4017
4018         return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4019 }
4020
4021 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
4022 {
4023         if (unlikely(!lapic_in_kernel(vcpu) ||
4024                      kvm_event_needs_reinjection(vcpu) ||
4025                      vcpu->arch.exception.pending))
4026                 return false;
4027
4028         if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
4029                 return false;
4030
4031         return kvm_x86_ops->interrupt_allowed(vcpu);
4032 }
4033
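     /*
      * Resolve gfn to a host pfn.  Returns true when the fault should simply
      * be retried because an asynchronous page fault has been queued (or the
      * vcpu was asked to halt while waiting for one); otherwise returns false
      * with *pfn filled in.
      */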
4034 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
4035                          gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
4036 {
4037         struct kvm_memory_slot *slot;
4038         bool async;
4039
4040         /*
4041          * Don't expose private memslots to L2.
4042          */
4043         if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
4044                 *pfn = KVM_PFN_NOSLOT;
4045                 return false;
4046         }
4047
4048         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
4049         async = false;
4050         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
4051         if (!async)
4052                 return false; /* *pfn has correct page already */
4053
4054         if (!prefault && kvm_can_do_async_pf(vcpu)) {
4055                 trace_kvm_try_async_get_page(gva, gfn);
4056                 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
4057                         trace_kvm_async_pf_doublefault(gva, gfn);
4058                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4059                         return true;
4060                 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
4061                         return true;
4062         }
4063
4064         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
4065         return false;
4066 }
4067
4068 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4069                                 u64 fault_address, char *insn, int insn_len)
4070 {
4071         int r = 1;
4072
4073         vcpu->arch.l1tf_flush_l1d = true;
4074         switch (vcpu->arch.apf.host_apf_reason) {
4075         default:
4076                 trace_kvm_page_fault(fault_address, error_code);
4077
4078                 if (kvm_event_needs_reinjection(vcpu))
4079                         kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4080                 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4081                                 insn_len);
4082                 break;
4083         case KVM_PV_REASON_PAGE_NOT_PRESENT:
4084                 vcpu->arch.apf.host_apf_reason = 0;
4085                 local_irq_disable();
4086                 kvm_async_pf_task_wait(fault_address, 0);
4087                 local_irq_enable();
4088                 break;
4089         case KVM_PV_REASON_PAGE_READY:
4090                 vcpu->arch.apf.host_apf_reason = 0;
4091                 local_irq_disable();
4092                 kvm_async_pf_task_wake(fault_address);
4093                 local_irq_enable();
4094                 break;
4095         }
4096         return r;
4097 }
4098 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4099
4100 static bool
4101 check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
4102 {
4103         int page_num = KVM_PAGES_PER_HPAGE(level);
4104
4105         gfn &= ~(page_num - 1);
4106
4107         return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
4108 }
4109
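     /*
      * Handle a fault taken with two-dimensional paging (EPT/NPT) enabled:
      * gpa is a guest physical address, so the mapping is installed directly
      * with __direct_map() after picking the largest page size that the
      * memslot and MTRR configuration allow.
      */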
4110 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
4111                           bool prefault)
4112 {
4113         kvm_pfn_t pfn;
4114         int r;
4115         int level;
4116         bool force_pt_level;
4117         gfn_t gfn = gpa >> PAGE_SHIFT;
4118         unsigned long mmu_seq;
4119         int write = error_code & PFERR_WRITE_MASK;
4120         bool map_writable;
4121
4122         MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4123
4124         if (page_fault_handle_page_track(vcpu, error_code, gfn))
4125                 return RET_PF_EMULATE;
4126
4127         r = mmu_topup_memory_caches(vcpu);
4128         if (r)
4129                 return r;
4130
4131         force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
4132                                                            PT_DIRECTORY_LEVEL);
4133         level = mapping_level(vcpu, gfn, &force_pt_level);
4134         if (likely(!force_pt_level)) {
4135                 if (level > PT_DIRECTORY_LEVEL &&
4136                     !check_hugepage_cache_consistency(vcpu, gfn, level))
4137                         level = PT_DIRECTORY_LEVEL;
4138                 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
4139         }
4140
4141         if (fast_page_fault(vcpu, gpa, level, error_code))
4142                 return RET_PF_RETRY;
4143
4144         mmu_seq = vcpu->kvm->mmu_notifier_seq;
4145         smp_rmb();
4146
4147         if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
4148                 return RET_PF_RETRY;
4149
4150         if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
4151                 return r;
4152
4153         spin_lock(&vcpu->kvm->mmu_lock);
4154         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4155                 goto out_unlock;
4156         if (make_mmu_pages_available(vcpu) < 0)
4157                 goto out_unlock;
4158         if (likely(!force_pt_level))
4159                 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
4160         r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
4161         spin_unlock(&vcpu->kvm->mmu_lock);
4162
4163         return r;
4164
4165 out_unlock:
4166         spin_unlock(&vcpu->kvm->mmu_lock);
4167         kvm_release_pfn_clean(pfn);
4168         return RET_PF_RETRY;
4169 }
4170
4171 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4172                                    struct kvm_mmu *context)
4173 {
4174         context->page_fault = nonpaging_page_fault;
4175         context->gva_to_gpa = nonpaging_gva_to_gpa;
4176         context->sync_page = nonpaging_sync_page;
4177         context->invlpg = nonpaging_invlpg;
4178         context->update_pte = nonpaging_update_pte;
4179         context->root_level = 0;
4180         context->shadow_root_level = PT32E_ROOT_LEVEL;
4181         context->direct_map = true;
4182         context->nx = false;
4183 }
4184
4185 /*
4186  * Find out if a previously cached root matching the new CR3/role is available.
4187  * The current root is also inserted into the cache.
4188  * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
4189  * returned.
4190  * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
4191  * false is returned. This root should now be freed by the caller.
4192  */
4193 static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4194                                   union kvm_mmu_page_role new_role)
4195 {
4196         uint i;
4197         struct kvm_mmu_root_info root;
4198         struct kvm_mmu *mmu = vcpu->arch.mmu;
4199
4200         root.cr3 = mmu->root_cr3;
4201         root.hpa = mmu->root_hpa;
4202
4203         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4204                 swap(root, mmu->prev_roots[i]);
4205
4206                 if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
4207                     page_header(root.hpa) != NULL &&
4208                     new_role.word == page_header(root.hpa)->role.word)
4209                         break;
4210         }
4211
4212         mmu->root_hpa = root.hpa;
4213         mmu->root_cr3 = root.cr3;
4214
4215         return i < KVM_MMU_NUM_PREV_ROOTS;
4216 }
4217
4218 static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4219                             union kvm_mmu_page_role new_role,
4220                             bool skip_tlb_flush)
4221 {
4222         struct kvm_mmu *mmu = vcpu->arch.mmu;
4223
4224         /*
4225          * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
4226          * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4227          * later if necessary.
4228          */
4229         if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4230             mmu->root_level >= PT64_ROOT_4LEVEL) {
4231                 if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
4232                         return false;
4233
4234                 if (cached_root_available(vcpu, new_cr3, new_role)) {
4235                         kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4236                         if (!skip_tlb_flush) {
4237                                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4238                                 kvm_x86_ops->tlb_flush(vcpu, true);
4239                         }
4240
4241                         /*
4242                          * The last MMIO access's GVA and GPA are cached in the
4243                          * VCPU. When switching to a new CR3, that GVA->GPA
4244                          * mapping may no longer be valid. So clear any cached
4245                          * MMIO info even when we don't need to sync the shadow
4246                          * page tables.
4247                          */
4248                         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4249
4250                         __clear_sp_write_flooding_count(
4251                                 page_header(mmu->root_hpa));
4252
4253                         return true;
4254                 }
4255         }
4256
4257         return false;
4258 }
4259
4260 static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4261                               union kvm_mmu_page_role new_role,
4262                               bool skip_tlb_flush)
4263 {
4264         if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
4265                 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
4266                                    KVM_MMU_ROOT_CURRENT);
4267 }
4268
4269 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
4270 {
4271         __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
4272                           skip_tlb_flush);
4273 }
4274 EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
4275
4276 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4277 {
4278         return kvm_read_cr3(vcpu);
4279 }
4280
4281 static void inject_page_fault(struct kvm_vcpu *vcpu,
4282                               struct x86_exception *fault)
4283 {
4284         vcpu->arch.mmu->inject_page_fault(vcpu, fault);
4285 }
4286
4287 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4288                            unsigned access, int *nr_present)
4289 {
4290         if (unlikely(is_mmio_spte(*sptep))) {
4291                 if (gfn != get_mmio_spte_gfn(*sptep)) {
4292                         mmu_spte_clear_no_track(sptep);
4293                         return true;
4294                 }
4295
4296                 (*nr_present)++;
4297                 mark_mmio_spte(vcpu, sptep, gfn, access);
4298                 return true;
4299         }
4300
4301         return false;
4302 }
4303
4304 static inline bool is_last_gpte(struct kvm_mmu *mmu,
4305                                 unsigned level, unsigned gpte)
4306 {
4307         /*
4308          * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
4309          * If it is clear, there are no large pages at this level, so clear
4310          * PT_PAGE_SIZE_MASK in gpte if that is the case.
4311          */
4312         gpte &= level - mmu->last_nonleaf_level;
4313
4314         /*
4315          * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
4316          * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
4317          * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
4318          */
4319         gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
4320
4321         return gpte & PT_PAGE_SIZE_MASK;
4322 }
4323
4324 #define PTTYPE_EPT 18 /* arbitrary */
4325 #define PTTYPE PTTYPE_EPT
4326 #include "paging_tmpl.h"
4327 #undef PTTYPE
4328
4329 #define PTTYPE 64
4330 #include "paging_tmpl.h"
4331 #undef PTTYPE
4332
4333 #define PTTYPE 32
4334 #include "paging_tmpl.h"
4335 #undef PTTYPE
4336
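     /*
      * Fill in rsvd_check with the reserved-bit masks for every level of a
      * legacy or IA-32e page table, based on the guest MAXPHYADDR and on
      * whether NX, 1GB pages, PSE and the AMD non-leaf bit 8 quirk apply.
      */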
4337 static void
4338 __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4339                         struct rsvd_bits_validate *rsvd_check,
4340                         int maxphyaddr, int level, bool nx, bool gbpages,
4341                         bool pse, bool amd)
4342 {
4343         u64 exb_bit_rsvd = 0;
4344         u64 gbpages_bit_rsvd = 0;
4345         u64 nonleaf_bit8_rsvd = 0;
4346
4347         rsvd_check->bad_mt_xwr = 0;
4348
4349         if (!nx)
4350                 exb_bit_rsvd = rsvd_bits(63, 63);
4351         if (!gbpages)
4352                 gbpages_bit_rsvd = rsvd_bits(7, 7);
4353
4354         /*
4355          * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4356          * leaf entries) on AMD CPUs only.
4357          */
4358         if (amd)
4359                 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4360
4361         switch (level) {
4362         case PT32_ROOT_LEVEL:
4363                 /* no rsvd bits for 2 level 4K page table entries */
4364                 rsvd_check->rsvd_bits_mask[0][1] = 0;
4365                 rsvd_check->rsvd_bits_mask[0][0] = 0;
4366                 rsvd_check->rsvd_bits_mask[1][0] =
4367                         rsvd_check->rsvd_bits_mask[0][0];
4368
4369                 if (!pse) {
4370                         rsvd_check->rsvd_bits_mask[1][1] = 0;
4371                         break;
4372                 }
4373
4374                 if (is_cpuid_PSE36())
4375                         /* 36bits PSE 4MB page */
4376                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4377                 else
4378                         /* 32 bits PSE 4MB page */
4379                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4380                 break;
4381         case PT32E_ROOT_LEVEL:
4382                 rsvd_check->rsvd_bits_mask[0][2] =
4383                         rsvd_bits(maxphyaddr, 63) |
4384                         rsvd_bits(5, 8) | rsvd_bits(1, 2);      /* PDPTE */
4385                 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4386                         rsvd_bits(maxphyaddr, 62);      /* PDE */
4387                 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4388                         rsvd_bits(maxphyaddr, 62);      /* PTE */
4389                 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4390                         rsvd_bits(maxphyaddr, 62) |
4391                         rsvd_bits(13, 20);              /* large page */
4392                 rsvd_check->rsvd_bits_mask[1][0] =
4393                         rsvd_check->rsvd_bits_mask[0][0];
4394                 break;
4395         case PT64_ROOT_5LEVEL:
4396                 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4397                         nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4398                         rsvd_bits(maxphyaddr, 51);
4399                 rsvd_check->rsvd_bits_mask[1][4] =
4400                         rsvd_check->rsvd_bits_mask[0][4];
4401                 /* fall through */
4402         case PT64_ROOT_4LEVEL:
4403                 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4404                         nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4405                         rsvd_bits(maxphyaddr, 51);
4406                 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4407                         nonleaf_bit8_rsvd | gbpages_bit_rsvd |
4408                         rsvd_bits(maxphyaddr, 51);
4409                 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4410                         rsvd_bits(maxphyaddr, 51);
4411                 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4412                         rsvd_bits(maxphyaddr, 51);
4413                 rsvd_check->rsvd_bits_mask[1][3] =
4414                         rsvd_check->rsvd_bits_mask[0][3];
4415                 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
4416                         gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
4417                         rsvd_bits(13, 29);
4418                 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4419                         rsvd_bits(maxphyaddr, 51) |
4420                         rsvd_bits(13, 20);              /* large page */
4421                 rsvd_check->rsvd_bits_mask[1][0] =
4422                         rsvd_check->rsvd_bits_mask[0][0];
4423                 break;
4424         }
4425 }
4426
4427 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4428                                   struct kvm_mmu *context)
4429 {
4430         __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4431                                 cpuid_maxphyaddr(vcpu), context->root_level,
4432                                 context->nx,
4433                                 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4434                                 is_pse(vcpu), guest_cpuid_is_amd(vcpu));
4435 }
4436
4437 static void
4438 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4439                             int maxphyaddr, bool execonly)
4440 {
4441         u64 bad_mt_xwr;
4442
4443         rsvd_check->rsvd_bits_mask[0][4] =
4444                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4445         rsvd_check->rsvd_bits_mask[0][3] =
4446                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4447         rsvd_check->rsvd_bits_mask[0][2] =
4448                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4449         rsvd_check->rsvd_bits_mask[0][1] =
4450                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4451         rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4452
4453         /* large page */
4454         rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4455         rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4456         rsvd_check->rsvd_bits_mask[1][2] =
4457                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
4458         rsvd_check->rsvd_bits_mask[1][1] =
4459                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
4460         rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4461
4462         bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
4463         bad_mt_xwr |= 0xFFull << (3 * 8);       /* bits 3..5 must not be 3 */
4464         bad_mt_xwr |= 0xFFull << (7 * 8);       /* bits 3..5 must not be 7 */
4465         bad_mt_xwr |= REPEAT_BYTE(1ull << 2);   /* bits 0..2 must not be 010 */
4466         bad_mt_xwr |= REPEAT_BYTE(1ull << 6);   /* bits 0..2 must not be 110 */
4467         if (!execonly) {
4468                 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4469                 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4470         }
4471         rsvd_check->bad_mt_xwr = bad_mt_xwr;
4472 }
4473
4474 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4475                 struct kvm_mmu *context, bool execonly)
4476 {
4477         __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4478                                     cpuid_maxphyaddr(vcpu), execonly);
4479 }
4480
4481 /*
4482  * The page table on the host is the shadow page table for the page
4483  * table in the guest or an AMD nested guest; its MMU features
4484  * completely follow the features of the guest.
4485  */
4486 void
4487 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4488 {
4489         bool uses_nx = context->nx ||
4490                 context->mmu_role.base.smep_andnot_wp;
4491         struct rsvd_bits_validate *shadow_zero_check;
4492         int i;
4493
4494         /*
4495          * Passing "true" to the last argument is okay; it adds a check
4496          * on bit 8 of the SPTEs which KVM doesn't use anyway.
4497          */
4498         shadow_zero_check = &context->shadow_zero_check;
4499         __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4500                                 boot_cpu_data.x86_phys_bits,
4501                                 context->shadow_root_level, uses_nx,
4502                                 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4503                                 is_pse(vcpu), true);
4504
4505         if (!shadow_me_mask)
4506                 return;
4507
4508         for (i = context->shadow_root_level; --i >= 0;) {
4509                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4510                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4511         }
4512
4513 }
4514 EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4515
4516 static inline bool boot_cpu_is_amd(void)
4517 {
4518         WARN_ON_ONCE(!tdp_enabled);
4519         return shadow_x_mask == 0;
4520 }
4521
4522 /*
4523  * The direct page table on the host uses as many MMU features as
4524  * possible; however, KVM currently does not do execution-protection.
4525  */
4526 static void
4527 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4528                                 struct kvm_mmu *context)
4529 {
4530         struct rsvd_bits_validate *shadow_zero_check;
4531         int i;
4532
4533         shadow_zero_check = &context->shadow_zero_check;
4534
4535         if (boot_cpu_is_amd())
4536                 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4537                                         boot_cpu_data.x86_phys_bits,
4538                                         context->shadow_root_level, false,
4539                                         boot_cpu_has(X86_FEATURE_GBPAGES),
4540                                         true, true);
4541         else
4542                 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4543                                             boot_cpu_data.x86_phys_bits,
4544                                             false);
4545
4546         if (!shadow_me_mask)
4547                 return;
4548
4549         for (i = context->shadow_root_level; --i >= 0;) {
4550                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4551                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4552         }
4553 }
4554
4555 /*
4556  * Same as the comments in reset_shadow_zero_bits_mask(), except this is
4557  * the shadow page table for an Intel nested guest.
4558  */
4559 static void
4560 reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4561                                 struct kvm_mmu *context, bool execonly)
4562 {
4563         __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4564                                     boot_cpu_data.x86_phys_bits, execonly);
4565 }
4566
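/*
 * BYTE_MASK(access) builds a byte in which bit i (i = 1..7, a UWX
 * combination of pte permissions) is set iff combination i includes the
 * given ACC_* permission, e.g. BYTE_MASK(ACC_WRITE_MASK) == 0xCC.
 */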
4567 #define BYTE_MASK(access) \
4568         ((1 & (access) ? 2 : 0) | \
4569          (2 & (access) ? 4 : 0) | \
4570          (3 & (access) ? 8 : 0) | \
4571          (4 & (access) ? 16 : 0) | \
4572          (5 & (access) ? 32 : 0) | \
4573          (6 & (access) ? 64 : 0) | \
4574          (7 & (access) ? 128 : 0))
4575
4576
4577 static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4578                                       struct kvm_mmu *mmu, bool ept)
4579 {
4580         unsigned byte;
4581
4582         const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4583         const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4584         const u8 u = BYTE_MASK(ACC_USER_MASK);
4585
4586         bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4587         bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4588         bool cr0_wp = is_write_protection(vcpu);
4589
4590         for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4591                 unsigned pfec = byte << 1;
4592
4593                 /*
4594                  * Each "*f" variable has a 1 bit for each UWX value
4595                  * that causes a fault with the given PFEC.
4596                  */
4597
4598                 /* Faults from writes to non-writable pages */
4599                 u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0;
4600                 /* Faults from user mode accesses to supervisor pages */
4601                 u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0;
4602                 /* Faults from fetches of non-executable pages */
4603                 u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0;
4604                 /* Faults from kernel mode fetches of user pages */
4605                 u8 smepf = 0;
4606                 /* Faults from kernel mode accesses of user pages */
4607                 u8 smapf = 0;
4608
4609                 if (!ept) {
4610                         /* Faults from kernel mode accesses to user pages */
4611                         u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4612
4613                         /* Not really needed: !nx will cause pte.nx to fault */
4614                         if (!mmu->nx)
4615                                 ff = 0;
4616
4617                         /* Allow supervisor writes if !cr0.wp */
4618                         if (!cr0_wp)
4619                                 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4620
4621                         /* Disallow supervisor fetches of user code if cr4.smep */
4622                         if (cr4_smep)
4623                                 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4624
4625                         /*
4626                          * SMAP:kernel-mode data accesses from user-mode
4627                          * mappings should fault. A fault is considered
4628                          * as a SMAP violation if all of the following
4629                          * conditions are true:
4630                          *   - X86_CR4_SMAP is set in CR4
4631                          *   - A user page is accessed
4632                          *   - The access is not a fetch
4633                          *   - Page fault in kernel mode
4634                          *   - if CPL = 3 or X86_EFLAGS_AC is clear
4635                          *
4636                          * Here, we cover the first four conditions.
4637                          * The fifth is computed dynamically in permission_fault();
4638                          * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4639                          * *not* subject to SMAP restrictions.
4640                          */
4641                         if (cr4_smap)
4642                                 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4643                 }
4644
4645                 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4646         }
4647 }
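
#if 0	/* Illustrative sketch, not built; example_* is not a real kernel symbol. */
/*
 * Roughly how mmu->permissions[] is consulted: each entry has one bit per
 * UWX combination of the pte, set when that combination faults for the
 * given PFEC.  The real lookup in permission_fault() additionally folds
 * the dynamic SMAP state (CPL and EFLAGS.AC) into the index.
 */
static bool example_permission_fault(struct kvm_mmu *mmu, unsigned pfec,
				     unsigned pte_access /* UWX, 0..7 */)
{
	return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
}
#endif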
4648
4649 /*
4650 * PKU is an additional mechanism by which paging controls access to
4651 * user-mode addresses based on the value in the PKRU register.  Protection
4652 * key violations are reported through a bit in the page fault error code.
4653 * Unlike other bits of the error code, the PK bit is not known at the
4654 * call site of e.g. gva_to_gpa; it must be computed directly in
4655 * permission_fault based on two bits of PKRU, on some machine state (CR4,
4656 * CR0, EFER, CPL), and on other bits of the error code and the page tables.
4657 *
4658 * In particular the following conditions come from the error code, the
4659 * page tables and the machine state:
4660 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4661 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4662 * - PK is always zero if U=0 in the page tables
4663 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4664 *
4665 * The PKRU bitmask caches the result of these four conditions.  The error
4666 * code (minus the P bit) and the page table's U bit form an index into the
4667 * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
4668 * with the two bits of the PKRU register corresponding to the protection key.
4669 * For the first three conditions above the bits will be 00, thus masking
4670 * away both AD and WD.  For all reads, or if the last condition holds,
4671 * only WD will be masked away.
4672 */
4673 static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4674                                 bool ept)
4675 {
4676         unsigned bit;
4677         bool wp;
4678
4679         if (ept) {
4680                 mmu->pkru_mask = 0;
4681                 return;
4682         }
4683
4684         /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
4685         if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
4686                 mmu->pkru_mask = 0;
4687                 return;
4688         }
4689
4690         wp = is_write_protection(vcpu);
4691
4692         for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4693                 unsigned pfec, pkey_bits;
4694                 bool check_pkey, check_write, ff, uf, wf, pte_user;
4695
4696                 pfec = bit << 1;
4697                 ff = pfec & PFERR_FETCH_MASK;
4698                 uf = pfec & PFERR_USER_MASK;
4699                 wf = pfec & PFERR_WRITE_MASK;
4700
4701                 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4702                 pte_user = pfec & PFERR_RSVD_MASK;
4703
4704                 /*
4705                  * Only need to check an access that is not an
4706                  * instruction fetch and is to a user page.
4707                  */
4708                 check_pkey = (!ff && pte_user);
4709                 /*
4710                  * write access is controlled by PKRU if it is a
4711                  * user access or CR0.WP = 1.
4712                  */
4713                 check_write = check_pkey && wf && (uf || wp);
4714
4715                 /* PKRU.AD stops both read and write access. */
4716                 pkey_bits = !!check_pkey;
4717                 /* PKRU.WD stops write access. */
4718                 pkey_bits |= (!!check_write) << 1;
4719
4720                 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4721         }
4722 }
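
#if 0	/* Illustrative sketch, not built; example_* is not a real kernel symbol. */
/*
 * Roughly how pkru_mask is consulted (simplified from permission_fault()):
 * PKRU holds two bits per protection key (AD at bit 2*key, WD at
 * 2*key + 1); the two pkru_mask bits at the offset derived from the error
 * code (with the pte's U bit in place of PFERR_RSVD_MASK) say whether the
 * key's AD/WD bits may fault this access.
 */
static bool example_pkey_fault(struct kvm_mmu *mmu, u32 pkru,
			       unsigned pte_pkey, unsigned offset)
{
	u32 pkru_bits = (pkru >> (pte_pkey * 2)) & 3;

	return (pkru_bits & (mmu->pkru_mask >> offset)) != 0;
}
#endif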
4723
4724 static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
4725 {
4726         unsigned root_level = mmu->root_level;
4727
4728         mmu->last_nonleaf_level = root_level;
4729         if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
4730                 mmu->last_nonleaf_level++;
4731 }
4732
4733 static void paging64_init_context_common(struct kvm_vcpu *vcpu,
4734                                          struct kvm_mmu *context,
4735                                          int level)
4736 {
4737         context->nx = is_nx(vcpu);
4738         context->root_level = level;
4739
4740         reset_rsvds_bits_mask(vcpu, context);
4741         update_permission_bitmask(vcpu, context, false);
4742         update_pkru_bitmask(vcpu, context, false);
4743         update_last_nonleaf_level(vcpu, context);
4744
4745         MMU_WARN_ON(!is_pae(vcpu));
4746         context->page_fault = paging64_page_fault;
4747         context->gva_to_gpa = paging64_gva_to_gpa;
4748         context->sync_page = paging64_sync_page;
4749         context->invlpg = paging64_invlpg;
4750         context->update_pte = paging64_update_pte;
4751         context->shadow_root_level = level;
4752         context->direct_map = false;
4753 }
4754
4755 static void paging64_init_context(struct kvm_vcpu *vcpu,
4756                                   struct kvm_mmu *context)
4757 {
4758         int root_level = is_la57_mode(vcpu) ?
4759                          PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4760
4761         paging64_init_context_common(vcpu, context, root_level);
4762 }
4763
4764 static void paging32_init_context(struct kvm_vcpu *vcpu,
4765                                   struct kvm_mmu *context)
4766 {
4767         context->nx = false;
4768         context->root_level = PT32_ROOT_LEVEL;
4769
4770         reset_rsvds_bits_mask(vcpu, context);
4771         update_permission_bitmask(vcpu, context, false);
4772         update_pkru_bitmask(vcpu, context, false);
4773         update_last_nonleaf_level(vcpu, context);
4774
4775         context->page_fault = paging32_page_fault;
4776         context->gva_to_gpa = paging32_gva_to_gpa;
4777         context->sync_page = paging32_sync_page;
4778         context->invlpg = paging32_invlpg;
4779         context->update_pte = paging32_update_pte;
4780         context->shadow_root_level = PT32E_ROOT_LEVEL;
4781         context->direct_map = false;
4782 }
4783
4784 static void paging32E_init_context(struct kvm_vcpu *vcpu,
4785                                    struct kvm_mmu *context)
4786 {
4787         paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
4788 }
4789
4790 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
4791 {
4792         union kvm_mmu_extended_role ext = {0};
4793
4794         ext.cr0_pg = !!is_paging(vcpu);
4795         ext.cr4_pae = !!is_pae(vcpu);
4796         ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
4797         ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
4798         ext.cr4_pse = !!is_pse(vcpu);
4799         ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
4800         ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
4801         ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
4802
4803         ext.valid = 1;
4804
4805         return ext;
4806 }
4807
4808 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4809                                                    bool base_only)
4810 {
4811         union kvm_mmu_role role = {0};
4812
4813         role.base.access = ACC_ALL;
4814         role.base.nxe = !!is_nx(vcpu);
4815         role.base.cr0_wp = is_write_protection(vcpu);
4816         role.base.smm = is_smm(vcpu);
4817         role.base.guest_mode = is_guest_mode(vcpu);
4818
4819         if (base_only)
4820                 return role;
4821
4822         role.ext = kvm_calc_mmu_role_ext(vcpu);
4823
4824         return role;
4825 }
4826
4827 static union kvm_mmu_role
4828 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4829 {
4830         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4831
4832         role.base.ad_disabled = (shadow_accessed_mask == 0);
4833         role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
4834         role.base.direct = true;
4835         role.base.gpte_is_8_bytes = true;
4836
4837         return role;
4838 }
4839
4840 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
4841 {
4842         struct kvm_mmu *context = vcpu->arch.mmu;
4843         union kvm_mmu_role new_role =
4844                 kvm_calc_tdp_mmu_root_page_role(vcpu, false);
4845
4846         new_role.base.word &= mmu_base_role_mask.word;
4847         if (new_role.as_u64 == context->mmu_role.as_u64)
4848                 return;
4849
4850         context->mmu_role.as_u64 = new_role.as_u64;
4851         context->page_fault = tdp_page_fault;
4852         context->sync_page = nonpaging_sync_page;
4853         context->invlpg = nonpaging_invlpg;
4854         context->update_pte = nonpaging_update_pte;
4855         context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
4856         context->direct_map = true;
4857         context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
4858         context->get_cr3 = get_cr3;
4859         context->get_pdptr = kvm_pdptr_read;
4860         context->inject_page_fault = kvm_inject_page_fault;
4861
4862         if (!is_paging(vcpu)) {
4863                 context->nx = false;
4864                 context->gva_to_gpa = nonpaging_gva_to_gpa;
4865                 context->root_level = 0;
4866         } else if (is_long_mode(vcpu)) {
4867                 context->nx = is_nx(vcpu);
4868                 context->root_level = is_la57_mode(vcpu) ?
4869                                 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4870                 reset_rsvds_bits_mask(vcpu, context);
4871                 context->gva_to_gpa = paging64_gva_to_gpa;
4872         } else if (is_pae(vcpu)) {
4873                 context->nx = is_nx(vcpu);
4874                 context->root_level = PT32E_ROOT_LEVEL;
4875                 reset_rsvds_bits_mask(vcpu, context);
4876                 context->gva_to_gpa = paging64_gva_to_gpa;
4877         } else {
4878                 context->nx = false;
4879                 context->root_level = PT32_ROOT_LEVEL;
4880                 reset_rsvds_bits_mask(vcpu, context);
4881                 context->gva_to_gpa = paging32_gva_to_gpa;
4882         }
4883
4884         update_permission_bitmask(vcpu, context, false);
4885         update_pkru_bitmask(vcpu, context, false);
4886         update_last_nonleaf_level(vcpu, context);
4887         reset_tdp_shadow_zero_bits_mask(vcpu, context);
4888 }
4889
4890 static union kvm_mmu_role
4891 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4892 {
4893         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4894
4895         role.base.smep_andnot_wp = role.ext.cr4_smep &&
4896                 !is_write_protection(vcpu);
4897         role.base.smap_andnot_wp = role.ext.cr4_smap &&
4898                 !is_write_protection(vcpu);
4899         role.base.direct = !is_paging(vcpu);
4900         role.base.gpte_is_8_bytes = !!is_pae(vcpu);
4901
4902         if (!is_long_mode(vcpu))
4903                 role.base.level = PT32E_ROOT_LEVEL;
4904         else if (is_la57_mode(vcpu))
4905                 role.base.level = PT64_ROOT_5LEVEL;
4906         else
4907                 role.base.level = PT64_ROOT_4LEVEL;
4908
4909         return role;
4910 }
4911
4912 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
4913 {
4914         struct kvm_mmu *context = vcpu->arch.mmu;
4915         union kvm_mmu_role new_role =
4916                 kvm_calc_shadow_mmu_root_page_role(vcpu, false);
4917
4918         new_role.base.word &= mmu_base_role_mask.word;
4919         if (new_role.as_u64 == context->mmu_role.as_u64)
4920                 return;
4921
4922         if (!is_paging(vcpu))
4923                 nonpaging_init_context(vcpu, context);
4924         else if (is_long_mode(vcpu))
4925                 paging64_init_context(vcpu, context);
4926         else if (is_pae(vcpu))
4927                 paging32E_init_context(vcpu, context);
4928         else
4929                 paging32_init_context(vcpu, context);
4930
4931         context->mmu_role.as_u64 = new_role.as_u64;
4932         reset_shadow_zero_bits_mask(vcpu, context);
4933 }
4934 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
4935
4936 static union kvm_mmu_role
4937 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
4938                                    bool execonly)
4939 {
4940         union kvm_mmu_role role = {0};
4941
4942         /* SMM flag is inherited from root_mmu */
4943         role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
4944
4945         role.base.level = PT64_ROOT_4LEVEL;
4946         role.base.gpte_is_8_bytes = true;
4947         role.base.direct = false;
4948         role.base.ad_disabled = !accessed_dirty;
4949         role.base.guest_mode = true;
4950         role.base.access = ACC_ALL;
4951
4952         /*
4953          * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
4954          * SMAP variation to denote shadow EPT entries.
4955          */
4956         role.base.cr0_wp = true;
4957         role.base.smap_andnot_wp = true;
4958
4959         role.ext = kvm_calc_mmu_role_ext(vcpu);
4960         role.ext.execonly = execonly;
4961
4962         return role;
4963 }
4964
4965 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
4966                              bool accessed_dirty, gpa_t new_eptp)
4967 {
4968         struct kvm_mmu *context = vcpu->arch.mmu;
4969         union kvm_mmu_role new_role =
4970                 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
4971                                                    execonly);
4972
4973         __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
4974
4975         new_role.base.word &= mmu_base_role_mask.word;
4976         if (new_role.as_u64 == context->mmu_role.as_u64)
4977                 return;
4978
4979         context->shadow_root_level = PT64_ROOT_4LEVEL;
4980
4981         context->nx = true;
4982         context->ept_ad = accessed_dirty;
4983         context->page_fault = ept_page_fault;
4984         context->gva_to_gpa = ept_gva_to_gpa;
4985         context->sync_page = ept_sync_page;
4986         context->invlpg = ept_invlpg;
4987         context->update_pte = ept_update_pte;
4988         context->root_level = PT64_ROOT_4LEVEL;
4989         context->direct_map = false;
4990         context->mmu_role.as_u64 = new_role.as_u64;
4991
4992         update_permission_bitmask(vcpu, context, true);
4993         update_pkru_bitmask(vcpu, context, true);
4994         update_last_nonleaf_level(vcpu, context);
4995         reset_rsvds_bits_mask_ept(vcpu, context, execonly);
4996         reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
4997 }
4998 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
4999
5000 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
5001 {
5002         struct kvm_mmu *context = vcpu->arch.mmu;
5003
5004         kvm_init_shadow_mmu(vcpu);
5005         context->set_cr3           = kvm_x86_ops->set_cr3;
5006         context->get_cr3           = get_cr3;
5007         context->get_pdptr         = kvm_pdptr_read;
5008         context->inject_page_fault = kvm_inject_page_fault;
5009 }
5010
5011 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
5012 {
5013         union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
5014         struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5015
5016         new_role.base.word &= mmu_base_role_mask.word;
5017         if (new_role.as_u64 == g_context->mmu_role.as_u64)
5018                 return;
5019
5020         g_context->mmu_role.as_u64 = new_role.as_u64;
5021         g_context->get_cr3           = get_cr3;
5022         g_context->get_pdptr         = kvm_pdptr_read;
5023         g_context->inject_page_fault = kvm_inject_page_fault;
5024
5025         /*
5026          * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5027          * L1's nested page tables (e.g. EPT12). The nested translation
5028          * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5029          * L2's page tables as the first level of translation and L1's
5030          * nested page tables as the second level of translation. Basically
5031          * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5032          */
5033         if (!is_paging(vcpu)) {
5034                 g_context->nx = false;
5035                 g_context->root_level = 0;
5036                 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
5037         } else if (is_long_mode(vcpu)) {
5038                 g_context->nx = is_nx(vcpu);
5039                 g_context->root_level = is_la57_mode(vcpu) ?
5040                                         PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5041                 reset_rsvds_bits_mask(vcpu, g_context);
5042                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5043         } else if (is_pae(vcpu)) {
5044                 g_context->nx = is_nx(vcpu);
5045                 g_context->root_level = PT32E_ROOT_LEVEL;
5046                 reset_rsvds_bits_mask(vcpu, g_context);
5047                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5048         } else {
5049                 g_context->nx = false;
5050                 g_context->root_level = PT32_ROOT_LEVEL;
5051                 reset_rsvds_bits_mask(vcpu, g_context);
5052                 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
5053         }
5054
5055         update_permission_bitmask(vcpu, g_context, false);
5056         update_pkru_bitmask(vcpu, g_context, false);
5057         update_last_nonleaf_level(vcpu, g_context);
5058 }
5059
5060 void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
5061 {
5062         if (reset_roots) {
5063                 uint i;
5064
5065                 vcpu->arch.mmu->root_hpa = INVALID_PAGE;
5066
5067                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5068                         vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5069         }
5070
5071         if (mmu_is_nested(vcpu))
5072                 init_kvm_nested_mmu(vcpu);
5073         else if (tdp_enabled)
5074                 init_kvm_tdp_mmu(vcpu);
5075         else
5076                 init_kvm_softmmu(vcpu);
5077 }
5078 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5079
5080 static union kvm_mmu_page_role
5081 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
5082 {
5083         union kvm_mmu_role role;
5084
5085         if (tdp_enabled)
5086                 role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
5087         else
5088                 role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
5089
5090         return role.base;
5091 }
5092
5093 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5094 {
5095         kvm_mmu_unload(vcpu);
5096         kvm_init_mmu(vcpu, true);
5097 }
5098 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5099
5100 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5101 {
5102         int r;
5103
5104         r = mmu_topup_memory_caches(vcpu);
5105         if (r)
5106                 goto out;
5107         r = mmu_alloc_roots(vcpu);
5108         kvm_mmu_sync_roots(vcpu);
5109         if (r)
5110                 goto out;
5111         kvm_mmu_load_cr3(vcpu);
5112         kvm_x86_ops->tlb_flush(vcpu, true);
5113 out:
5114         return r;
5115 }
5116 EXPORT_SYMBOL_GPL(kvm_mmu_load);
5117
5118 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5119 {
5120         kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5121         WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
5122         kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5123         WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
5124 }
5125 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
5126
5127 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
5128                                   struct kvm_mmu_page *sp, u64 *spte,
5129                                   const void *new)
5130 {
5131         if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
5132                 ++vcpu->kvm->stat.mmu_pde_zapped;
5133                 return;
5134         }
5135
5136         ++vcpu->kvm->stat.mmu_pte_updated;
5137         vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
5138 }
5139
5140 static bool need_remote_flush(u64 old, u64 new)
5141 {
5142         if (!is_shadow_present_pte(old))
5143                 return false;
5144         if (!is_shadow_present_pte(new))
5145                 return true;
5146         if ((old ^ new) & PT64_BASE_ADDR_MASK)
5147                 return true;
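	/*
	 * Flip NX so that, for every bit in PT64_PERM_MASK, "set" uniformly
	 * means "access allowed"; a remote flush is then needed only when a
	 * previously granted permission is revoked by the new spte.
	 */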
5148         old ^= shadow_nx_mask;
5149         new ^= shadow_nx_mask;
5150         return (old & ~new & PT64_PERM_MASK) != 0;
5151 }
5152
5153 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5154                                     int *bytes)
5155 {
5156         u64 gentry = 0;
5157         int r;
5158
5159         /*
5160          * Assume that the pte write is on a page table of the same type
5161          * as the current vcpu paging mode, since we update the sptes only
5162          * when they have the same mode.
5163          */
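	/*
	 * E.g. a PAE guest writing 4 bytes at gpa 0x1004 is widened below to
	 * an 8-byte read of the whole gpte at gpa 0x1000.
	 */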
5164         if (is_pae(vcpu) && *bytes == 4) {
5165                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5166                 *gpa &= ~(gpa_t)7;
5167                 *bytes = 8;
5168         }
5169
5170         if (*bytes == 4 || *bytes == 8) {
5171                 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5172                 if (r)
5173                         gentry = 0;
5174         }
5175
5176         return gentry;
5177 }
5178
5179 /*
5180  * If we're seeing too many writes to a page, it may no longer be a page table,
5181  * or we may be forking, in which case it is better to unmap the page.
5182  */
5183 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5184 {
5185         /*
5186          * Skip write-flooding detection for an sp whose level is 1, because
5187          * it can become unsync and then the guest page is not write-protected.
5188          */
5189         if (sp->role.level == PT_PAGE_TABLE_LEVEL)
5190                 return false;
5191
5192         atomic_inc(&sp->write_flooding_count);
5193         return atomic_read(&sp->write_flooding_count) >= 3;
5194 }
5195
5196 /*
5197  * Misaligned accesses are too much trouble to fix up; also, they usually
5198  * indicate a page is not used as a page table.
5199  */
5200 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5201                                     int bytes)
5202 {
5203         unsigned offset, pte_size, misaligned;
5204
5205         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5206                  gpa, bytes, sp->role.word);
5207
5208         offset = offset_in_page(gpa);
5209         pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
5210
5211         /*
5212          * Sometimes, the OS only writes the last byte to update status
5213          * bits; in Linux, for example, clear_bit() uses the andb instruction.
5214          */
5215         if (!(offset & (pte_size - 1)) && bytes == 1)
5216                 return false;
5217
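	/*
	 * The XOR exposes every offset bit that differs between the first and
	 * last byte of the write; masking with ~(pte_size - 1) keeps only
	 * differences above the gpte size, i.e. the write crosses a gpte
	 * boundary.  E.g. offset 6, bytes 4, pte_size 8: 6 ^ 9 = 0xf and
	 * 0xf & ~7 = 8, so the access is misaligned.
	 */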
5218         misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5219         misaligned |= bytes < 4;
5220
5221         return misaligned;
5222 }
5223
5224 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5225 {
5226         unsigned page_offset, quadrant;
5227         u64 *spte;
5228         int level;
5229
5230         page_offset = offset_in_page(gpa);
5231         level = sp->role.level;
5232         *nspte = 1;
5233         if (!sp->role.gpte_is_8_bytes) {
5234                 page_offset <<= 1;      /* 32->64 */
5235                 /*
5236                  * A 32-bit pde maps 4MB while the shadow pdes map
5237                  * only 2MB.  So we need to double the offset again
5238                  * and zap two pdes instead of one.
5239                  */
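		/*
		 * E.g. the guest pde at offset 0x10 (entry 4) is shadowed by
		 * the two shadow pdes at offsets 0x40 and 0x48 within the
		 * shadow page (entries 8 and 9).
		 */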
5240                 if (level == PT32_ROOT_LEVEL) {
5241                         page_offset &= ~7; /* kill rounding error */
5242                         page_offset <<= 1;
5243                         *nspte = 2;
5244                 }
5245                 quadrant = page_offset >> PAGE_SHIFT;
5246                 page_offset &= ~PAGE_MASK;
5247                 if (quadrant != sp->role.quadrant)
5248                         return NULL;
5249         }
5250
5251         spte = &sp->spt[page_offset / sizeof(*spte)];
5252         return spte;
5253 }
5254
5255 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5256                               const u8 *new, int bytes,
5257                               struct kvm_page_track_notifier_node *node)
5258 {
5259         gfn_t gfn = gpa >> PAGE_SHIFT;
5260         struct kvm_mmu_page *sp;
5261         LIST_HEAD(invalid_list);
5262         u64 entry, gentry, *spte;
5263         int npte;
5264         bool remote_flush, local_flush;
5265
5266         /*
5267          * If we don't have indirect shadow pages, it means no page is
5268          * write-protected, so we can simply exit.
5269          */
5270         if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5271                 return;
5272
5273         remote_flush = local_flush = false;
5274
5275         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5276
5277         /*
5278          * No need to care whether the memory allocation is successful
5279          * or not, since pte prefetch is skipped if the cache does not
5280          * have enough objects.
5281          */
5282         mmu_topup_memory_caches(vcpu);
5283
5284         spin_lock(&vcpu->kvm->mmu_lock);
5285
5286         gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5287
5288         ++vcpu->kvm->stat.mmu_pte_write;
5289         kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
5290
5291         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5292                 if (detect_write_misaligned(sp, gpa, bytes) ||
5293                       detect_write_flooding(sp)) {
5294                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5295                         ++vcpu->kvm->stat.mmu_flooded;
5296                         continue;
5297                 }
5298
5299                 spte = get_written_sptes(sp, gpa, &npte);
5300                 if (!spte)
5301                         continue;
5302
5303                 local_flush = true;
5304                 while (npte--) {
5305                         u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
5306
5307                         entry = *spte;
5308                         mmu_page_zap_pte(vcpu->kvm, sp, spte);
5309                         if (gentry &&
5310                               !((sp->role.word ^ base_role)
5311                               & mmu_base_role_mask.word) && rmap_can_add(vcpu))
5312                                 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
5313                         if (need_remote_flush(entry, *spte))
5314                                 remote_flush = true;
5315                         ++spte;
5316                 }
5317         }
5318         kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
5319         kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
5320         spin_unlock(&vcpu->kvm->mmu_lock);
5321 }
5322
5323 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
5324 {
5325         gpa_t gpa;
5326         int r;
5327
5328         if (vcpu->arch.mmu->direct_map)
5329                 return 0;
5330
5331         gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
5332
5333         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
5334
5335         return r;
5336 }
5337 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
5338
5339 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
5340 {
5341         LIST_HEAD(invalid_list);
5342
5343         if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
5344                 return 0;
5345
5346         while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
5347                 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
5348                         break;
5349
5350                 ++vcpu->kvm->stat.mmu_recycled;
5351         }
5352         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
5353
5354         if (!kvm_mmu_available_pages(vcpu->kvm))
5355                 return -ENOSPC;
5356         return 0;
5357 }
5358
5359 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
5360                        void *insn, int insn_len)
5361 {
5362         int r, emulation_type = 0;
5363         enum emulation_result er;
5364         bool direct = vcpu->arch.mmu->direct_map;
5365
5366         /* With shadow page tables, fault_address contains a GVA or nGPA.  */
5367         if (vcpu->arch.mmu->direct_map) {
5368                 vcpu->arch.gpa_available = true;
5369                 vcpu->arch.gpa_val = cr2;
5370         }
5371
5372         r = RET_PF_INVALID;
5373         if (unlikely(error_code & PFERR_RSVD_MASK)) {
5374                 r = handle_mmio_page_fault(vcpu, cr2, direct);
5375                 if (r == RET_PF_EMULATE)
5376                         goto emulate;
5377         }
5378
5379         if (r == RET_PF_INVALID) {
5380                 r = vcpu->arch.mmu->page_fault(vcpu, cr2,
5381                                                lower_32_bits(error_code),
5382                                                false);
5383                 WARN_ON(r == RET_PF_INVALID);
5384         }
5385
5386         if (r == RET_PF_RETRY)
5387                 return 1;
5388         if (r < 0)
5389                 return r;
5390
5391         /*
5392          * Before emulating the instruction, check if the error code
5393          * was due to a RO violation while translating the guest page.
5394          * This can occur when using nested virtualization with nested
5395          * paging in both guests. If true, we simply unprotect the page
5396          * and resume the guest.
5397          */
5398         if (vcpu->arch.mmu->direct_map &&
5399             (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5400                 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
5401                 return 1;
5402         }
5403
5404         /*
5405          * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5406          * optimistically try to just unprotect the page and let the processor
5407          * re-execute the instruction that caused the page fault.  Do not allow
5408          * retrying MMIO emulation, as it's not only pointless but could also
5409          * cause us to enter an infinite loop because the processor will keep
5410          * faulting on the non-existent MMIO address.  Retrying an instruction
5411          * from a nested guest is also pointless and dangerous as we are only
5412          * explicitly shadowing L1's page tables, i.e. unprotecting something
5413          * for L1 isn't going to magically fix whatever issue caused L2 to fail.
5414          */
5415         if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
5416                 emulation_type = EMULTYPE_ALLOW_RETRY;
5417 emulate:
5418         /*
5419          * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
5420          * This can happen if a guest gets a page-fault on data access but the HW
5421          * table walker is not able to read the instruction page (e.g. instruction
5422          * page is not present in memory). In those cases we simply restart the
5423          * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
5424          */
5425         if (unlikely(insn && !insn_len)) {
5426                 if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
5427                         return 1;
5428         }
5429
5430         er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
5431
5432         switch (er) {
5433         case EMULATE_DONE:
5434                 return 1;
5435         case EMULATE_USER_EXIT:
5436                 ++vcpu->stat.mmio_exits;
5437                 /* fall through */
5438         case EMULATE_FAIL:
5439                 return 0;
5440         default:
5441                 BUG();
5442         }
5443 }
5444 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5445
5446 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5447 {
5448         struct kvm_mmu *mmu = vcpu->arch.mmu;
5449         int i;
5450
5451         /* INVLPG on a non-canonical address is a NOP according to the SDM.  */
5452         if (is_noncanonical_address(gva, vcpu))
5453                 return;
5454
5455         mmu->invlpg(vcpu, gva, mmu->root_hpa);
5456
5457         /*
5458          * INVLPG is required to invalidate any global mappings for the VA,
5459          * irrespective of PCID. Since it would take us roughly the same
5460          * amount of work to determine whether any of the prev_root mappings
5461          * of the VA is marked global as it would to just sync it blindly, we
5462          * might as well just always sync it.
5463          *
5464          * Mappings not reachable via the current cr3 or the prev_roots will be
5465          * synced when switching to that cr3, so nothing needs to be done here
5466          * for them.
5467          */
5468         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5469                 if (VALID_PAGE(mmu->prev_roots[i].hpa))
5470                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5471
5472         kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5473         ++vcpu->stat.invlpg;
5474 }
5475 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5476
5477 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5478 {
5479         struct kvm_mmu *mmu = vcpu->arch.mmu;
5480         bool tlb_flush = false;
5481         uint i;
5482
5483         if (pcid == kvm_get_active_pcid(vcpu)) {
5484                 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5485                 tlb_flush = true;
5486         }
5487
5488         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5489                 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5490                     pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
5491                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5492                         tlb_flush = true;
5493                 }
5494         }
5495
5496         if (tlb_flush)
5497                 kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5498
5499         ++vcpu->stat.invlpg;
5500
5501         /*
5502          * Mappings not reachable via the current cr3 or the prev_roots will be
5503          * synced when switching to that cr3, so nothing needs to be done here
5504          * for them.
5505          */
5506 }
5507 EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
5508
5509 void kvm_enable_tdp(void)
5510 {
5511         tdp_enabled = true;
5512 }
5513 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
5514
5515 void kvm_disable_tdp(void)
5516 {
5517         tdp_enabled = false;
5518 }
5519 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
5520
5521
5522 /* The return value indicates if tlb flush on all vcpus is needed. */
5523 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
5524
5525 /* The caller should hold mmu-lock before calling this function. */
5526 static __always_inline bool
5527 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5528                         slot_level_handler fn, int start_level, int end_level,
5529                         gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5530 {
5531         struct slot_rmap_walk_iterator iterator;
5532         bool flush = false;
5533
5534         for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5535                         end_gfn, &iterator) {
5536                 if (iterator.rmap)
5537                         flush |= fn(kvm, iterator.rmap);
5538
5539                 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5540                         if (flush && lock_flush_tlb) {
5541                                 kvm_flush_remote_tlbs_with_address(kvm,
5542                                                 start_gfn,
5543                                                 iterator.gfn - start_gfn + 1);
5544                                 flush = false;
5545                         }
5546                         cond_resched_lock(&kvm->mmu_lock);
5547                 }
5548         }
5549
5550         if (flush && lock_flush_tlb) {
5551                 kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
5552                                                    end_gfn - start_gfn + 1);
5553                 flush = false;
5554         }
5555
5556         return flush;
5557 }
5558
5559 static __always_inline bool
5560 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5561                   slot_level_handler fn, int start_level, int end_level,
5562                   bool lock_flush_tlb)
5563 {
5564         return slot_handle_level_range(kvm, memslot, fn, start_level,
5565                         end_level, memslot->base_gfn,
5566                         memslot->base_gfn + memslot->npages - 1,
5567                         lock_flush_tlb);
5568 }
5569
5570 static __always_inline bool
5571 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5572                       slot_level_handler fn, bool lock_flush_tlb)
5573 {
5574         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5575                                  PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5576 }
5577
5578 static __always_inline bool
5579 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5580                         slot_level_handler fn, bool lock_flush_tlb)
5581 {
5582         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
5583                                  PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5584 }
5585
5586 static __always_inline bool
5587 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5588                  slot_level_handler fn, bool lock_flush_tlb)
5589 {
5590         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5591                                  PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
5592 }
5593
5594 static void free_mmu_pages(struct kvm_vcpu *vcpu)
5595 {
5596         free_page((unsigned long)vcpu->arch.mmu->pae_root);
5597         free_page((unsigned long)vcpu->arch.mmu->lm_root);
5598 }
5599
5600 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
5601 {
5602         struct page *page;
5603         int i;
5604
5605         if (tdp_enabled)
5606                 return 0;
5607
5608         /*
5609          * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
5610          * Therefore we need to allocate shadow page tables in the first
5611          * 4GB of memory, which happens to fit the DMA32 zone.
5612          */
5613         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5614         if (!page)
5615                 return -ENOMEM;
5616
5617         vcpu->arch.mmu->pae_root = page_address(page);
5618         for (i = 0; i < 4; ++i)
5619                 vcpu->arch.mmu->pae_root[i] = INVALID_PAGE;
5620
5621         return 0;
5622 }
5623
5624 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5625 {
5626         uint i;
5627
5628         vcpu->arch.mmu = &vcpu->arch.root_mmu;
5629         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5630
5631         vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
5632         vcpu->arch.root_mmu.root_cr3 = 0;
5633         vcpu->arch.root_mmu.translate_gpa = translate_gpa;
5634         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5635                 vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5636
5637         vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
5638         vcpu->arch.guest_mmu.root_cr3 = 0;
5639         vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
5640         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5641                 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5642
5643         vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5644         return alloc_mmu_pages(vcpu);
5645 }
5646
5647 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5648                         struct kvm_memory_slot *slot,
5649                         struct kvm_page_track_notifier_node *node)
5650 {
5651         struct kvm_mmu_page *sp;
5652         LIST_HEAD(invalid_list);
5653         unsigned long i;
5654         bool flush;
5655         gfn_t gfn;
5656
5657         spin_lock(&kvm->mmu_lock);
5658
5659         if (list_empty(&kvm->arch.active_mmu_pages))
5660                 goto out_unlock;
5661
5662         flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false);
5663
5664         for (i = 0; i < slot->npages; i++) {
5665                 gfn = slot->base_gfn + i;
5666
5667                 for_each_valid_sp(kvm, sp, gfn) {
5668                         if (sp->gfn != gfn)
5669                                 continue;
5670
5671                         kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
5672                 }
5673                 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5674                         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
5675                         flush = false;
5676                         cond_resched_lock(&kvm->mmu_lock);
5677                 }
5678         }
5679         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
5680
5681 out_unlock:
5682         spin_unlock(&kvm->mmu_lock);
5683 }
5684
5685 void kvm_mmu_init_vm(struct kvm *kvm)
5686 {
5687         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5688
5689         node->track_write = kvm_mmu_pte_write;
5690         node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5691         kvm_page_track_register_notifier(kvm, node);
5692 }
5693
5694 void kvm_mmu_uninit_vm(struct kvm *kvm)
5695 {
5696         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5697
5698         kvm_page_track_unregister_notifier(kvm, node);
5699 }
5700
5701 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5702 {
5703         struct kvm_memslots *slots;
5704         struct kvm_memory_slot *memslot;
5705         int i;
5706
5707         spin_lock(&kvm->mmu_lock);
5708         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5709                 slots = __kvm_memslots(kvm, i);
5710                 kvm_for_each_memslot(memslot, slots) {
5711                         gfn_t start, end;
5712
5713                         start = max(gfn_start, memslot->base_gfn);
5714                         end = min(gfn_end, memslot->base_gfn + memslot->npages);
5715                         if (start >= end)
5716                                 continue;
5717
5718                         slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5719                                                 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
5720                                                 start, end - 1, true);
5721                 }
5722         }
5723
5724         spin_unlock(&kvm->mmu_lock);
5725 }
5726
5727 static bool slot_rmap_write_protect(struct kvm *kvm,
5728                                     struct kvm_rmap_head *rmap_head)
5729 {
5730         return __rmap_write_protect(kvm, rmap_head, false);
5731 }
5732
5733 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5734                                       struct kvm_memory_slot *memslot)
5735 {
5736         bool flush;
5737
5738         spin_lock(&kvm->mmu_lock);
5739         flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
5740                                       false);
5741         spin_unlock(&kvm->mmu_lock);
5742
5743         /*
5744          * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log(),
5745          * which do a tlb flush out of mmu-lock, should be serialized by
5746          * kvm->slots_lock; otherwise the tlb flush could be missed.
5747          */
5748         lockdep_assert_held(&kvm->slots_lock);
5749
5750         /*
5751          * We can flush all the TLBs out of the mmu lock without TLB
5752          * corruption, since we only change the spte from writable to
5753          * readonly, so the only case we need to care about is a spte
5754          * changing from present to present (changing the spte from
5755          * present to nonpresent flushes all the TLBs immediately).
5756          * In other words, the only case we care about is
5757          * mmu_spte_update(), where we check SPTE_HOST_WRITEABLE |
5758          * SPTE_MMU_WRITEABLE instead of PT_WRITABLE_MASK, which means
5759          * it no longer depends on PT_WRITABLE_MASK.
5760          */
5761         if (flush)
5762                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5763                         memslot->npages);
5764 }
5765
5766 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
5767                                          struct kvm_rmap_head *rmap_head)
5768 {
5769         u64 *sptep;
5770         struct rmap_iterator iter;
5771         int need_tlb_flush = 0;
5772         kvm_pfn_t pfn;
5773         struct kvm_mmu_page *sp;
5774
5775 restart:
5776         for_each_rmap_spte(rmap_head, &iter, sptep) {
5777                 sp = page_header(__pa(sptep));
5778                 pfn = spte_to_pfn(*sptep);
5779
5780                 /*
5781                  * We cannot do huge page mapping for indirect shadow pages,
5782                  * which are found on the last rmap (level = 1) when not using
5783                  * tdp; such shadow pages are synced with the page table in
5784                  * the guest, and the guest page table uses 4K page size
5785                  * mappings if the indirect sp has level = 1.
5786                  */
5787                 if (sp->role.direct &&
5788                         !kvm_is_reserved_pfn(pfn) &&
5789                         PageTransCompoundMap(pfn_to_page(pfn))) {
5790                         pte_list_remove(rmap_head, sptep);
5791
5792                         if (kvm_available_flush_tlb_with_range())
5793                                 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
5794                                         KVM_PAGES_PER_HPAGE(sp->role.level));
5795                         else
5796                                 need_tlb_flush = 1;
5797
5798                         goto restart;
5799                 }
5800         }
5801
5802         return need_tlb_flush;
5803 }
5804
5805 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
5806                                    const struct kvm_memory_slot *memslot)
5807 {
5808         /* FIXME: const-ify all uses of struct kvm_memory_slot.  */
5809         spin_lock(&kvm->mmu_lock);
5810         slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
5811                          kvm_mmu_zap_collapsible_spte, true);
5812         spin_unlock(&kvm->mmu_lock);
5813 }
5814
5815 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
5816                                    struct kvm_memory_slot *memslot)
5817 {
5818         bool flush;
5819
5820         spin_lock(&kvm->mmu_lock);
5821         flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
5822         spin_unlock(&kvm->mmu_lock);
5823
5824         lockdep_assert_held(&kvm->slots_lock);
5825
5826         /*
5827          * It's also safe to flush TLBs out of mmu lock here as currently this
5828          * function is only used for dirty logging, in which case flushing TLB
5829          * out of mmu lock also guarantees no dirty pages will be lost in
5830          * dirty_bitmap.
5831          */
5832         if (flush)
5833                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5834                                 memslot->npages);
5835 }
5836 EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
5837
5838 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
5839                                         struct kvm_memory_slot *memslot)
5840 {
5841         bool flush;
5842
5843         spin_lock(&kvm->mmu_lock);
5844         flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
5845                                         false);
5846         spin_unlock(&kvm->mmu_lock);
5847
5848         /* see kvm_mmu_slot_remove_write_access */
5849         lockdep_assert_held(&kvm->slots_lock);
5850
5851         if (flush)
5852                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5853                                 memslot->npages);
5854 }
5855 EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
5856
5857 void kvm_mmu_slot_set_dirty(struct kvm *kvm,
5858                             struct kvm_memory_slot *memslot)
5859 {
5860         bool flush;
5861
5862         spin_lock(&kvm->mmu_lock);
5863         flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
5864         spin_unlock(&kvm->mmu_lock);
5865
5866         lockdep_assert_held(&kvm->slots_lock);
5867
5868         /* see kvm_mmu_slot_leaf_clear_dirty */
5869         if (flush)
5870                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5871                                 memslot->npages);
5872 }
5873 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
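
/*
 * Hedged sketch of how a backend with hardware dirty logging (e.g. VMX PML)
 * might wire up the three slot helpers above: when logging is enabled, clear
 * the D bit on 4K SPTEs and write-protect only the large mappings; when
 * logging is disabled, set the D bit again so that writes no longer fault
 * just to mark pages dirty.  The hook names below are illustrative, not the
 * real callbacks.
 */
static void example_slot_enable_log_dirty(struct kvm *kvm,
					  struct kvm_memory_slot *slot)
{
	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
}

static void example_slot_disable_log_dirty(struct kvm *kvm,
					   struct kvm_memory_slot *slot)
{
	kvm_mmu_slot_set_dirty(kvm, slot);
}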
5874
5875 static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
5876 {
5877         struct kvm_mmu_page *sp, *node;
5878         LIST_HEAD(invalid_list);
5879         int ign;
5880
5881         spin_lock(&kvm->mmu_lock);
5882 restart:
5883         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
5884                 if (mmio_only && !sp->mmio_cached)
5885                         continue;
5886                 if (sp->role.invalid && sp->root_count)
5887                         continue;
5888                 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
5889                         WARN_ON_ONCE(mmio_only);
5890                         goto restart;
5891                 }
5892                 if (cond_resched_lock(&kvm->mmu_lock))
5893                         goto restart;
5894         }
5895
5896         kvm_mmu_commit_zap_page(kvm, &invalid_list);
5897         spin_unlock(&kvm->mmu_lock);
5898 }
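
/*
 * Hedged sketch of the restart pattern used above: both zapping a page and
 * cond_resched_lock() can invalidate the iterator (the latter by dropping
 * and re-taking the lock), so the walk must start over from the list head.
 * The structure and helpers below are hypothetical.
 */
struct example_obj {
	struct list_head link;
};

static bool example_zap_one(struct example_obj *obj)
{
	list_del(&obj->link);	/* stand-in for the real zap work */
	return true;		/* "true" means the list may have changed */
}

static void example_drain(struct kvm *kvm, struct list_head *head)
{
	struct example_obj *obj, *tmp;

	spin_lock(&kvm->mmu_lock);
restart:
	list_for_each_entry_safe(obj, tmp, head, link) {
		if (example_zap_one(obj))
			goto restart;
		if (cond_resched_lock(&kvm->mmu_lock))	/* lock was dropped */
			goto restart;
	}
	spin_unlock(&kvm->mmu_lock);
}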
5899
5900 void kvm_mmu_zap_all(struct kvm *kvm)
5901 {
5902         return __kvm_mmu_zap_all(kvm, false);
5903 }
5904
5905 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
5906 {
5907         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
5908
5909         gen &= MMIO_SPTE_GEN_MASK;
5910
5911         /*
5912          * Generation numbers are incremented in multiples of the number of
5913          * address spaces in order to provide unique generations across all
5914          * address spaces.  Strip what is effectively the address space
5915          * modifier prior to checking for a wrap of the MMIO generation so
5916          * that a wrap in any address space is detected.
5917          */
5918         gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
5919
5920         /*
5921          * The very rare case: if the MMIO generation number has wrapped,
5922          * zap all shadow pages.
5923          */
5924         if (unlikely(gen == 0)) {
5925                 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
5926                 __kvm_mmu_zap_all(kvm, true);
5927         }
5928 }
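
/*
 * Hedged illustration of the wrap check above, assuming
 * KVM_ADDRESS_SPACE_NUM == 2 so that the lowest generation bit is the
 * address space modifier: generations 2 and 3 both strip down to 2 (no
 * wrap), whereas a wrap to generation 0 or 1 strips down to 0 and forces
 * all MMIO shadow pages to be zapped.  The helper name is hypothetical.
 */
static inline bool example_mmio_gen_wrapped(u64 gen)
{
	gen &= MMIO_SPTE_GEN_MASK;
	gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
	return gen == 0;
}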
5929
5930 static unsigned long
5931 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
5932 {
5933         struct kvm *kvm;
5934         int nr_to_scan = sc->nr_to_scan;
5935         unsigned long freed = 0;
5936
5937         spin_lock(&kvm_lock);
5938
5939         list_for_each_entry(kvm, &vm_list, vm_list) {
5940                 int idx;
5941                 LIST_HEAD(invalid_list);
5942
5943                 /*
5944                  * Never scan more than sc->nr_to_scan VM instances.
5945                  * In practice this condition will not be hit, since we do
5946                  * not try to shrink more than one VM and it is very
5947                  * unlikely to see !n_used_mmu_pages so many times.
5948                  */
5949                 if (!nr_to_scan--)
5950                         break;
5951                 /*
5952                  * n_used_mmu_pages is accessed without holding kvm->mmu_lock
5953                  * here. We may skip a VM instance erroneously, but we do not
5954                  * want to shrink a VM that only started to populate its MMU
5955                  * anyway.
5956                  */
5957                 if (!kvm->arch.n_used_mmu_pages)
5958                         continue;
5959
5960                 idx = srcu_read_lock(&kvm->srcu);
5961                 spin_lock(&kvm->mmu_lock);
5962
5963                 if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
5964                         freed++;
5965                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
5966
5967                 spin_unlock(&kvm->mmu_lock);
5968                 srcu_read_unlock(&kvm->srcu, idx);
5969
5970                 /*
5971                  * unfair on small ones
5972                  * per-vm shrinkers cry out
5973                  * sadness comes quickly
5974                  */
5975                 list_move_tail(&kvm->vm_list, &vm_list);
5976                 break;
5977         }
5978
5979         spin_unlock(&kvm_lock);
5980         return freed;
5981 }
5982
5983 static unsigned long
5984 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
5985 {
5986         return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
5987 }
5988
5989 static struct shrinker mmu_shrinker = {
5990         .count_objects = mmu_shrink_count,
5991         .scan_objects = mmu_shrink_scan,
5992         .seeks = DEFAULT_SEEKS * 10,
5993 };
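
/*
 * Hedged sketch of the shrinker contract behind the ops above: under memory
 * pressure the core calls ->count_objects for an estimate of freeable
 * objects and ->scan_objects with a budget in sc->nr_to_scan, expecting the
 * number actually freed (or SHRINK_STOP to back off).  The elevated .seeks
 * value marks MMU pages as expensive to recreate, so this cache is shrunk
 * less aggressively.  The counter and reclaim helper below are hypothetical.
 */
static atomic_long_t example_nr_cached;

static unsigned long example_reclaim(unsigned long nr)
{
	/* Stand-in for real reclaim work; pretend the whole budget was freed. */
	atomic_long_sub(nr, &example_nr_cached);
	return nr;
}

static unsigned long example_shrink_count(struct shrinker *s,
					  struct shrink_control *sc)
{
	return atomic_long_read(&example_nr_cached);
}

static unsigned long example_shrink_scan(struct shrinker *s,
					 struct shrink_control *sc)
{
	unsigned long freed = example_reclaim(sc->nr_to_scan);

	return freed ? freed : SHRINK_STOP;
}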
5994
5995 static void mmu_destroy_caches(void)
5996 {
5997         kmem_cache_destroy(pte_list_desc_cache);
5998         kmem_cache_destroy(mmu_page_header_cache);
5999 }
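
/*
 * Note (hedged): kmem_cache_destroy() is a no-op when passed NULL, which is
 * why kvm_mmu_module_init() below can bail out through a single error label
 * and call this helper regardless of which allocation failed.
 */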
6000
6001 int kvm_mmu_module_init(void)
6002 {
6003         int ret = -ENOMEM;
6004
6005         /*
6006          * MMU roles use union aliasing which is, strictly speaking,
6007          * undefined behavior. However, we know how the compilers we care
6008          * about behave, and that status quo is unlikely to change. The
6009          * build-time checks below will warn us if this assumption breaks.
6010          */
6011         BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6012         BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6013         BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
6014
6015         kvm_mmu_reset_all_pte_masks();
6016
6017         pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6018                                             sizeof(struct pte_list_desc),
6019                                             0, SLAB_ACCOUNT, NULL);
6020         if (!pte_list_desc_cache)
6021                 goto out;
6022
6023         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6024                                                   sizeof(struct kvm_mmu_page),
6025                                                   0, SLAB_ACCOUNT, NULL);
6026         if (!mmu_page_header_cache)
6027                 goto out;
6028
6029         if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6030                 goto out;
6031
6032         ret = register_shrinker(&mmu_shrinker);
6033         if (ret)
6034                 goto out;
6035
6036         return 0;
6037
6038 out:
6039         mmu_destroy_caches();
6040         return ret;
6041 }
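
/*
 * Hedged illustration of the union aliasing guarded by the BUILD_BUG_ON()s
 * in kvm_mmu_module_init() above: a role is written through packed bitfields
 * but compared and hashed through an integer view of the same storage, so
 * the two views must stay exactly the same size.  The union below is a
 * simplified stand-in, not the real kvm_mmu_page_role layout.
 */
union example_role {
	u32 word;
	struct {
		unsigned int level:4;
		unsigned int direct:1;
		unsigned int access:3;
		unsigned int unused:24;
	};
};

static inline bool example_role_equal(union example_role a,
				      union example_role b)
{
	return a.word == b.word;	/* one compare covers every bitfield */
}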
6042
6043 /*
6044  * Calculate the default number of MMU pages needed for this VM.
6045  */
6046 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
6047 {
6048         unsigned long nr_mmu_pages;
6049         unsigned long nr_pages = 0;
6050         struct kvm_memslots *slots;
6051         struct kvm_memory_slot *memslot;
6052         int i;
6053
6054         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6055                 slots = __kvm_memslots(kvm, i);
6056
6057                 kvm_for_each_memslot(memslot, slots)
6058                         nr_pages += memslot->npages;
6059         }
6060
6061         nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
6062         nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
6063
6064         return nr_mmu_pages;
6065 }
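
/*
 * Hedged worked example of the sizing above (the 20-permille figure is an
 * assumption about KVM_PERMILLE_MMU_PAGES, not taken from this file): a VM
 * whose memslots cover 4 GiB, i.e. 1048576 pages of 4K, would get
 *
 *	1048576 * 20 / 1000 = 20971
 *
 * MMU pages, subject to the KVM_MIN_ALLOC_MMU_PAGES floor.
 */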
6066
6067 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6068 {
6069         kvm_mmu_unload(vcpu);
6070         free_mmu_pages(vcpu);
6071         mmu_free_memory_caches(vcpu);
6072 }
6073
6074 void kvm_mmu_module_exit(void)
6075 {
6076         mmu_destroy_caches();
6077         percpu_counter_destroy(&kvm_total_used_mmu_pages);
6078         unregister_shrinker(&mmu_shrinker);
6079         mmu_audit_disable();
6080 }