arch/x86/kvm/vmx/vmx.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine driver for Linux
   4  *
   5  * This module enables machines with Intel VT-x extensions to run virtual
   6  * machines without emulation or binary translation.
   7  *
   8  * Copyright (C) 2006 Qumranet, Inc.
   9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  10  *
  11  * Authors:
  12  *   Avi Kivity   <avi@qumranet.com>
  13  *   Yaniv Kamay  <yaniv@qumranet.com>
  14  */
  15
  16 #include <linux/frame.h>
  17 #include <linux/highmem.h>
  18 #include <linux/hrtimer.h>
  19 #include <linux/kernel.h>
  20 #include <linux/kvm_host.h>
  21 #include <linux/module.h>
  22 #include <linux/moduleparam.h>
  23 #include <linux/mod_devicetable.h>
  24 #include <linux/mm.h>
  25 #include <linux/sched.h>
  26 #include <linux/sched/smt.h>
  27 #include <linux/slab.h>
  28 #include <linux/tboot.h>
  29 #include <linux/trace_events.h>
  30 #include <linux/entry-kvm.h>
  31
  32 #include <asm/apic.h>
  33 #include <asm/asm.h>
  34 #include <asm/cpu.h>
  35 #include <asm/cpu_device_id.h>
  36 #include <asm/debugreg.h>
  37 #include <asm/desc.h>
  38 #include <asm/fpu/internal.h>
  39 #include <asm/io.h>
  40 #include <asm/irq_remapping.h>
  41 #include <asm/kexec.h>
  42 #include <asm/perf_event.h>
  43 #include <asm/mce.h>
  44 #include <asm/mmu_context.h>
  45 #include <asm/mshyperv.h>
  46 #include <asm/mwait.h>
  47 #include <asm/spec-ctrl.h>
  48 #include <asm/virtext.h>
  49 #include <asm/vmx.h>
  50
  51 #include "capabilities.h"
  52 #include "cpuid.h"
  53 #include "evmcs.h"
  54 #include "irq.h"
  55 #include "kvm_cache_regs.h"
  56 #include "lapic.h"
  57 #include "mmu.h"
  58 #include "nested.h"
  59 #include "pmu.h"
  60 #include "trace.h"
  61 #include "vmcs.h"
  62 #include "vmcs12.h"
  63 #include "vmx.h"
  64 #include "x86.h"
  65
/* Module metadata. */
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

#ifdef MODULE
/*
 * CPU match table registered through MODULE_DEVICE_TABLE().  It holds a
 * single entry matching X86_FEATURE_VMX followed by the empty terminator
 * entry, and is only compiled when MODULE is defined.
 */
static const struct x86_cpu_id vmx_cpu_id[] = {
        X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
        {}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif
  76
  77 bool __read_mostly enable_vpid = 1;
  78 module_param_named(vpid, enable_vpid, bool, 0444);
  79
  80 static bool __read_mostly enable_vnmi = 1;
  81 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
  82
  83 bool __read_mostly flexpriority_enabled = 1;
  84 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
  85
  86 bool __read_mostly enable_ept = 1;
  87 module_param_named(ept, enable_ept, bool, S_IRUGO);
  88
  89 bool __read_mostly enable_unrestricted_guest = 1;
  90 module_param_named(unrestricted_guest,
  91                         enable_unrestricted_guest, bool, S_IRUGO);
  92
  93 bool __read_mostly enable_ept_ad_bits = 1;
  94 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
  95
  96 static bool __read_mostly emulate_invalid_guest_state = true;
  97 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
  98
  99 static bool __read_mostly fasteoi = 1;
 100 module_param(fasteoi, bool, S_IRUGO);
 101
 102 bool __read_mostly enable_apicv = 1;
 103 module_param(enable_apicv, bool, S_IRUGO);
 104
 105 /*
 106  * If nested=1, nested virtualization is supported, i.e., guests may use
 107  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 108  * use VMX instructions.
 109  */
 110 static bool __read_mostly nested = 1;
 111 module_param(nested, bool, S_IRUGO);
 112
 113 bool __read_mostly enable_pml = 1;
 114 module_param_named(pml, enable_pml, bool, S_IRUGO);
 115
 116 static bool __read_mostly dump_invalid_vmcs = 0;
 117 module_param(dump_invalid_vmcs, bool, 0644);
 118
/*
 * MSR bitmap mode values.  NOTE(review): the values 1 and 2 look like
 * combinable flag bits; confirm against the users of these constants.
 */
#define MSR_BITMAP_MODE_X2APIC          1
#define MSR_BITMAP_MODE_X2APIC_APICV    2

/* Largest TSC multiplier value: all 64 bits set. */
#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
/* The "preemption_timer" parameter is only exposed on 64-bit builds. */
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

/*
 * CR0 bit groups.  The ALWAYS_ON set is the unrestricted-guest set (NE)
 * plus WP, PG and PE; the ALWAYS_OFF set is NW and CD.
 */
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON                            \
        (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |      \
         X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)

/*
 * CR4 bit groups: VMXE alone for unrestricted guests, PAE | VMXE for the
 * "PMODE" set and VME | PAE | VMXE for the "RMODE" set.
 */
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

/* Every EFLAGS bit except IOPL and VM. */
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

/*
 * Complement of the RTIT_STATUS bits listed below, i.e. a mask of every
 * other bit of the value.
 */
#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
        RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
        RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
        RTIT_STATUS_BYTECNT))
 147
 148 /*
 149  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 150  * ple_gap:    upper bound on the amount of time between two successive
 151  *             executions of PAUSE in a loop. Also indicate if ple enabled.
 152  *             According to test, this time is usually smaller than 128 cycles.
 153  * ple_window: upper bound on the amount of time a guest is allowed to execute
 154  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 155  *             less than 2^12 cycles
 156  * Time is measured based on a counter that runs at the same rate as the TSC,
 157  * refer SDM volume 3b section 21.6.13 & 22.1.3.
 158  */
 159 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
 160 module_param(ple_gap, uint, 0444);
 161
 162 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 163 module_param(ple_window, uint, 0444);
 164
 165 /* Default doubles per-vcpu window every exit. */
 166 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 167 module_param(ple_window_grow, uint, 0444);
 168
 169 /* Default resets per-vcpu window every exit to ple_window. */
 170 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 171 module_param(ple_window_shrink, uint, 0444);
 172
 173 /* Default is to compute the maximum so we can never overflow. */
 174 static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 175 module_param(ple_window_max, uint, 0444);
 176
 177 /* Default is SYSTEM mode, 1 for host-guest mode */
 178 int __read_mostly pt_mode = PT_MODE_SYSTEM;
 179 module_param(pt_mode, int, S_IRUGO);
 180
/*
 * Static keys toggled by vmx_setup_l1d_flush(): vmx_l1d_should_flush is
 * enabled for every mode other than VMENTER_L1D_FLUSH_NEVER, and
 * vmx_l1d_flush_cond is enabled only for VMENTER_L1D_FLUSH_COND.
 */
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
/* Held around vmx_setup_l1d_flush() when the parameter is set at runtime. */
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

/*
 * Names of the L1D flush states, indexed by enum vmx_l1d_flush_state.
 * @option is the string reported when the parameter is read; @for_parse
 * marks the entries that are also accepted when the parameter is written.
 */
static const struct {
        const char *option;
        bool for_parse;
} vmentry_l1d_param[] = {
        [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
        [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
        [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
        [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
        [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
        [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

/* Page allocation order of the flush buffer: 2^4 = 16 pages. */
#define L1D_CACHE_ORDER 4
/* Flush buffer, allocated on demand by vmx_setup_l1d_flush(). */
static void *vmx_l1d_flush_pages;
 202
 203 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 204 {
 205         struct page *page;
 206         unsigned int i;
 207
 208         if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
 209                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
 210                 return 0;
 211         }
 212
 213         if (!enable_ept) {
 214                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
 215                 return 0;
 216         }
 217
 218         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
 219                 u64 msr;
 220
 221                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
 222                 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
 223                         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
 224                         return 0;
 225                 }
 226         }
 227
 228         /* If set to auto use the default l1tf mitigation method */
 229         if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
 230                 switch (l1tf_mitigation) {
 231                 case L1TF_MITIGATION_OFF:
 232                         l1tf = VMENTER_L1D_FLUSH_NEVER;
 233                         break;
 234                 case L1TF_MITIGATION_FLUSH_NOWARN:
 235                 case L1TF_MITIGATION_FLUSH:
 236                 case L1TF_MITIGATION_FLUSH_NOSMT:
 237                         l1tf = VMENTER_L1D_FLUSH_COND;
 238                         break;
 239                 case L1TF_MITIGATION_FULL:
 240                 case L1TF_MITIGATION_FULL_FORCE:
 241                         l1tf = VMENTER_L1D_FLUSH_ALWAYS;
 242                         break;
 243                 }
 244         } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
 245                 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
 246         }
 247
 248         if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
 249             !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
 250                 /*
 251                  * This allocation for vmx_l1d_flush_pages is not tied to a VM
 252                  * lifetime and so should not be charged to a memcg.
 253                  */
 254                 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
 255                 if (!page)
 256                         return -ENOMEM;
 257                 vmx_l1d_flush_pages = page_address(page);
 258
 259                 /*
 260                  * Initialize each page with a different pattern in
 261                  * order to protect against KSM in the nested
 262                  * virtualization case.
 263                  */
 264                 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
 265                         memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
 266                                PAGE_SIZE);
 267                 }
 268         }
 269
 270         l1tf_vmx_mitigation = l1tf;
 271
 272         if (l1tf != VMENTER_L1D_FLUSH_NEVER)
 273                 static_branch_enable(&vmx_l1d_should_flush);
 274         else
 275                 static_branch_disable(&vmx_l1d_should_flush);
 276
 277         if (l1tf == VMENTER_L1D_FLUSH_COND)
 278                 static_branch_enable(&vmx_l1d_flush_cond);
 279         else
 280                 static_branch_disable(&vmx_l1d_flush_cond);
 281         return 0;
 282 }
 283
 284 static int vmentry_l1d_flush_parse(const char *s)
 285 {
 286         unsigned int i;
 287
 288         if (s) {
 289                 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
 290                         if (vmentry_l1d_param[i].for_parse &&
 291                             sysfs_streq(s, vmentry_l1d_param[i].option))
 292                                 return i;
 293                 }
 294         }
 295         return -EINVAL;
 296 }
 297
 298 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
 299 {
 300         int l1tf, ret;
 301
 302         l1tf = vmentry_l1d_flush_parse(s);
 303         if (l1tf < 0)
 304                 return l1tf;
 305
 306         if (!boot_cpu_has(X86_BUG_L1TF))
 307                 return 0;
 308
 309         /*
 310          * Has vmx_init() run already? If not then this is the pre init
 311          * parameter parsing. In that case just store the value and let
 312          * vmx_init() do the proper setup after enable_ept has been
 313          * established.
 314          */
 315         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
 316                 vmentry_l1d_flush_param = l1tf;
 317                 return 0;
 318         }
 319
 320         mutex_lock(&vmx_l1d_flush_mutex);
 321         ret = vmx_setup_l1d_flush(l1tf);
 322         mutex_unlock(&vmx_l1d_flush_mutex);
 323         return ret;
 324 }
 325
 326 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
 327 {
 328         if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
 329                 return sprintf(s, "???\n");
 330
 331         return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
 332 }
 333
/*
 * Callback-based "vmentry_l1d_flush" module parameter (mode 0644).  No
 * backing variable is passed (NULL); reads and writes go through the
 * get/set handlers.
 */
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
        .set = vmentry_l1d_flush_set,
        .get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 339
/* Forward declarations; the definitions are not in this part of the file. */
static u32 vmx_segment_access_rights(struct kvm_segment *var);
static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
                                                          u32 msr, int type);

/* NOTE(review): presumably implemented outside this file -- confirm. */
void vmx_vmexit(void);
 345
/*
 * Report a failed VMX instruction: trigger a one-time WARN and additionally
 * print a rate-limited warning, both with the same printf-style arguments.
 */
#define vmx_insn_failed(fmt...)         \
do {                                    \
        WARN_ONCE(1, fmt);              \
        pr_warn_ratelimited(fmt);       \
} while (0)
 351
 352 asmlinkage void vmread_error(unsigned long field, bool fault)
 353 {
 354         if (fault)
 355                 kvm_spurious_fault();
 356         else
 357                 vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
 358 }
 359
 360 noinline void vmwrite_error(unsigned long field, unsigned long value)
 361 {
 362         vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
 363                         field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
 364 }
 365
/*
 * Report a failed VMCLEAR, printing the VMCS pointer @vmcs and the address
 * @phys_addr passed by the caller.
 */
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
        vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
}
 370
/*
 * Report a failed VMPTRLD, printing the VMCS pointer @vmcs and the address
 * @phys_addr passed by the caller.
 */
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
        vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
}
 375
/*
 * Report a failed INVVPID, printing the extent type @ext, the @vpid and the
 * guest virtual address @gva passed by the caller.
 */
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
        vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
                        ext, vpid, gva);
}
 381
/*
 * Report a failed INVEPT, printing the extent type @ext, the @eptp and the
 * guest physical address @gpa passed by the caller.
 */
noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
        vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
                        ext, eptp, gpa);
}
 387
/* Per-CPU VMCS pointer; NOTE(review): presumably the VMXON region -- confirm. */
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
/*
 * Per-CPU pointer to the current VMCS; __loaded_vmcs_clear() resets it to
 * NULL when the VMCS it points to is cleared.
 */
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

/* Bitmap with VMX_NR_VPIDS bits and the spinlock declared alongside it. */
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

/* Global VMCS configuration and VMX capability state. */
struct vmcs_config vmcs_config;
struct vmx_capability vmx_capability;
 401
/*
 * Build the table entry for segment @seg: the entry is placed at index
 * VCPU_SREG_<seg> and holds the GUEST_<seg>_{SELECTOR,BASE,LIMIT,AR_BYTES}
 * VMCS field encodings.
 */
#define VMX_SEGMENT_FIELD(seg)                                  \
        [VCPU_SREG_##seg] = {                                   \
                .selector = GUEST_##seg##_SELECTOR,             \
                .base = GUEST_##seg##_BASE,                     \
                .limit = GUEST_##seg##_LIMIT,                   \
                .ar_bytes = GUEST_##seg##_AR_BYTES,             \
        }

/*
 * VMCS field encodings of the selector, base, limit and access-rights
 * fields of each of the eight guest segment registers, indexed by
 * VCPU_SREG_*.
 */
static const struct kvm_vmx_segment_field {
        unsigned selector;
        unsigned base;
        unsigned limit;
        unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
        VMX_SEGMENT_FIELD(CS),
        VMX_SEGMENT_FIELD(DS),
        VMX_SEGMENT_FIELD(ES),
        VMX_SEGMENT_FIELD(FS),
        VMX_SEGMENT_FIELD(GS),
        VMX_SEGMENT_FIELD(SS),
        VMX_SEGMENT_FIELD(TR),
        VMX_SEGMENT_FIELD(LDTR),
};
 425
/*
 * Invalidate the whole segment cache of @vmx by clearing its valid-field
 * bitmask, so that the next cached read of any field goes to the VMCS.
 */
static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
{
        vmx->segment_cache.bitmask = 0;
}
 430
/* NOTE(review): presumably the base address of the host IDT -- confirm. */
static unsigned long host_idt_base;

/*
 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
 * will emulate SYSCALL in legacy mode if the vendor string in guest
 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
 * support this emulation, IA32_STAR must always be included in
 * vmx_msr_index[], even in i386 builds.
 *
 * __find_msr_index() uses this table to translate the index stored in a
 * guest_uret_msrs[] entry into an MSR number.
 */
const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
        MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
        MSR_EFER, MSR_TSC_AUX, MSR_STAR,
        MSR_IA32_TSX_CTRL,
};
 447
#if IS_ENABLED(CONFIG_HYPERV)
/* "enlightened_vmcs" module parameter (read-only in sysfs), on by default. */
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
 451
/* check_ept_pointer_match() must be called with ept_pointer_lock held. */
 453 static void check_ept_pointer_match(struct kvm *kvm)
 454 {
 455         struct kvm_vcpu *vcpu;
 456         u64 tmp_eptp = INVALID_PAGE;
 457         int i;
 458
 459         kvm_for_each_vcpu(i, vcpu, kvm) {
 460                 if (!VALID_PAGE(tmp_eptp)) {
 461                         tmp_eptp = to_vmx(vcpu)->ept_pointer;
 462                 } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
 463                         to_kvm_vmx(kvm)->ept_pointers_match
 464                                 = EPT_POINTERS_MISMATCH;
 465                         return;
 466                 }
 467         }
 468
 469         to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
 470 }
 471
/*
 * Callback handed to hyperv_flush_guest_mapping_range(): fills @flush from
 * the struct kvm_tlb_range passed through the opaque @data pointer, using
 * its start_gfn and pages members.  Returns whatever
 * hyperv_fill_flush_guest_mapping_list() returns.
 */
static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
                void *data)
{
        struct kvm_tlb_range *range = data;

        return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
                        range->pages);
}
 480
 481 static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
 482                 struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
 483 {
 484         u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
 485
 486         /*
 487          * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
 488          * of the base of EPT PML4 table, strip off EPT configuration
 489          * information.
 490          */
 491         if (range)
 492                 return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
 493                                 kvm_fill_hv_flush_list_func, (void *)range);
 494         else
 495                 return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
 496 }
 497
 498 static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
 499                 struct kvm_tlb_range *range)
 500 {
 501         struct kvm_vcpu *vcpu;
 502         int ret = 0, i;
 503
 504         spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
 505
 506         if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
 507                 check_ept_pointer_match(kvm);
 508
 509         if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
 510                 kvm_for_each_vcpu(i, vcpu, kvm) {
 511                         /* If ept_pointer is invalid pointer, bypass flush request. */
 512                         if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
 513                                 ret |= __hv_remote_flush_tlb_with_range(
 514                                         kvm, vcpu, range);
 515                 }
 516         } else {
 517                 ret = __hv_remote_flush_tlb_with_range(kvm,
 518                                 kvm_get_vcpu(kvm, 0), range);
 519         }
 520
 521         spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
 522         return ret;
 523 }
/*
 * Flush all guest mappings of @kvm through Hyper-V: the range-based flush
 * called without a range.  Returns that helper's result.
 */
static int hv_remote_flush_tlb(struct kvm *kvm)
{
        return hv_remote_flush_tlb_with_range(kvm, NULL);
}
 528
 529 static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
 530 {
 531         struct hv_enlightened_vmcs *evmcs;
 532         struct hv_partition_assist_pg **p_hv_pa_pg =
 533                         &vcpu->kvm->arch.hyperv.hv_pa_pg;
 534         /*
 535          * Synthetic VM-Exit is not enabled in current code and so All
 536          * evmcs in singe VM shares same assist page.
 537          */
 538         if (!*p_hv_pa_pg)
 539                 *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
 540
 541         if (!*p_hv_pa_pg)
 542                 return -ENOMEM;
 543
 544         evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
 545
 546         evmcs->partition_assist_page =
 547                 __pa(*p_hv_pa_pg);
 548         evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
 549         evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
 550
 551         return 0;
 552 }
 553
 554 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 555
/*
 * CPU signatures (CPUID.01H:EAX with the reserved bits cleared) that
 * cpu_has_broken_vmx_preemption_timer() treats as having a broken VMX
 * preemption timer.
 *
 * Format of the per-entry comments:
 *   document - erratum name - stepping - processor name.
 * The list is taken from
 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 */
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
/*
 * 320767.pdf - AAP86  - B1 -
 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 */
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
/* 321333.pdf - AAM126 - D0 - Xeon 3500 */
/* 321324.pdf - AAK139 - D0 - Xeon 5500 */
/* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
/* Xeon E3-1220 V2 */
0x000306A8,
};
 590
 591 static inline bool cpu_has_broken_vmx_preemption_timer(void)
 592 {
 593         u32 eax = cpuid_eax(0x00000001), i;
 594
 595         /* Clear the reserved bits */
 596         eax &= ~(0x3U << 14 | 0xfU << 28);
 597         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
 598                 if (eax == vmx_preemption_cpu_tfms[i])
 599                         return true;
 600
 601         return false;
 602 }
 603
/*
 * Returns true when flexpriority is enabled and @vcpu has an in-kernel
 * local APIC.
 */
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
        return flexpriority_enabled && lapic_in_kernel(vcpu);
}
 608
/* Returns the current value of the flexpriority_enabled switch. */
static inline bool report_flexpriority(void)
{
        return flexpriority_enabled;
}
 613
 614 static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 615 {
 616         int i;
 617
 618         for (i = 0; i < vmx->nr_uret_msrs; ++i)
 619                 if (vmx_msr_index[vmx->guest_uret_msrs[i].index] == msr)
 620                         return i;
 621         return -1;
 622 }
 623
 624 struct vmx_uret_msr *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 625 {
 626         int i;
 627
 628         i = __find_msr_index(vmx, msr);
 629         if (i >= 0)
 630                 return &vmx->guest_uret_msrs[i];
 631         return NULL;
 632 }
 633
 634 static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct vmx_uret_msr *msr, u64 data)
 635 {
 636         int ret = 0;
 637
 638         u64 old_msr_data = msr->data;
 639         msr->data = data;
 640         if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
 641                 preempt_disable();
 642                 ret = kvm_set_user_return_msr(msr->index, msr->data, msr->mask);
 643                 preempt_enable();
 644                 if (ret)
 645                         msr->data = old_msr_data;
 646         }
 647         return ret;
 648 }
 649
#ifdef CONFIG_KEXEC_CORE
/*
 * VMCLEAR every VMCS on the loaded_vmcss_on_cpu list of the CPU this runs
 * on.  Only built with CONFIG_KEXEC_CORE.
 */
static void crash_vmclear_local_loaded_vmcss(void)
{
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v;

        list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
                            loaded_vmcss_on_cpu_link)
                vmcs_clear(v->vmcs);
}
#endif /* CONFIG_KEXEC_CORE */
 661
/*
 * VMCLEAR a loaded VMCS on the CPU it is loaded on and detach it from that
 * CPU.  @arg is the struct loaded_vmcs; the void * signature matches the
 * callback type used by smp_call_function_single() in loaded_vmcs_clear().
 */
static void __loaded_vmcs_clear(void *arg)
{
        struct loaded_vmcs *loaded_vmcs = arg;
        int cpu = raw_smp_processor_id();

        if (loaded_vmcs->cpu != cpu)
                return; /* vcpu migration can race with cpu offline */
        /* Drop this CPU's current-VMCS pointer if it refers to this VMCS. */
        if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;

        vmcs_clear(loaded_vmcs->vmcs);
        /* The shadow VMCS is only cleared if this VMCS has been launched. */
        if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
                vmcs_clear(loaded_vmcs->shadow_vmcs);

        list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

        /*
         * Ensure all writes to loaded_vmcs, including deleting it from its
         * current percpu list, complete before setting loaded_vmcs->cpu to
         * -1, otherwise a different cpu can see cpu == -1 first and add
         * loaded_vmcs to its percpu list before it's deleted from this cpu's
         * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
         */
        smp_wmb();

        loaded_vmcs->cpu = -1;
        loaded_vmcs->launched = 0;
}
 690
 691 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 692 {
 693         int cpu = loaded_vmcs->cpu;
 694
 695         if (cpu != -1)
 696                 smp_call_function_single(cpu,
 697                          __loaded_vmcs_clear, loaded_vmcs, 1);
 698 }
 699
 700 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
 701                                        unsigned field)
 702 {
 703         bool ret;
 704         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
 705
 706         if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
 707                 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
 708                 vmx->segment_cache.bitmask = 0;
 709         }
 710         ret = vmx->segment_cache.bitmask & mask;
 711         vmx->segment_cache.bitmask |= mask;
 712         return ret;
 713 }
 714
 715 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
 716 {
 717         u16 *p = &vmx->segment_cache.seg[seg].selector;
 718
 719         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
 720                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
 721         return *p;
 722 }
 723
 724 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
 725 {
 726         ulong *p = &vmx->segment_cache.seg[seg].base;
 727
 728         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
 729                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
 730         return *p;
 731 }
 732
 733 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
 734 {
 735         u32 *p = &vmx->segment_cache.seg[seg].limit;
 736
 737         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
 738                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
 739         return *p;
 740 }
 741
 742 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
 743 {
 744         u32 *p = &vmx->segment_cache.seg[seg].ar;
 745
 746         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
 747                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
 748         return *p;
 749 }
 750
 751 void update_exception_bitmap(struct kvm_vcpu *vcpu)
 752 {
 753         u32 eb;
 754
 755         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 756              (1u << DB_VECTOR) | (1u << AC_VECTOR);
 757         /*
 758          * Guest access to VMware backdoor ports could legitimately
 759          * trigger #GP because of TSS I/O permission bitmap.
 760          * We intercept those #GP and allow access to them anyway
 761          * as VMware does.
 762          */
 763         if (enable_vmware_backdoor)
 764                 eb |= (1u << GP_VECTOR);
 765         if ((vcpu->guest_debug &
 766              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
 767             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
 768                 eb |= 1u << BP_VECTOR;
 769         if (to_vmx(vcpu)->rmode.vm86_active)
 770                 eb = ~0;
 771         if (!vmx_need_pf_intercept(vcpu))
 772                 eb &= ~(1u << PF_VECTOR);
 773
 774         /* When we are running a nested L2 guest and L1 specified for it a
 775          * certain exception bitmap, we must trap the same exceptions and pass
 776          * them to L1. When running L2, we will only handle the exceptions
 777          * specified above if L1 did not want them.
 778          */
 779         if (is_guest_mode(vcpu))
 780                 eb |= get_vmcs12(vcpu)->exception_bitmap;
 781
 782         vmcs_write32(EXCEPTION_BITMAP, eb);
 783 }
 784
 785 /*
 786  * Check if MSR is intercepted for currently loaded MSR bitmap.
 787  */
 788 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 789 {
 790         unsigned long *msr_bitmap;
 791         int f = sizeof(unsigned long);
 792
 793         if (!cpu_has_vmx_msr_bitmap())
 794                 return true;
 795
 796         msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
 797
 798         if (msr <= 0x1fff) {
 799                 return !!test_bit(msr, msr_bitmap + 0x800 / f);
 800         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 801                 msr &= 0x1fff;
 802                 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
 803         }
 804
 805         return true;
 806 }
 807
/*
 * Stop switching an MSR through dedicated VM-entry/VM-exit controls by
 * clearing the @entry bit in the VM-entry controls and the @exit bit in
 * the VM-exit controls of @vmx.
 */
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
                unsigned long entry, unsigned long exit)
{
        vm_entry_controls_clearbit(vmx, entry);
        vm_exit_controls_clearbit(vmx, exit);
}
 814
 815 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
 816 {
 817         unsigned int i;
 818
 819         for (i = 0; i < m->nr; ++i) {
 820                 if (m->val[i].index == msr)
 821                         return i;
 822         }
 823         return -ENOENT;
 824 }
 825
 826 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 827 {
 828         int i;
 829         struct msr_autoload *m = &vmx->msr_autoload;
 830
 831         switch (msr) {
 832         case MSR_EFER:
 833                 if (cpu_has_load_ia32_efer()) {
 834                         clear_atomic_switch_msr_special(vmx,
 835                                         VM_ENTRY_LOAD_IA32_EFER,
 836                                         VM_EXIT_LOAD_IA32_EFER);
 837                         return;
 838                 }
 839                 break;
 840         case MSR_CORE_PERF_GLOBAL_CTRL:
 841                 if (cpu_has_load_perf_global_ctrl()) {
 842                         clear_atomic_switch_msr_special(vmx,
 843                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
 844                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
 845                         return;
 846                 }
 847                 break;
 848         }
 849         i = vmx_find_loadstore_msr_slot(&m->guest, msr);
 850         if (i < 0)
 851                 goto skip_guest;
 852         --m->guest.nr;
 853         m->guest.val[i] = m->guest.val[m->guest.nr];
 854         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
 855
 856 skip_guest:
 857         i = vmx_find_loadstore_msr_slot(&m->host, msr);
 858         if (i < 0)
 859                 return;
 860
 861         --m->host.nr;
 862         m->host.val[i] = m->host.val[m->host.nr];
 863         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
 864 }
 865
 866 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
 867                 unsigned long entry, unsigned long exit,
 868                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
 869                 u64 guest_val, u64 host_val)
 870 {
 871         vmcs_write64(guest_val_vmcs, guest_val);
 872         if (host_val_vmcs != HOST_IA32_EFER)
 873                 vmcs_write64(host_val_vmcs, host_val);
 874         vm_entry_controls_setbit(vmx, entry);
 875         vm_exit_controls_setbit(vmx, exit);
 876 }
 877
 878 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 879                                   u64 guest_val, u64 host_val, bool entry_only)
 880 {
 881         int i, j = 0;
 882         struct msr_autoload *m = &vmx->msr_autoload;
 883
 884         switch (msr) {
 885         case MSR_EFER:
 886                 if (cpu_has_load_ia32_efer()) {
 887                         add_atomic_switch_msr_special(vmx,
 888                                         VM_ENTRY_LOAD_IA32_EFER,
 889                                         VM_EXIT_LOAD_IA32_EFER,
 890                                         GUEST_IA32_EFER,
 891                                         HOST_IA32_EFER,
 892                                         guest_val, host_val);
 893                         return;
 894                 }
 895                 break;
 896         case MSR_CORE_PERF_GLOBAL_CTRL:
 897                 if (cpu_has_load_perf_global_ctrl()) {
 898                         add_atomic_switch_msr_special(vmx,
 899                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
 900                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
 901                                         GUEST_IA32_PERF_GLOBAL_CTRL,
 902                                         HOST_IA32_PERF_GLOBAL_CTRL,
 903                                         guest_val, host_val);
 904                         return;
 905                 }
 906                 break;
 907         case MSR_IA32_PEBS_ENABLE:
 908                 /* PEBS needs a quiescent period after being disabled (to write
 909                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
 910                  * provide that period, so a CPU could write host's record into
 911                  * guest's memory.
 912                  */
 913                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
 914         }
 915
 916         i = vmx_find_loadstore_msr_slot(&m->guest, msr);
 917         if (!entry_only)
 918                 j = vmx_find_loadstore_msr_slot(&m->host, msr);
 919
 920         if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
 921             (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
 922                 printk_once(KERN_WARNING "Not enough msr switch entries. "
 923                                 "Can't add msr %x\n", msr);
 924                 return;
 925         }
 926         if (i < 0) {
 927                 i = m->guest.nr++;
 928                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
 929         }
 930         m->guest.val[i].index = msr;
 931         m->guest.val[i].value = guest_val;
 932
 933         if (entry_only)
 934                 return;
 935
 936         if (j < 0) {
 937                 j = m->host.nr++;
 938                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
 939         }
 940         m->host.val[j].index = msr;
 941         m->host.val[j].value = host_val;
 942 }
 943
 944 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 945 {
 946         u64 guest_efer = vmx->vcpu.arch.efer;
 947         u64 ignore_bits = 0;
 948
 949         /* Shadow paging assumes NX to be available.  */
 950         if (!enable_ept)
 951                 guest_efer |= EFER_NX;
 952
 953         /*
 954          * LMA and LME handled by hardware; SCE meaningless outside long mode.
 955          */
 956         ignore_bits |= EFER_SCE;
 957 #ifdef CONFIG_X86_64
 958         ignore_bits |= EFER_LMA | EFER_LME;
 959         /* SCE is meaningful only in long mode on Intel */
 960         if (guest_efer & EFER_LMA)
 961                 ignore_bits &= ~(u64)EFER_SCE;
 962 #endif
 963
 964         /*
 965          * On EPT, we can't emulate NX, so we must switch EFER atomically.
 966          * On CPUs that support "load IA32_EFER", always switch EFER
 967          * atomically, since it's faster than switching it manually.
 968          */
 969         if (cpu_has_load_ia32_efer() ||
 970             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
 971                 if (!(guest_efer & EFER_LMA))
 972                         guest_efer &= ~EFER_LME;
 973                 if (guest_efer != host_efer)
 974                         add_atomic_switch_msr(vmx, MSR_EFER,
 975                                               guest_efer, host_efer, false);
 976                 else
 977                         clear_atomic_switch_msr(vmx, MSR_EFER);
 978                 return false;
 979         } else {
 980                 clear_atomic_switch_msr(vmx, MSR_EFER);
 981
 982                 guest_efer &= ~ignore_bits;
 983                 guest_efer |= host_efer & ignore_bits;
 984
 985                 vmx->guest_uret_msrs[efer_offset].data = guest_efer;
 986                 vmx->guest_uret_msrs[efer_offset].mask = ~ignore_bits;
 987
 988                 return true;
 989         }
 990 }
 991
 992 #ifdef CONFIG_X86_32
 993 /*
 994  * On 32-bit kernels, VM exits still load the FS and GS bases from the
 995  * VMCS rather than the segment table.  KVM uses this helper to figure
 996  * out the current bases to poke them into the VMCS before entry.
 997  */
 998 static unsigned long segment_base(u16 selector)
 999 {
1000         struct desc_struct *table;
1001         unsigned long v;
1002
1003         if (!(selector & ~SEGMENT_RPL_MASK))
1004                 return 0;
1005
1006         table = get_current_gdt_ro();
1007
1008         if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1009                 u16 ldt_selector = kvm_read_ldt();
1010
1011                 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1012                         return 0;
1013
1014                 table = (struct desc_struct *)segment_base(ldt_selector);
1015         }
1016         v = get_desc_base(&table[selector >> 3]);
1017         return v;
1018 }
1019 #endif
1020
1021 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1022 {
1023         return vmx_pt_mode_is_host_guest() &&
1024                !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1025 }
1026
1027 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
1028 {
1029         /* The base must be 128-byte aligned and a legal physical address. */
1030         return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f);
1031 }
1032
1033 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1034 {
1035         u32 i;
1036
1037         wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1038         wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1039         wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1040         wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1041         for (i = 0; i < addr_range; i++) {
1042                 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1043                 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1044         }
1045 }
1046
1047 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1048 {
1049         u32 i;
1050
1051         rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1052         rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1053         rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1054         rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1055         for (i = 0; i < addr_range; i++) {
1056                 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1057                 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1058         }
1059 }
1060
1061 static void pt_guest_enter(struct vcpu_vmx *vmx)
1062 {
1063         if (vmx_pt_mode_is_system())
1064                 return;
1065
1066         /*
1067          * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1068          * Save host state before VM entry.
1069          */
1070         rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1071         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1072                 wrmsrl(MSR_IA32_RTIT_CTL, 0);
1073                 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1074                 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1075         }
1076 }
1077
1078 static void pt_guest_exit(struct vcpu_vmx *vmx)
1079 {
1080         if (vmx_pt_mode_is_system())
1081                 return;
1082
1083         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1084                 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1085                 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1086         }
1087
1088         /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
1089         wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1090 }
1091
1092 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1093                         unsigned long fs_base, unsigned long gs_base)
1094 {
1095         if (unlikely(fs_sel != host->fs_sel)) {
1096                 if (!(fs_sel & 7))
1097                         vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1098                 else
1099                         vmcs_write16(HOST_FS_SELECTOR, 0);
1100                 host->fs_sel = fs_sel;
1101         }
1102         if (unlikely(gs_sel != host->gs_sel)) {
1103                 if (!(gs_sel & 7))
1104                         vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1105                 else
1106                         vmcs_write16(HOST_GS_SELECTOR, 0);
1107                 host->gs_sel = gs_sel;
1108         }
1109         if (unlikely(fs_base != host->fs_base)) {
1110                 vmcs_writel(HOST_FS_BASE, fs_base);
1111                 host->fs_base = fs_base;
1112         }
1113         if (unlikely(gs_base != host->gs_base)) {
1114                 vmcs_writel(HOST_GS_BASE, gs_base);
1115                 host->gs_base = gs_base;
1116         }
1117 }
1118
1119 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1120 {
1121         struct vcpu_vmx *vmx = to_vmx(vcpu);
1122         struct vmcs_host_state *host_state;
1123 #ifdef CONFIG_X86_64
1124         int cpu = raw_smp_processor_id();
1125 #endif
1126         unsigned long fs_base, gs_base;
1127         u16 fs_sel, gs_sel;
1128         int i;
1129
1130         vmx->req_immediate_exit = false;
1131
1132         /*
1133          * Note that guest MSRs to be saved/restored can also be changed
1134          * when guest state is loaded. This happens when guest transitions
1135          * to/from long-mode by setting MSR_EFER.LMA.
1136          */
1137         if (!vmx->guest_uret_msrs_loaded) {
1138                 vmx->guest_uret_msrs_loaded = true;
1139                 for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
1140                         kvm_set_user_return_msr(vmx->guest_uret_msrs[i].index,
1141                                                 vmx->guest_uret_msrs[i].data,
1142                                                 vmx->guest_uret_msrs[i].mask);
1143
1144         }
1145
1146         if (vmx->nested.need_vmcs12_to_shadow_sync)
1147                 nested_sync_vmcs12_to_shadow(vcpu);
1148
1149         if (vmx->guest_state_loaded)
1150                 return;
1151
1152         host_state = &vmx->loaded_vmcs->host_state;
1153
1154         /*
1155          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1156          * allow segment selectors with cpl > 0 or ti == 1.
1157          */
1158         host_state->ldt_sel = kvm_read_ldt();
1159
1160 #ifdef CONFIG_X86_64
1161         savesegment(ds, host_state->ds_sel);
1162         savesegment(es, host_state->es_sel);
1163
1164         gs_base = cpu_kernelmode_gs_base(cpu);
1165         if (likely(is_64bit_mm(current->mm))) {
1166                 current_save_fsgs();
1167                 fs_sel = current->thread.fsindex;
1168                 gs_sel = current->thread.gsindex;
1169                 fs_base = current->thread.fsbase;
1170                 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
1171         } else {
1172                 savesegment(fs, fs_sel);
1173                 savesegment(gs, gs_sel);
1174                 fs_base = read_msr(MSR_FS_BASE);
1175                 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1176         }
1177
1178         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1179 #else
1180         savesegment(fs, fs_sel);
1181         savesegment(gs, gs_sel);
1182         fs_base = segment_base(fs_sel);
1183         gs_base = segment_base(gs_sel);
1184 #endif
1185
1186         vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
1187         vmx->guest_state_loaded = true;
1188 }
1189
1190 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1191 {
1192         struct vmcs_host_state *host_state;
1193
1194         if (!vmx->guest_state_loaded)
1195                 return;
1196
1197         host_state = &vmx->loaded_vmcs->host_state;
1198
1199         ++vmx->vcpu.stat.host_state_reload;
1200
1201 #ifdef CONFIG_X86_64
1202         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1203 #endif
1204         if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1205                 kvm_load_ldt(host_state->ldt_sel);
1206 #ifdef CONFIG_X86_64
1207                 load_gs_index(host_state->gs_sel);
1208 #else
1209                 loadsegment(gs, host_state->gs_sel);
1210 #endif
1211         }
1212         if (host_state->fs_sel & 7)
1213                 loadsegment(fs, host_state->fs_sel);
1214 #ifdef CONFIG_X86_64
1215         if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1216                 loadsegment(ds, host_state->ds_sel);
1217                 loadsegment(es, host_state->es_sel);
1218         }
1219 #endif
1220         invalidate_tss_limit();
1221 #ifdef CONFIG_X86_64
1222         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1223 #endif
1224         load_fixmap_gdt(raw_smp_processor_id());
1225         vmx->guest_state_loaded = false;
1226         vmx->guest_uret_msrs_loaded = false;
1227 }
1228
#ifdef CONFIG_X86_64
/*
 * Return the guest's MSR_KERNEL_GS_BASE value.
 *
 * While guest state is loaded the cached copy is refreshed from the MSR
 * first; the check and the read are done with preemption disabled.
 */
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
        preempt_disable();
        if (vmx->guest_state_loaded) {
                rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
        }
        preempt_enable();

        return vmx->msr_guest_kernel_gs_base;
}

/*
 * Set the guest's MSR_KERNEL_GS_BASE value to @data.
 *
 * While guest state is loaded the MSR itself is written (with preemption
 * disabled); the cached copy is updated in every case.
 */
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
        preempt_disable();
        if (vmx->guest_state_loaded) {
                wrmsrl(MSR_KERNEL_GS_BASE, data);
        }
        preempt_enable();

        vmx->msr_guest_kernel_gs_base = data;
}
#endif
1248
1249 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
1250                         struct loaded_vmcs *buddy)
1251 {
1252         struct vcpu_vmx *vmx = to_vmx(vcpu);
1253         bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1254         struct vmcs *prev;
1255
1256         if (!already_loaded) {
1257                 loaded_vmcs_clear(vmx->loaded_vmcs);
1258                 local_irq_disable();
1259
1260                 /*
1261                  * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
1262                  * this cpu's percpu list, otherwise it may not yet be deleted
1263                  * from its previous cpu's percpu list.  Pairs with the
1264                  * smb_wmb() in __loaded_vmcs_clear().
1265                  */
1266                 smp_rmb();
1267
1268                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1269                          &per_cpu(loaded_vmcss_on_cpu, cpu));
1270                 local_irq_enable();
1271         }
1272
1273         prev = per_cpu(current_vmcs, cpu);
1274         if (prev != vmx->loaded_vmcs->vmcs) {
1275                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1276                 vmcs_load(vmx->loaded_vmcs->vmcs);
1277
1278                 /*
1279                  * No indirect branch prediction barrier needed when switching
1280                  * the active VMCS within a guest, e.g. on nested VM-Enter.
1281                  * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
1282                  */
1283                 if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
1284                         indirect_branch_prediction_barrier();
1285         }
1286
1287         if (!already_loaded) {
1288                 void *gdt = get_current_gdt_ro();
1289                 unsigned long sysenter_esp;
1290
1291                 /*
1292                  * Flush all EPTP/VPID contexts, the new pCPU may have stale
1293                  * TLB entries from its previous association with the vCPU.
1294                  */
1295                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1296
1297                 /*
1298                  * Linux uses per-cpu TSS and GDT, so set these when switching
1299                  * processors.  See 22.2.4.
1300                  */
1301                 vmcs_writel(HOST_TR_BASE,
1302                             (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1303                 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
1304
1305                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1306                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1307
1308                 vmx->loaded_vmcs->cpu = cpu;
1309         }
1310
1311         /* Setup TSC multiplier */
1312         if (kvm_has_tsc_control &&
1313             vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
1314                 decache_tsc_multiplier(vmx);
1315 }
1316
1317 /*
1318  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1319  * vcpu mutex is already taken.
1320  */
1321 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1322 {
1323         struct vcpu_vmx *vmx = to_vmx(vcpu);
1324
1325         vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
1326
1327         vmx_vcpu_pi_load(vcpu, cpu);
1328
1329         vmx->host_debugctlmsr = get_debugctlmsr();
1330 }
1331
/*
 * Counterpart of vmx_vcpu_load(): runs the posted-interrupt put hook and
 * then restores host state via vmx_prepare_switch_to_host().
 */
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx;

        vmx_vcpu_pi_put(vcpu);

        vmx = to_vmx(vcpu);
        vmx_prepare_switch_to_host(vmx);
}
1338
1339 static bool emulation_required(struct kvm_vcpu *vcpu)
1340 {
1341         return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
1342 }
1343
1344 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1345 {
1346         struct vcpu_vmx *vmx = to_vmx(vcpu);
1347         unsigned long rflags, save_rflags;
1348
1349         if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
1350                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1351                 rflags = vmcs_readl(GUEST_RFLAGS);
1352                 if (vmx->rmode.vm86_active) {
1353                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1354                         save_rflags = vmx->rmode.save_rflags;
1355                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1356                 }
1357                 vmx->rflags = rflags;
1358         }
1359         return vmx->rflags;
1360 }
1361
1362 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1363 {
1364         struct vcpu_vmx *vmx = to_vmx(vcpu);
1365         unsigned long old_rflags;
1366
1367         if (is_unrestricted_guest(vcpu)) {
1368                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1369                 vmx->rflags = rflags;
1370                 vmcs_writel(GUEST_RFLAGS, rflags);
1371                 return;
1372         }
1373
1374         old_rflags = vmx_get_rflags(vcpu);
1375         vmx->rflags = rflags;
1376         if (vmx->rmode.vm86_active) {
1377                 vmx->rmode.save_rflags = rflags;
1378                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1379         }
1380         vmcs_writel(GUEST_RFLAGS, rflags);
1381
1382         if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1383                 vmx->emulation_required = emulation_required(vcpu);
1384 }
1385
1386 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1387 {
1388         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1389         int ret = 0;
1390
1391         if (interruptibility & GUEST_INTR_STATE_STI)
1392                 ret |= KVM_X86_SHADOW_INT_STI;
1393         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1394                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1395
1396         return ret;
1397 }
1398
1399 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1400 {
1401         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1402         u32 interruptibility = interruptibility_old;
1403
1404         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1405
1406         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1407                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1408         else if (mask & KVM_X86_SHADOW_INT_STI)
1409                 interruptibility |= GUEST_INTR_STATE_STI;
1410
1411         if ((interruptibility != interruptibility_old))
1412                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1413 }
1414
1415 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1416 {
1417         struct vcpu_vmx *vmx = to_vmx(vcpu);
1418         unsigned long value;
1419
1420         /*
1421          * Any MSR write that attempts to change bits marked reserved will
1422          * case a #GP fault.
1423          */
1424         if (data & vmx->pt_desc.ctl_bitmask)
1425                 return 1;
1426
1427         /*
1428          * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1429          * result in a #GP unless the same write also clears TraceEn.
1430          */
1431         if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1432                 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1433                 return 1;
1434
1435         /*
1436          * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
1437          * and FabricEn would cause #GP, if
1438          * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
1439          */
1440         if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1441                 !(data & RTIT_CTL_FABRIC_EN) &&
1442                 !intel_pt_validate_cap(vmx->pt_desc.caps,
1443                                         PT_CAP_single_range_output))
1444                 return 1;
1445
1446         /*
1447          * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
1448          * utilize encodings marked reserved will casue a #GP fault.
1449          */
1450         value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1451         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1452                         !test_bit((data & RTIT_CTL_MTC_RANGE) >>
1453                         RTIT_CTL_MTC_RANGE_OFFSET, &value))
1454                 return 1;
1455         value = intel_pt_validate_cap(vmx->pt_desc.caps,
1456                                                 PT_CAP_cycle_thresholds);
1457         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1458                         !test_bit((data & RTIT_CTL_CYC_THRESH) >>
1459                         RTIT_CTL_CYC_THRESH_OFFSET, &value))
1460                 return 1;
1461         value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1462         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1463                         !test_bit((data & RTIT_CTL_PSB_FREQ) >>
1464                         RTIT_CTL_PSB_FREQ_OFFSET, &value))
1465                 return 1;
1466
1467         /*
1468          * If ADDRx_CFG is reserved or the encodings is >2 will
1469          * cause a #GP fault.
1470          */
1471         value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1472         if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
1473                 return 1;
1474         value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1475         if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
1476                 return 1;
1477         value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1478         if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
1479                 return 1;
1480         value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1481         if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
1482                 return 1;
1483
1484         return 0;
1485 }
1486
1487 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
1488 {
1489         return true;
1490 }
1491
1492 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
1493 {
1494         unsigned long rip, orig_rip;
1495
1496         /*
1497          * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1498          * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1499          * set when EPT misconfig occurs.  In practice, real hardware updates
1500          * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1501          * (namely Hyper-V) don't set it due to it being undefined behavior,
1502          * i.e. we end up advancing IP with some random value.
1503          */
1504         if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
1505             to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
1506                 orig_rip = kvm_rip_read(vcpu);
1507                 rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1508 #ifdef CONFIG_X86_64
1509                 /*
1510                  * We need to mask out the high 32 bits of RIP if not in 64-bit
1511                  * mode, but just finding out that we are in 64-bit mode is
1512                  * quite expensive.  Only do it if there was a carry.
1513                  */
1514                 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1515                         rip = (u32)rip;
1516 #endif
1517                 kvm_rip_write(vcpu, rip);
1518         } else {
1519                 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1520                         return 0;
1521         }
1522
1523         /* skipping an emulated instruction also counts */
1524         vmx_set_interrupt_shadow(vcpu, 0);
1525
1526         return 1;
1527 }
1528
1529 /*
1530  * Recognizes a pending MTF VM-exit and records the nested state for later
1531  * delivery.
1532  */
1533 static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1534 {
1535         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1536         struct vcpu_vmx *vmx = to_vmx(vcpu);
1537
1538         if (!is_guest_mode(vcpu))
1539                 return;
1540
1541         /*
1542          * Per the SDM, MTF takes priority over debug-trap exceptions besides
1543          * T-bit traps. As instruction emulation is completed (i.e. at the
1544          * instruction boundary), any #DB exception pending delivery must be a
1545          * debug-trap. Record the pending MTF state to be delivered in
1546          * vmx_check_nested_events().
1547          */
1548         if (nested_cpu_has_mtf(vmcs12) &&
1549             (!vcpu->arch.exception.pending ||
1550              vcpu->arch.exception.nr == DB_VECTOR))
1551                 vmx->nested.mtf_pending = true;
1552         else
1553                 vmx->nested.mtf_pending = false;
1554 }
1555
/*
 * Update the pending-MTF bookkeeping, then skip the current instruction.
 * Returns whatever skip_emulated_instruction() returns.
 */
static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
        int ret;

        vmx_update_emulated_instruction(vcpu);
        ret = skip_emulated_instruction(vcpu);

        return ret;
}
1561
1562 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1563 {
1564         /*
1565          * Ensure that we clear the HLT state in the VMCS.  We don't need to
1566          * explicitly skip the instruction because if the HLT state is set,
1567          * then the instruction is already executing and RIP has already been
1568          * advanced.
1569          */
1570         if (kvm_hlt_in_guest(vcpu->kvm) &&
1571                         vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1572                 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1573 }
1574
1575 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
1576 {
1577         struct vcpu_vmx *vmx = to_vmx(vcpu);
1578         unsigned nr = vcpu->arch.exception.nr;
1579         bool has_error_code = vcpu->arch.exception.has_error_code;
1580         u32 error_code = vcpu->arch.exception.error_code;
1581         u32 intr_info = nr | INTR_INFO_VALID_MASK;
1582
1583         kvm_deliver_exception_payload(vcpu);
1584
1585         if (has_error_code) {
1586                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1587                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1588         }
1589
1590         if (vmx->rmode.vm86_active) {
1591                 int inc_eip = 0;
1592                 if (kvm_exception_is_soft(nr))
1593                         inc_eip = vcpu->arch.event_exit_inst_len;
1594                 kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
1595                 return;
1596         }
1597
1598         WARN_ON_ONCE(vmx->emulation_required);
1599
1600         if (kvm_exception_is_soft(nr)) {
1601                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1602                              vmx->vcpu.arch.event_exit_inst_len);
1603                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1604         } else
1605                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1606
1607         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1608
1609         vmx_clear_hlt(vcpu);
1610 }
1611
1612 /*
1613  * Swap MSR entry in host/guest MSR entry array.
1614  */
1615 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
1616 {
1617         struct vmx_uret_msr tmp;
1618
1619         tmp = vmx->guest_uret_msrs[to];
1620         vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
1621         vmx->guest_uret_msrs[from] = tmp;
1622 }
1623
1624 /*
1625  * Set up the vmcs to automatically save and restore system
1626  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
1627  * mode, as fiddling with msrs is very expensive.
1628  */
1629 static void setup_msrs(struct vcpu_vmx *vmx)
1630 {
1631         int nr_active_uret_msrs, index;
1632
1633         nr_active_uret_msrs = 0;
1634 #ifdef CONFIG_X86_64
1635         /*
1636          * The SYSCALL MSRs are only needed on long mode guests, and only
1637          * when EFER.SCE is set.
1638          */
1639         if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
1640                 index = __find_msr_index(vmx, MSR_STAR);
1641                 if (index >= 0)
1642                         move_msr_up(vmx, index, nr_active_uret_msrs++);
1643                 index = __find_msr_index(vmx, MSR_LSTAR);
1644                 if (index >= 0)
1645                         move_msr_up(vmx, index, nr_active_uret_msrs++);
1646                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
1647                 if (index >= 0)
1648                         move_msr_up(vmx, index, nr_active_uret_msrs++);
1649         }
1650 #endif
1651         index = __find_msr_index(vmx, MSR_EFER);
1652         if (index >= 0 && update_transition_efer(vmx, index))
1653                 move_msr_up(vmx, index, nr_active_uret_msrs++);
1654         index = __find_msr_index(vmx, MSR_TSC_AUX);
1655         if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
1656                 move_msr_up(vmx, index, nr_active_uret_msrs++);
1657         index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
1658         if (index >= 0)
1659                 move_msr_up(vmx, index, nr_active_uret_msrs++);
1660
1661         vmx->nr_active_uret_msrs = nr_active_uret_msrs;
1662         vmx->guest_uret_msrs_loaded = false;
1663
1664         if (cpu_has_vmx_msr_bitmap())
1665                 vmx_update_msr_bitmap(&vmx->vcpu);
1666 }
1667
1668 static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1669 {
1670         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1671         u64 g_tsc_offset = 0;
1672
1673         /*
1674          * We're here if L1 chose not to trap WRMSR to TSC. According
1675          * to the spec, this should set L1's TSC; The offset that L1
1676          * set for L2 remains unchanged, and still needs to be added
1677          * to the newly set TSC to get L2's TSC.
1678          */
1679         if (is_guest_mode(vcpu) &&
1680             (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
1681                 g_tsc_offset = vmcs12->tsc_offset;
1682
1683         trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1684                                    vcpu->arch.tsc_offset - g_tsc_offset,
1685                                    offset);
1686         vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
1687         return offset + g_tsc_offset;
1688 }
1689
1690 /*
1691  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1692  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1693  * all guests if the "nested" module option is off, and can also be disabled
1694  * for a single guest by disabling its VMX cpuid bit.
1695  */
1696 bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1697 {
1698         return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
1699 }
1700
1701 static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
1702                                                  uint64_t val)
1703 {
1704         uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
1705
1706         return !(val & ~valid_bits);
1707 }
1708
1709 static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
1710 {
1711         switch (msr->index) {
1712         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1713                 if (!nested)
1714                         return 1;
1715                 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1716         case MSR_IA32_PERF_CAPABILITIES:
1717                 msr->data = vmx_get_perf_capabilities();
1718                 return 0;
1719         default:
1720                 return KVM_MSR_RET_INVALID;
1721         }
1722 }
1723
1724 /*
1725  * Reads an msr value (of 'msr_index') into 'pdata'.
1726  * Returns 0 on success, non-0 otherwise.
1727  * Assumes vcpu_load() was already called.
1728  */
1729 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1730 {
1731         struct vcpu_vmx *vmx = to_vmx(vcpu);
1732         struct vmx_uret_msr *msr;
1733         u32 index;
1734
1735         switch (msr_info->index) {
1736 #ifdef CONFIG_X86_64
1737         case MSR_FS_BASE:
1738                 msr_info->data = vmcs_readl(GUEST_FS_BASE);
1739                 break;
1740         case MSR_GS_BASE:
1741                 msr_info->data = vmcs_readl(GUEST_GS_BASE);
1742                 break;
1743         case MSR_KERNEL_GS_BASE:
1744                 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
1745                 break;
1746 #endif
1747         case MSR_EFER:
1748                 return kvm_get_msr_common(vcpu, msr_info);
1749         case MSR_IA32_TSX_CTRL:
1750                 if (!msr_info->host_initiated &&
1751                     !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
1752                         return 1;
1753                 goto find_uret_msr;
1754         case MSR_IA32_UMWAIT_CONTROL:
1755                 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
1756                         return 1;
1757
1758                 msr_info->data = vmx->msr_ia32_umwait_control;
1759                 break;
1760         case MSR_IA32_SPEC_CTRL:
1761                 if (!msr_info->host_initiated &&
1762                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
1763                         return 1;
1764
1765                 msr_info->data = to_vmx(vcpu)->spec_ctrl;
1766                 break;
1767         case MSR_IA32_SYSENTER_CS:
1768                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
1769                 break;
1770         case MSR_IA32_SYSENTER_EIP:
1771                 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
1772                 break;
1773         case MSR_IA32_SYSENTER_ESP:
1774                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
1775                 break;
1776         case MSR_IA32_BNDCFGS:
1777                 if (!kvm_mpx_supported() ||
1778                     (!msr_info->host_initiated &&
1779                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
1780                         return 1;
1781                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
1782                 break;
1783         case MSR_IA32_MCG_EXT_CTL:
1784                 if (!msr_info->host_initiated &&
1785                     !(vmx->msr_ia32_feature_control &
1786                       FEAT_CTL_LMCE_ENABLED))
1787                         return 1;
1788                 msr_info->data = vcpu->arch.mcg_ext_ctl;
1789                 break;
1790         case MSR_IA32_FEAT_CTL:
1791                 msr_info->data = vmx->msr_ia32_feature_control;
1792                 break;
1793         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1794                 if (!nested_vmx_allowed(vcpu))
1795                         return 1;
1796                 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
1797                                     &msr_info->data))
1798                         return 1;
1799                 /*
1800                  * Enlightened VMCS v1 doesn't have certain fields, but buggy
1801                  * Hyper-V versions are still trying to use corresponding
1802                  * features when they are exposed. Filter out the essential
1803                  * minimum.
1804                  */
1805                 if (!msr_info->host_initiated &&
1806                     vmx->nested.enlightened_vmcs_enabled)
1807                         nested_evmcs_filter_control_msr(msr_info->index,
1808                                                         &msr_info->data);
1809                 break;
1810         case MSR_IA32_RTIT_CTL:
1811                 if (!vmx_pt_mode_is_host_guest())
1812                         return 1;
1813                 msr_info->data = vmx->pt_desc.guest.ctl;
1814                 break;
1815         case MSR_IA32_RTIT_STATUS:
1816                 if (!vmx_pt_mode_is_host_guest())
1817                         return 1;
1818                 msr_info->data = vmx->pt_desc.guest.status;
1819                 break;
1820         case MSR_IA32_RTIT_CR3_MATCH:
1821                 if (!vmx_pt_mode_is_host_guest() ||
1822                         !intel_pt_validate_cap(vmx->pt_desc.caps,
1823                                                 PT_CAP_cr3_filtering))
1824                         return 1;
1825                 msr_info->data = vmx->pt_desc.guest.cr3_match;
1826                 break;
1827         case MSR_IA32_RTIT_OUTPUT_BASE:
1828                 if (!vmx_pt_mode_is_host_guest() ||
1829                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
1830                                         PT_CAP_topa_output) &&
1831                          !intel_pt_validate_cap(vmx->pt_desc.caps,
1832                                         PT_CAP_single_range_output)))
1833                         return 1;
1834                 msr_info->data = vmx->pt_desc.guest.output_base;
1835                 break;
1836         case MSR_IA32_RTIT_OUTPUT_MASK:
1837                 if (!vmx_pt_mode_is_host_guest() ||
1838                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
1839                                         PT_CAP_topa_output) &&
1840                          !intel_pt_validate_cap(vmx->pt_desc.caps,
1841                                         PT_CAP_single_range_output)))
1842                         return 1;
1843                 msr_info->data = vmx->pt_desc.guest.output_mask;
1844                 break;
1845         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
1846                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
1847                 if (!vmx_pt_mode_is_host_guest() ||
1848                         (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
1849                                         PT_CAP_num_address_ranges)))
1850                         return 1;
1851                 if (index % 2)
1852                         msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
1853                 else
1854                         msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
1855                 break;
1856         case MSR_TSC_AUX:
1857                 if (!msr_info->host_initiated &&
1858                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1859                         return 1;
1860                 goto find_uret_msr;
1861         default:
1862         find_uret_msr:
1863                 msr = find_msr_entry(vmx, msr_info->index);
1864                 if (msr) {
1865                         msr_info->data = msr->data;
1866                         break;
1867                 }
1868                 return kvm_get_msr_common(vcpu, msr_info);
1869         }
1870
1871         return 0;
1872 }
1873
1874 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
1875                                                     u64 data)
1876 {
1877 #ifdef CONFIG_X86_64
1878         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
1879                 return (u32)data;
1880 #endif
1881         return (unsigned long)data;
1882 }
1883
1884 /*
1885  * Writes msr value into the appropriate "register".
1886  * Returns 0 on success, non-0 otherwise.
1887  * Assumes vcpu_load() was already called.
1888  */
1889 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1890 {
1891         struct vcpu_vmx *vmx = to_vmx(vcpu);
1892         struct vmx_uret_msr *msr;
1893         int ret = 0;
1894         u32 msr_index = msr_info->index;
1895         u64 data = msr_info->data;
1896         u32 index;
1897
1898         switch (msr_index) {
1899         case MSR_EFER:
1900                 ret = kvm_set_msr_common(vcpu, msr_info);
1901                 break;
1902 #ifdef CONFIG_X86_64
1903         case MSR_FS_BASE:
1904                 vmx_segment_cache_clear(vmx);
1905                 vmcs_writel(GUEST_FS_BASE, data);
1906                 break;
1907         case MSR_GS_BASE:
1908                 vmx_segment_cache_clear(vmx);
1909                 vmcs_writel(GUEST_GS_BASE, data);
1910                 break;
1911         case MSR_KERNEL_GS_BASE:
1912                 vmx_write_guest_kernel_gs_base(vmx, data);
1913                 break;
1914 #endif
1915         case MSR_IA32_SYSENTER_CS:
1916                 if (is_guest_mode(vcpu))
1917                         get_vmcs12(vcpu)->guest_sysenter_cs = data;
1918                 vmcs_write32(GUEST_SYSENTER_CS, data);
1919                 break;
1920         case MSR_IA32_SYSENTER_EIP:
1921                 if (is_guest_mode(vcpu)) {
1922                         data = nested_vmx_truncate_sysenter_addr(vcpu, data);
1923                         get_vmcs12(vcpu)->guest_sysenter_eip = data;
1924                 }
1925                 vmcs_writel(GUEST_SYSENTER_EIP, data);
1926                 break;
1927         case MSR_IA32_SYSENTER_ESP:
1928                 if (is_guest_mode(vcpu)) {
1929                         data = nested_vmx_truncate_sysenter_addr(vcpu, data);
1930                         get_vmcs12(vcpu)->guest_sysenter_esp = data;
1931                 }
1932                 vmcs_writel(GUEST_SYSENTER_ESP, data);
1933                 break;
1934         case MSR_IA32_DEBUGCTLMSR:
1935                 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
1936                                                 VM_EXIT_SAVE_DEBUG_CONTROLS)
1937                         get_vmcs12(vcpu)->guest_ia32_debugctl = data;
1938
1939                 ret = kvm_set_msr_common(vcpu, msr_info);
1940                 break;
1941
1942         case MSR_IA32_BNDCFGS:
1943                 if (!kvm_mpx_supported() ||
1944                     (!msr_info->host_initiated &&
1945                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
1946                         return 1;
1947                 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
1948                     (data & MSR_IA32_BNDCFGS_RSVD))
1949                         return 1;
1950                 vmcs_write64(GUEST_BNDCFGS, data);
1951                 break;
1952         case MSR_IA32_UMWAIT_CONTROL:
1953                 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
1954                         return 1;
1955
1956                 /* The reserved bit 1 and non-32 bit [63:32] should be zero */
1957                 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
1958                         return 1;
1959
1960                 vmx->msr_ia32_umwait_control = data;
1961                 break;
1962         case MSR_IA32_SPEC_CTRL:
1963                 if (!msr_info->host_initiated &&
1964                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
1965                         return 1;
1966
1967                 if (kvm_spec_ctrl_test_value(data))
1968                         return 1;
1969
1970                 vmx->spec_ctrl = data;
1971                 if (!data)
1972                         break;
1973
1974                 /*
1975                  * For non-nested:
1976                  * When it's written (to non-zero) for the first time, pass
1977                  * it through.
1978                  *
1979                  * For nested:
1980                  * The handling of the MSR bitmap for L2 guests is done in
1981                  * nested_vmx_prepare_msr_bitmap. We should not touch the
1982                  * vmcs02.msr_bitmap here since it gets completely overwritten
1983                  * in the merging. We update the vmcs01 here for L1 as well
1984                  * since it will end up touching the MSR anyway now.
1985                  */
1986                 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
1987                                               MSR_IA32_SPEC_CTRL,
1988                                               MSR_TYPE_RW);
1989                 break;
1990         case MSR_IA32_TSX_CTRL:
1991                 if (!msr_info->host_initiated &&
1992                     !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
1993                         return 1;
1994                 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
1995                         return 1;
1996                 goto find_uret_msr;
1997         case MSR_IA32_PRED_CMD:
1998                 if (!msr_info->host_initiated &&
1999                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2000                         return 1;
2001
2002                 if (data & ~PRED_CMD_IBPB)
2003                         return 1;
2004                 if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL))
2005                         return 1;
2006                 if (!data)
2007                         break;
2008
2009                 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2010
2011                 /*
2012                  * For non-nested:
2013                  * When it's written (to non-zero) for the first time, pass
2014                  * it through.
2015                  *
2016                  * For nested:
2017                  * The handling of the MSR bitmap for L2 guests is done in
2018                  * nested_vmx_prepare_msr_bitmap. We should not touch the
2019                  * vmcs02.msr_bitmap here since it gets completely overwritten
2020                  * in the merging.
2021                  */
2022                 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
2023                                               MSR_TYPE_W);
2024                 break;
2025         case MSR_IA32_CR_PAT:
2026                 if (!kvm_pat_valid(data))
2027                         return 1;
2028
2029                 if (is_guest_mode(vcpu) &&
2030                     get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2031                         get_vmcs12(vcpu)->guest_ia32_pat = data;
2032
2033                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2034                         vmcs_write64(GUEST_IA32_PAT, data);
2035                         vcpu->arch.pat = data;
2036                         break;
2037                 }
2038                 ret = kvm_set_msr_common(vcpu, msr_info);
2039                 break;
2040         case MSR_IA32_TSC_ADJUST:
2041                 ret = kvm_set_msr_common(vcpu, msr_info);
2042                 break;
2043         case MSR_IA32_MCG_EXT_CTL:
2044                 if ((!msr_info->host_initiated &&
2045                      !(to_vmx(vcpu)->msr_ia32_feature_control &
2046                        FEAT_CTL_LMCE_ENABLED)) ||
2047                     (data & ~MCG_EXT_CTL_LMCE_EN))
2048                         return 1;
2049                 vcpu->arch.mcg_ext_ctl = data;
2050                 break;
2051         case MSR_IA32_FEAT_CTL:
2052                 if (!vmx_feature_control_msr_valid(vcpu, data) ||
2053                     (to_vmx(vcpu)->msr_ia32_feature_control &
2054                      FEAT_CTL_LOCKED && !msr_info->host_initiated))
2055                         return 1;
2056                 vmx->msr_ia32_feature_control = data;
2057                 if (msr_info->host_initiated && data == 0)
2058                         vmx_leave_nested(vcpu);
2059                 break;
2060         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2061                 if (!msr_info->host_initiated)
2062                         return 1; /* they are read-only */
2063                 if (!nested_vmx_allowed(vcpu))
2064                         return 1;
2065                 return vmx_set_vmx_msr(vcpu, msr_index, data);
2066         case MSR_IA32_RTIT_CTL:
2067                 if (!vmx_pt_mode_is_host_guest() ||
2068                         vmx_rtit_ctl_check(vcpu, data) ||
2069                         vmx->nested.vmxon)
2070                         return 1;
2071                 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2072                 vmx->pt_desc.guest.ctl = data;
2073                 pt_update_intercept_for_msr(vmx);
2074                 break;
2075         case MSR_IA32_RTIT_STATUS:
2076                 if (!pt_can_write_msr(vmx))
2077                         return 1;
2078                 if (data & MSR_IA32_RTIT_STATUS_MASK)
2079                         return 1;
2080                 vmx->pt_desc.guest.status = data;
2081                 break;
2082         case MSR_IA32_RTIT_CR3_MATCH:
2083                 if (!pt_can_write_msr(vmx))
2084                         return 1;
2085                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2086                                            PT_CAP_cr3_filtering))
2087                         return 1;
2088                 vmx->pt_desc.guest.cr3_match = data;
2089                 break;
2090         case MSR_IA32_RTIT_OUTPUT_BASE:
2091                 if (!pt_can_write_msr(vmx))
2092                         return 1;
2093                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2094                                            PT_CAP_topa_output) &&
2095                     !intel_pt_validate_cap(vmx->pt_desc.caps,
2096                                            PT_CAP_single_range_output))
2097                         return 1;
2098                 if (!pt_output_base_valid(vcpu, data))
2099                         return 1;
2100                 vmx->pt_desc.guest.output_base = data;
2101                 break;
2102         case MSR_IA32_RTIT_OUTPUT_MASK:
2103                 if (!pt_can_write_msr(vmx))
2104                         return 1;
2105                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2106                                            PT_CAP_topa_output) &&
2107                     !intel_pt_validate_cap(vmx->pt_desc.caps,
2108                                            PT_CAP_single_range_output))
2109                         return 1;
2110                 vmx->pt_desc.guest.output_mask = data;
2111                 break;
2112         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2113                 if (!pt_can_write_msr(vmx))
2114                         return 1;
2115                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2116                 if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
2117                                                        PT_CAP_num_address_ranges))
2118                         return 1;
2119                 if (is_noncanonical_address(data, vcpu))
2120                         return 1;
2121                 if (index % 2)
2122                         vmx->pt_desc.guest.addr_b[index / 2] = data;
2123                 else
2124                         vmx->pt_desc.guest.addr_a[index / 2] = data;
2125                 break;
2126         case MSR_TSC_AUX:
2127                 if (!msr_info->host_initiated &&
2128                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
2129                         return 1;
2130                 /* Check reserved bit, higher 32 bits should be zero */
2131                 if ((data >> 32) != 0)
2132                         return 1;
2133                 goto find_uret_msr;
2134
2135         default:
2136         find_uret_msr:
2137                 msr = find_msr_entry(vmx, msr_index);
2138                 if (msr)
2139                         ret = vmx_set_guest_msr(vmx, msr, data);
2140                 else
2141                         ret = kvm_set_msr_common(vcpu, msr_info);
2142         }
2143
2144         return ret;
2145 }
2146
2147 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2148 {
2149         unsigned long guest_owned_bits;
2150
2151         kvm_register_mark_available(vcpu, reg);
2152
2153         switch (reg) {
2154         case VCPU_REGS_RSP:
2155                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2156                 break;
2157         case VCPU_REGS_RIP:
2158                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2159                 break;
2160         case VCPU_EXREG_PDPTR:
2161                 if (enable_ept)
2162                         ept_save_pdptrs(vcpu);
2163                 break;
2164         case VCPU_EXREG_CR0:
2165                 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2166
2167                 vcpu->arch.cr0 &= ~guest_owned_bits;
2168                 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2169                 break;
2170         case VCPU_EXREG_CR3:
2171                 if (is_unrestricted_guest(vcpu) ||
2172                     (enable_ept && is_paging(vcpu)))
2173                         vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2174                 break;
2175         case VCPU_EXREG_CR4:
2176                 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2177
2178                 vcpu->arch.cr4 &= ~guest_owned_bits;
2179                 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2180                 break;
2181         default:
2182                 WARN_ON_ONCE(1);
2183                 break;
2184         }
2185 }
2186
2187 static __init int cpu_has_kvm_support(void)
2188 {
2189         return cpu_has_vmx();
2190 }
2191
2192 static __init int vmx_disabled_by_bios(void)
2193 {
2194         return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2195                !boot_cpu_has(X86_FEATURE_VMX);
2196 }
2197
2198 static int kvm_cpu_vmxon(u64 vmxon_pointer)
2199 {
2200         u64 msr;
2201
2202         cr4_set_bits(X86_CR4_VMXE);
2203         intel_pt_handle_vmx(1);
2204
2205         asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
2206                           _ASM_EXTABLE(1b, %l[fault])
2207                           : : [vmxon_pointer] "m"(vmxon_pointer)
2208                           : : fault);
2209         return 0;
2210
2211 fault:
2212         WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2213                   rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
2214         intel_pt_handle_vmx(0);
2215         cr4_clear_bits(X86_CR4_VMXE);
2216
2217         return -EFAULT;
2218 }
2219
2220 static int hardware_enable(void)
2221 {
2222         int cpu = raw_smp_processor_id();
2223         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2224         int r;
2225
2226         if (cr4_read_shadow() & X86_CR4_VMXE)
2227                 return -EBUSY;
2228
2229         /*
2230          * This can happen if we hot-added a CPU but failed to allocate
2231          * VP assist page for it.
2232          */
2233         if (static_branch_unlikely(&enable_evmcs) &&
2234             !hv_get_vp_assist_page(cpu))
2235                 return -EFAULT;
2236
2237         r = kvm_cpu_vmxon(phys_addr);
2238         if (r)
2239                 return r;
2240
2241         if (enable_ept)
2242                 ept_sync_global();
2243
2244         return 0;
2245 }
2246
2247 static void vmclear_local_loaded_vmcss(void)
2248 {
2249         int cpu = raw_smp_processor_id();
2250         struct loaded_vmcs *v, *n;
2251
2252         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2253                                  loaded_vmcss_on_cpu_link)
2254                 __loaded_vmcs_clear(v);
2255 }
2256
2257
2258 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
2259  * tricks.
2260  */
2261 static void kvm_cpu_vmxoff(void)
2262 {
2263         asm volatile (__ex("vmxoff"));
2264
2265         intel_pt_handle_vmx(0);
2266         cr4_clear_bits(X86_CR4_VMXE);
2267 }
2268
/*
 * Disable VMX operation on the current CPU: clear every VMCS loaded on this
 * CPU first, then leave VMX operation.
 */
static void hardware_disable(void)
{
        vmclear_local_loaded_vmcss();
        kvm_cpu_vmxoff();
}
2274
2275 /*
2276  * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2277  * directly instead of going through cpu_has(), to ensure KVM is trapping
2278  * ENCLS whenever it's supported in hardware.  It does not matter whether
2279  * the host OS supports or has enabled SGX.
2280  */
2281 static bool cpu_has_sgx(void)
2282 {
2283         return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2284 }
2285
2286 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
2287                                       u32 msr, u32 *result)
2288 {
2289         u32 vmx_msr_low, vmx_msr_high;
2290         u32 ctl = ctl_min | ctl_opt;
2291
2292         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2293
2294         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2295         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
2296
2297         /* Ensure minimum (required) set of control bits are supported. */
2298         if (ctl_min & ~ctl)
2299                 return -EIO;
2300
2301         *result = ctl;
2302         return 0;
2303 }
2304
2305 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2306                                     struct vmx_capability *vmx_cap)
2307 {
2308         u32 vmx_msr_low, vmx_msr_high;
2309         u32 min, opt, min2, opt2;
2310         u32 _pin_based_exec_control = 0;
2311         u32 _cpu_based_exec_control = 0;
2312         u32 _cpu_based_2nd_exec_control = 0;
2313         u32 _vmexit_control = 0;
2314         u32 _vmentry_control = 0;
2315
2316         memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2317         min = CPU_BASED_HLT_EXITING |
2318 #ifdef CONFIG_X86_64
2319               CPU_BASED_CR8_LOAD_EXITING |
2320               CPU_BASED_CR8_STORE_EXITING |
2321 #endif
2322               CPU_BASED_CR3_LOAD_EXITING |
2323               CPU_BASED_CR3_STORE_EXITING |
2324               CPU_BASED_UNCOND_IO_EXITING |
2325               CPU_BASED_MOV_DR_EXITING |
2326               CPU_BASED_USE_TSC_OFFSETTING |
2327               CPU_BASED_MWAIT_EXITING |
2328               CPU_BASED_MONITOR_EXITING |
2329               CPU_BASED_INVLPG_EXITING |
2330               CPU_BASED_RDPMC_EXITING;
2331
2332         opt = CPU_BASED_TPR_SHADOW |
2333               CPU_BASED_USE_MSR_BITMAPS |
2334               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2335         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
2336                                 &_cpu_based_exec_control) < 0)
2337                 return -EIO;
2338 #ifdef CONFIG_X86_64
2339         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2340                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
2341                                            ~CPU_BASED_CR8_STORE_EXITING;
2342 #endif
2343         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2344                 min2 = 0;
2345                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2346                         SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2347                         SECONDARY_EXEC_WBINVD_EXITING |
2348                         SECONDARY_EXEC_ENABLE_VPID |
2349                         SECONDARY_EXEC_ENABLE_EPT |
2350                         SECONDARY_EXEC_UNRESTRICTED_GUEST |
2351                         SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2352                         SECONDARY_EXEC_DESC |
2353                         SECONDARY_EXEC_ENABLE_RDTSCP |
2354                         SECONDARY_EXEC_ENABLE_INVPCID |
2355                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
2356                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2357                         SECONDARY_EXEC_SHADOW_VMCS |
2358                         SECONDARY_EXEC_XSAVES |
2359                         SECONDARY_EXEC_RDSEED_EXITING |
2360                         SECONDARY_EXEC_RDRAND_EXITING |
2361                         SECONDARY_EXEC_ENABLE_PML |
2362                         SECONDARY_EXEC_TSC_SCALING |
2363                         SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2364                         SECONDARY_EXEC_PT_USE_GPA |
2365                         SECONDARY_EXEC_PT_CONCEAL_VMX |
2366                         SECONDARY_EXEC_ENABLE_VMFUNC;
2367                 if (cpu_has_sgx())
2368                         opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
2369                 if (adjust_vmx_controls(min2, opt2,
2370                                         MSR_IA32_VMX_PROCBASED_CTLS2,
2371                                         &_cpu_based_2nd_exec_control) < 0)
2372                         return -EIO;
2373         }
2374 #ifndef CONFIG_X86_64
2375         if (!(_cpu_based_2nd_exec_control &
2376                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2377                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2378 #endif
2379
2380         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2381                 _cpu_based_2nd_exec_control &= ~(
2382                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2383                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2384                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2385
2386         rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2387                 &vmx_cap->ept, &vmx_cap->vpid);
2388
2389         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
2390                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
2391                    enabled */
2392                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
2393                                              CPU_BASED_CR3_STORE_EXITING |
2394                                              CPU_BASED_INVLPG_EXITING);
2395         } else if (vmx_cap->ept) {
2396                 vmx_cap->ept = 0;
2397                 pr_warn_once("EPT CAP should not exist if not support "
2398                                 "1-setting enable EPT VM-execution control\n");
2399         }
2400         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2401                 vmx_cap->vpid) {
2402                 vmx_cap->vpid = 0;
2403                 pr_warn_once("VPID CAP should not exist if not support "
2404                                 "1-setting enable VPID VM-execution control\n");
2405         }
2406
2407         min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
2408 #ifdef CONFIG_X86_64
2409         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2410 #endif
2411         opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
2412               VM_EXIT_LOAD_IA32_PAT |
2413               VM_EXIT_LOAD_IA32_EFER |
2414               VM_EXIT_CLEAR_BNDCFGS |
2415               VM_EXIT_PT_CONCEAL_PIP |
2416               VM_EXIT_CLEAR_IA32_RTIT_CTL;
2417         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2418                                 &_vmexit_control) < 0)
2419                 return -EIO;
2420
2421         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2422         opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
2423                  PIN_BASED_VMX_PREEMPTION_TIMER;
2424         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2425                                 &_pin_based_exec_control) < 0)
2426                 return -EIO;
2427
2428         if (cpu_has_broken_vmx_preemption_timer())
2429                 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2430         if (!(_cpu_based_2nd_exec_control &
2431                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2432                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2433
2434         min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
2435         opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
2436               VM_ENTRY_LOAD_IA32_PAT |
2437               VM_ENTRY_LOAD_IA32_EFER |
2438               VM_ENTRY_LOAD_BNDCFGS |
2439               VM_ENTRY_PT_CONCEAL_PIP |
2440               VM_ENTRY_LOAD_IA32_RTIT_CTL;
2441         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
2442                                 &_vmentry_control) < 0)
2443                 return -EIO;
2444
2445         /*
2446          * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2447          * can't be used due to an errata where VM Exit may incorrectly clear
2448          * IA32_PERF_GLOBAL_CTRL[34:32].  Workaround the errata by using the
2449          * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2450          */
2451         if (boot_cpu_data.x86 == 0x6) {
2452                 switch (boot_cpu_data.x86_model) {
2453                 case 26: /* AAK155 */
2454                 case 30: /* AAP115 */
2455                 case 37: /* AAT100 */
2456                 case 44: /* BC86,AAY89,BD102 */
2457                 case 46: /* BA97 */
2458                         _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
2459                         _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
2460                         pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2461                                         "does not work properly. Using workaround\n");
2462                         break;
2463                 default:
2464                         break;
2465                 }
2466         }
2467
2468
2469         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2470
2471         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2472         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2473                 return -EIO;
2474
2475 #ifdef CONFIG_X86_64
2476         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2477         if (vmx_msr_high & (1u<<16))
2478                 return -EIO;
2479 #endif
2480
2481         /* Require Write-Back (WB) memory type for VMCS accesses. */
2482         if (((vmx_msr_high >> 18) & 15) != 6)
2483                 return -EIO;
2484
2485         vmcs_conf->size = vmx_msr_high & 0x1fff;
2486         vmcs_conf->order = get_order(vmcs_conf->size);
2487         vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
2488
2489         vmcs_conf->revision_id = vmx_msr_low;
2490
2491         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2492         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2493         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2494         vmcs_conf->vmexit_ctrl         = _vmexit_control;
2495         vmcs_conf->vmentry_ctrl        = _vmentry_control;
2496
2497         if (static_branch_unlikely(&enable_evmcs))
2498                 evmcs_sanitize_exec_ctrls(vmcs_conf);
2499
2500         return 0;
2501 }
2502
2503 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
2504 {
2505         int node = cpu_to_node(cpu);
2506         struct page *pages;
2507         struct vmcs *vmcs;
2508
2509         pages = __alloc_pages_node(node, flags, vmcs_config.order);
2510         if (!pages)
2511                 return NULL;
2512         vmcs = page_address(pages);
2513         memset(vmcs, 0, vmcs_config.size);
2514
2515         /* KVM supports Enlightened VMCS v1 only */
2516         if (static_branch_unlikely(&enable_evmcs))
2517                 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2518         else
2519                 vmcs->hdr.revision_id = vmcs_config.revision_id;
2520
2521         if (shadow)
2522                 vmcs->hdr.shadow_vmcs = 1;
2523         return vmcs;
2524 }
2525
2526 void free_vmcs(struct vmcs *vmcs)
2527 {
2528         free_pages((unsigned long)vmcs, vmcs_config.order);
2529 }
2530
2531 /*
2532  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2533  */
2534 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2535 {
2536         if (!loaded_vmcs->vmcs)
2537                 return;
2538         loaded_vmcs_clear(loaded_vmcs);
2539         free_vmcs(loaded_vmcs->vmcs);
2540         loaded_vmcs->vmcs = NULL;
2541         if (loaded_vmcs->msr_bitmap)
2542                 free_page((unsigned long)loaded_vmcs->msr_bitmap);
2543         WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2544 }
2545
2546 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2547 {
2548         loaded_vmcs->vmcs = alloc_vmcs(false);
2549         if (!loaded_vmcs->vmcs)
2550                 return -ENOMEM;
2551
2552         vmcs_clear(loaded_vmcs->vmcs);
2553
2554         loaded_vmcs->shadow_vmcs = NULL;
2555         loaded_vmcs->hv_timer_soft_disabled = false;
2556         loaded_vmcs->cpu = -1;
2557         loaded_vmcs->launched = 0;
2558
2559         if (cpu_has_vmx_msr_bitmap()) {
2560                 loaded_vmcs->msr_bitmap = (unsigned long *)
2561                                 __get_free_page(GFP_KERNEL_ACCOUNT);
2562                 if (!loaded_vmcs->msr_bitmap)
2563                         goto out_vmcs;
2564                 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2565
2566                 if (IS_ENABLED(CONFIG_HYPERV) &&
2567                     static_branch_unlikely(&enable_evmcs) &&
2568                     (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
2569                         struct hv_enlightened_vmcs *evmcs =
2570                                 (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
2571
2572                         evmcs->hv_enlightenments_control.msr_bitmap = 1;
2573                 }
2574         }
2575
2576         memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
2577         memset(&loaded_vmcs->controls_shadow, 0,
2578                 sizeof(struct vmcs_controls_shadow));
2579
2580         return 0;
2581
2582 out_vmcs:
2583         free_loaded_vmcs(loaded_vmcs);
2584         return -ENOMEM;
2585 }
2586
2587 static void free_kvm_area(void)
2588 {
2589         int cpu;
2590
2591         for_each_possible_cpu(cpu) {
2592                 free_vmcs(per_cpu(vmxarea, cpu));
2593                 per_cpu(vmxarea, cpu) = NULL;
2594         }
2595 }
2596
2597 static __init int alloc_kvm_area(void)
2598 {
2599         int cpu;
2600
2601         for_each_possible_cpu(cpu) {
2602                 struct vmcs *vmcs;
2603
2604                 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
2605                 if (!vmcs) {
2606                         free_kvm_area();
2607                         return -ENOMEM;
2608                 }
2609
2610                 /*
2611                  * When eVMCS is enabled, alloc_vmcs_cpu() sets
2612                  * vmcs->revision_id to KVM_EVMCS_VERSION instead of
2613                  * revision_id reported by MSR_IA32_VMX_BASIC.
2614                  *
2615                  * However, even though not explicitly documented by
2616                  * TLFS, VMXArea passed as VMXON argument should
2617                  * still be marked with revision_id reported by
2618                  * physical CPU.
2619                  */
2620                 if (static_branch_unlikely(&enable_evmcs))
2621                         vmcs->hdr.revision_id = vmcs_config.revision_id;
2622
2623                 per_cpu(vmxarea, cpu) = vmcs;
2624         }
2625         return 0;
2626 }
2627
2628 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
2629                 struct kvm_segment *save)
2630 {
2631         if (!emulate_invalid_guest_state) {
2632                 /*
2633                  * CS and SS RPL should be equal during guest entry according
2634                  * to VMX spec, but in reality it is not always so. Since vcpu
2635                  * is in the middle of the transition from real mode to
2636                  * protected mode it is safe to assume that RPL 0 is a good
2637                  * default value.
2638                  */
2639                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
2640                         save->selector &= ~SEGMENT_RPL_MASK;
2641                 save->dpl = save->selector & SEGMENT_RPL_MASK;
2642                 save->s = 1;
2643         }
2644         vmx_set_segment(vcpu, save, seg);
2645 }
2646
2647 static void enter_pmode(struct kvm_vcpu *vcpu)
2648 {
2649         unsigned long flags;
2650         struct vcpu_vmx *vmx = to_vmx(vcpu);
2651
2652         /*
2653          * Update real mode segment cache. It may be not up-to-date if sement
2654          * register was written while vcpu was in a guest mode.
2655          */
2656         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2657         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2658         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2659         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2660         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2661         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2662
2663         vmx->rmode.vm86_active = 0;
2664
2665         vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2666
2667         flags = vmcs_readl(GUEST_RFLAGS);
2668         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2669         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2670         vmcs_writel(GUEST_RFLAGS, flags);
2671
2672         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
2673                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
2674
2675         update_exception_bitmap(vcpu);
2676
2677         fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2678         fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2679         fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2680         fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2681         fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2682         fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2683 }
2684
2685 static void fix_rmode_seg(int seg, struct kvm_segment *save)
2686 {
2687         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2688         struct kvm_segment var = *save;
2689
2690         var.dpl = 0x3;
2691         if (seg == VCPU_SREG_CS)
2692                 var.type = 0x3;
2693
2694         if (!emulate_invalid_guest_state) {
2695                 var.selector = var.base >> 4;
2696                 var.base = var.base & 0xffff0;
2697                 var.limit = 0xffff;
2698                 var.g = 0;
2699                 var.db = 0;
2700                 var.present = 1;
2701                 var.s = 1;
2702                 var.l = 0;
2703                 var.unusable = 0;
2704                 var.type = 0x3;
2705                 var.avl = 0;
2706                 if (save->base & 0xf)
2707                         printk_once(KERN_WARNING "kvm: segment base is not "
2708                                         "paragraph aligned when entering "
2709                                         "protected mode (seg=%d)", seg);
2710         }
2711
2712         vmcs_write16(sf->selector, var.selector);
2713         vmcs_writel(sf->base, var.base);
2714         vmcs_write32(sf->limit, var.limit);
2715         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
2716 }
2717
2718 static void enter_rmode(struct kvm_vcpu *vcpu)
2719 {
2720         unsigned long flags;
2721         struct vcpu_vmx *vmx = to_vmx(vcpu);
2722         struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
2723
2724         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2725         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2726         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2727         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2728         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2729         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2730         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2731
2732         vmx->rmode.vm86_active = 1;
2733
2734         /*
2735          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2736          * vcpu. Warn the user that an update is overdue.
2737          */
2738         if (!kvm_vmx->tss_addr)
2739                 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
2740                              "called before entering vcpu\n");
2741
2742         vmx_segment_cache_clear(vmx);
2743
2744         vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
2745         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2746         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2747
2748         flags = vmcs_readl(GUEST_RFLAGS);
2749         vmx->rmode.save_rflags = flags;
2750
2751         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2752
2753         vmcs_writel(GUEST_RFLAGS, flags);
2754         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
2755         update_exception_bitmap(vcpu);
2756
2757         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2758         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2759         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2760         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2761         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2762         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2763
2764         kvm_mmu_reset_context(vcpu);
2765 }
2766
2767 void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
2768 {
2769         struct vcpu_vmx *vmx = to_vmx(vcpu);
2770         struct vmx_uret_msr *msr = find_msr_entry(vmx, MSR_EFER);
2771
2772         if (!msr)
2773                 return;
2774
2775         vcpu->arch.efer = efer;
2776         if (efer & EFER_LMA) {
2777                 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2778                 msr->data = efer;
2779         } else {
2780                 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2781
2782                 msr->data = efer & ~EFER_LME;
2783         }
2784         setup_msrs(vmx);
2785 }
2786
2787 #ifdef CONFIG_X86_64
2788
2789 static void enter_lmode(struct kvm_vcpu *vcpu)
2790 {
2791         u32 guest_tr_ar;
2792
2793         vmx_segment_cache_clear(to_vmx(vcpu));
2794
2795         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
2796         if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
2797                 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
2798                                      __func__);
2799                 vmcs_write32(GUEST_TR_AR_BYTES,
2800                              (guest_tr_ar & ~VMX_AR_TYPE_MASK)
2801                              | VMX_AR_TYPE_BUSY_64_TSS);
2802         }
2803         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
2804 }
2805
2806 static void exit_lmode(struct kvm_vcpu *vcpu)
2807 {
2808         vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2809         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
2810 }
2811
2812 #endif
2813
2814 static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
2815 {
2816         struct vcpu_vmx *vmx = to_vmx(vcpu);
2817
2818         /*
2819          * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
2820          * the CPU is not required to invalidate guest-physical mappings on
2821          * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
2822          * associated with the root EPT structure and not any particular VPID
2823          * (INVVPID also isn't required to invalidate guest-physical mappings).
2824          */
2825         if (enable_ept) {
2826                 ept_sync_global();
2827         } else if (enable_vpid) {
2828                 if (cpu_has_vmx_invvpid_global()) {
2829                         vpid_sync_vcpu_global();
2830                 } else {
2831                         vpid_sync_vcpu_single(vmx->vpid);
2832                         vpid_sync_vcpu_single(vmx->nested.vpid02);
2833                 }
2834         }
2835 }
2836
2837 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
2838 {
2839         struct kvm_mmu *mmu = vcpu->arch.mmu;
2840         u64 root_hpa = mmu->root_hpa;
2841
2842         /* No flush required if the current context is invalid. */
2843         if (!VALID_PAGE(root_hpa))
2844                 return;
2845
2846         if (enable_ept)
2847                 ept_sync_context(construct_eptp(vcpu, root_hpa,
2848                                                 mmu->shadow_root_level));
2849         else if (!is_guest_mode(vcpu))
2850                 vpid_sync_context(to_vmx(vcpu)->vpid);
2851         else
2852                 vpid_sync_context(nested_get_vpid02(vcpu));
2853 }
2854
2855 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
2856 {
2857         /*
2858          * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
2859          * vmx_flush_tlb_guest() for an explanation of why this is ok.
2860          */
2861         vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
2862 }
2863
2864 static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
2865 {
2866         /*
2867          * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
2868          * or a vpid couldn't be allocated for this vCPU.  VM-Enter and VM-Exit
2869          * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
2870          * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
2871          * i.e. no explicit INVVPID is necessary.
2872          */
2873         vpid_sync_context(to_vmx(vcpu)->vpid);
2874 }
2875
2876 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
2877 {
2878         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
2879
2880         if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
2881                 return;
2882
2883         if (is_pae_paging(vcpu)) {
2884                 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
2885                 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
2886                 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
2887                 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
2888         }
2889 }
2890
2891 void ept_save_pdptrs(struct kvm_vcpu *vcpu)
2892 {
2893         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
2894
2895         if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
2896                 return;
2897
2898         mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
2899         mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
2900         mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
2901         mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
2902
2903         kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
2904 }
2905
2906 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
2907                                         unsigned long cr0,
2908                                         struct kvm_vcpu *vcpu)
2909 {
2910         struct vcpu_vmx *vmx = to_vmx(vcpu);
2911
2912         if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
2913                 vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
2914         if (!(cr0 & X86_CR0_PG)) {
2915                 /* From paging/starting to nonpaging */
2916                 exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
2917                                           CPU_BASED_CR3_STORE_EXITING);
2918                 vcpu->arch.cr0 = cr0;
2919                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
2920         } else if (!is_paging(vcpu)) {
2921                 /* From nonpaging to paging */
2922                 exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
2923                                             CPU_BASED_CR3_STORE_EXITING);
2924                 vcpu->arch.cr0 = cr0;
2925                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
2926         }
2927
2928         if (!(cr0 & X86_CR0_WP))
2929                 *hw_cr0 &= ~X86_CR0_WP;
2930 }
2931
2932 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
2933 {
2934         struct vcpu_vmx *vmx = to_vmx(vcpu);
2935         unsigned long hw_cr0;
2936
2937         hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
2938         if (is_unrestricted_guest(vcpu))
2939                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
2940         else {
2941                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
2942
2943                 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
2944                         enter_pmode(vcpu);
2945
2946                 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
2947                         enter_rmode(vcpu);
2948         }
2949
2950 #ifdef CONFIG_X86_64
2951         if (vcpu->arch.efer & EFER_LME) {
2952                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
2953                         enter_lmode(vcpu);
2954                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
2955                         exit_lmode(vcpu);
2956         }
2957 #endif
2958
2959         if (enable_ept && !is_unrestricted_guest(vcpu))
2960                 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
2961
2962         vmcs_writel(CR0_READ_SHADOW, cr0);
2963         vmcs_writel(GUEST_CR0, hw_cr0);
2964         vcpu->arch.cr0 = cr0;
2965         kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
2966
2967         /* depends on vcpu->arch.cr0 to be set to a new value */
2968         vmx->emulation_required = emulation_required(vcpu);
2969 }
2970
/* Return 5 when 5-level EPT is supported by the CPU, 4 otherwise. */
static int vmx_get_max_tdp_level(void)
{
	return cpu_has_vmx_ept_5levels() ? 5 : 4;
}
2977
2978 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
2979                    int root_level)
2980 {
2981         u64 eptp = VMX_EPTP_MT_WB;
2982
2983         eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
2984
2985         if (enable_ept_ad_bits &&
2986             (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
2987                 eptp |= VMX_EPTP_AD_ENABLE_BIT;
2988         eptp |= (root_hpa & PAGE_MASK);
2989
2990         return eptp;
2991 }
2992
2993 static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
2994                              int pgd_level)
2995 {
2996         struct kvm *kvm = vcpu->kvm;
2997         bool update_guest_cr3 = true;
2998         unsigned long guest_cr3;
2999         u64 eptp;
3000
3001         if (enable_ept) {
3002                 eptp = construct_eptp(vcpu, pgd, pgd_level);
3003                 vmcs_write64(EPT_POINTER, eptp);
3004
3005                 if (kvm_x86_ops.tlb_remote_flush) {
3006                         spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
3007                         to_vmx(vcpu)->ept_pointer = eptp;
3008                         to_kvm_vmx(kvm)->ept_pointers_match
3009                                 = EPT_POINTERS_CHECK;
3010                         spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
3011                 }
3012
3013                 if (!enable_unrestricted_guest && !is_paging(vcpu))
3014                         guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3015                 else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3016                         guest_cr3 = vcpu->arch.cr3;
3017                 else /* vmcs01.GUEST_CR3 is already up-to-date. */
3018                         update_guest_cr3 = false;
3019                 vmx_ept_load_pdptrs(vcpu);
3020         } else {
3021                 guest_cr3 = pgd;
3022         }
3023
3024         if (update_guest_cr3)
3025                 vmcs_writel(GUEST_CR3, guest_cr3);
3026 }
3027
3028 int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3029 {
3030         struct vcpu_vmx *vmx = to_vmx(vcpu);
3031         /*
3032          * Pass through host's Machine Check Enable value to hw_cr4, which
3033          * is in force while we are in guest mode.  Do not let guests control
3034          * this bit, even if host CR4.MCE == 0.
3035          */
3036         unsigned long hw_cr4;
3037
3038         hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3039         if (is_unrestricted_guest(vcpu))
3040                 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3041         else if (vmx->rmode.vm86_active)
3042                 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3043         else
3044                 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
3045
3046         if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
3047                 if (cr4 & X86_CR4_UMIP) {
3048                         secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3049                         hw_cr4 &= ~X86_CR4_UMIP;
3050                 } else if (!is_guest_mode(vcpu) ||
3051                         !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3052                         secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3053                 }
3054         }
3055
3056         if (cr4 & X86_CR4_VMXE) {
3057                 /*
3058                  * To use VMXON (and later other VMX instructions), a guest
3059                  * must first be able to turn on cr4.VMXE (see handle_vmon()).
3060                  * So basically the check on whether to allow nested VMX
3061                  * is here.  We operate under the default treatment of SMM,
3062                  * so VMX cannot be enabled under SMM.
3063                  */
3064                 if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
3065                         return 1;
3066         }
3067
3068         if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3069                 return 1;
3070
3071         vcpu->arch.cr4 = cr4;
3072         kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
3073
3074         if (!is_unrestricted_guest(vcpu)) {
3075                 if (enable_ept) {
3076                         if (!is_paging(vcpu)) {
3077                                 hw_cr4 &= ~X86_CR4_PAE;
3078                                 hw_cr4 |= X86_CR4_PSE;
3079                         } else if (!(cr4 & X86_CR4_PAE)) {
3080                                 hw_cr4 &= ~X86_CR4_PAE;
3081                         }
3082                 }
3083
3084                 /*
3085                  * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3086                  * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
3087                  * to be manually disabled when guest switches to non-paging
3088                  * mode.
3089                  *
3090                  * If !enable_unrestricted_guest, the CPU is always running
3091                  * with CR0.PG=1 and CR4 needs to be modified.
3092                  * If enable_unrestricted_guest, the CPU automatically
3093                  * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3094                  */
3095                 if (!is_paging(vcpu))
3096                         hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3097         }
3098
3099         vmcs_writel(CR4_READ_SHADOW, cr4);
3100         vmcs_writel(GUEST_CR4, hw_cr4);
3101         return 0;
3102 }
3103
3104 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3105 {
3106         struct vcpu_vmx *vmx = to_vmx(vcpu);
3107         u32 ar;
3108
3109         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3110                 *var = vmx->rmode.segs[seg];
3111                 if (seg == VCPU_SREG_TR
3112                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3113                         return;
3114                 var->base = vmx_read_guest_seg_base(vmx, seg);
3115                 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3116                 return;
3117         }
3118         var->base = vmx_read_guest_seg_base(vmx, seg);
3119         var->limit = vmx_read_guest_seg_limit(vmx, seg);
3120         var->selector = vmx_read_guest_seg_selector(vmx, seg);
3121         ar = vmx_read_guest_seg_ar(vmx, seg);
3122         var->unusable = (ar >> 16) & 1;
3123         var->type = ar & 15;
3124         var->s = (ar >> 4) & 1;
3125         var->dpl = (ar >> 5) & 3;
3126         /*
3127          * Some userspaces do not preserve unusable property. Since usable
3128          * segment has to be present according to VMX spec we can use present
3129          * property to amend userspace bug by making unusable segment always
3130          * nonpresent. vmx_segment_access_rights() already marks nonpresent
3131          * segment as unusable.
3132          */
3133         var->present = !var->unusable;
3134         var->avl = (ar >> 12) & 1;
3135         var->l = (ar >> 13) & 1;
3136         var->db = (ar >> 14) & 1;
3137         var->g = (ar >> 15) & 1;
3138 }
3139
3140 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3141 {
3142         struct kvm_segment s;
3143
3144         if (to_vmx(vcpu)->rmode.vm86_active) {
3145                 vmx_get_segment(vcpu, &s, seg);
3146                 return s.base;
3147         }
3148         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3149 }
3150
3151 int vmx_get_cpl(struct kvm_vcpu *vcpu)
3152 {
3153         struct vcpu_vmx *vmx = to_vmx(vcpu);
3154
3155         if (unlikely(vmx->rmode.vm86_active))
3156                 return 0;
3157         else {
3158                 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3159                 return VMX_AR_DPL(ar);
3160         }
3161 }
3162
3163 static u32 vmx_segment_access_rights(struct kvm_segment *var)
3164 {
3165         u32 ar;
3166
3167         if (var->unusable || !var->present)
3168                 ar = 1 << 16;
3169         else {
3170                 ar = var->type & 15;
3171                 ar |= (var->s & 1) << 4;
3172                 ar |= (var->dpl & 3) << 5;
3173                 ar |= (var->present & 1) << 7;
3174                 ar |= (var->avl & 1) << 12;
3175                 ar |= (var->l & 1) << 13;
3176                 ar |= (var->db & 1) << 14;
3177                 ar |= (var->g & 1) << 15;
3178         }
3179
3180         return ar;
3181 }
3182
3183 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3184 {
3185         struct vcpu_vmx *vmx = to_vmx(vcpu);
3186         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3187
3188         vmx_segment_cache_clear(vmx);
3189
3190         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3191                 vmx->rmode.segs[seg] = *var;
3192                 if (seg == VCPU_SREG_TR)
3193                         vmcs_write16(sf->selector, var->selector);
3194                 else if (var->s)
3195                         fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3196                 goto out;
3197         }
3198
3199         vmcs_writel(sf->base, var->base);
3200         vmcs_write32(sf->limit, var->limit);
3201         vmcs_write16(sf->selector, var->selector);
3202
3203         /*
3204          *   Fix the "Accessed" bit in AR field of segment registers for older
3205          * qemu binaries.
3206          *   IA32 arch specifies that at the time of processor reset the
3207          * "Accessed" bit in the AR field of segment registers is 1. And qemu
3208          * is setting it to 0 in the userland code. This causes invalid guest
3209          * state vmexit when "unrestricted guest" mode is turned on.
3210          *    Fix for this setup issue in cpu_reset is being pushed in the qemu
3211          * tree. Newer qemu binaries with that qemu fix would not need this
3212          * kvm hack.
3213          */
3214         if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
3215                 var->type |= 0x1; /* Accessed */
3216
3217         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3218
3219 out:
3220         vmx->emulation_required = emulation_required(vcpu);
3221 }
3222
3223 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3224 {
3225         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3226
3227         *db = (ar >> 14) & 1;
3228         *l = (ar >> 13) & 1;
3229 }
3230
3231 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3232 {
3233         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3234         dt->address = vmcs_readl(GUEST_IDTR_BASE);
3235 }
3236
3237 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3238 {
3239         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3240         vmcs_writel(GUEST_IDTR_BASE, dt->address);
3241 }
3242
3243 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3244 {
3245         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3246         dt->address = vmcs_readl(GUEST_GDTR_BASE);
3247 }
3248
3249 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3250 {
3251         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3252         vmcs_writel(GUEST_GDTR_BASE, dt->address);
3253 }
3254
3255 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3256 {
3257         struct kvm_segment var;
3258         u32 ar;
3259
3260         vmx_get_segment(vcpu, &var, seg);
3261         var.dpl = 0x3;
3262         if (seg == VCPU_SREG_CS)
3263                 var.type = 0x3;
3264         ar = vmx_segment_access_rights(&var);
3265
3266         if (var.base != (var.selector << 4))
3267                 return false;
3268         if (var.limit != 0xffff)
3269                 return false;
3270         if (ar != 0xf3)
3271                 return false;
3272
3273         return true;
3274 }
3275
3276 static bool code_segment_valid(struct kvm_vcpu *vcpu)
3277 {
3278         struct kvm_segment cs;
3279         unsigned int cs_rpl;
3280
3281         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3282         cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3283
3284         if (cs.unusable)
3285                 return false;
3286         if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3287                 return false;
3288         if (!cs.s)
3289                 return false;
3290         if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3291                 if (cs.dpl > cs_rpl)
3292                         return false;
3293         } else {
3294                 if (cs.dpl != cs_rpl)
3295                         return false;
3296         }
3297         if (!cs.present)
3298                 return false;
3299
3300         /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3301         return true;
3302 }
3303
3304 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3305 {
3306         struct kvm_segment ss;
3307         unsigned int ss_rpl;
3308
3309         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3310         ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3311
3312         if (ss.unusable)
3313                 return true;
3314         if (ss.type != 3 && ss.type != 7)
3315                 return false;
3316         if (!ss.s)
3317                 return false;
3318         if (ss.dpl != ss_rpl) /* DPL != RPL */
3319                 return false;
3320         if (!ss.present)
3321                 return false;
3322
3323         return true;
3324 }
3325
3326 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3327 {
3328         struct kvm_segment var;
3329         unsigned int rpl;
3330
3331         vmx_get_segment(vcpu, &var, seg);
3332         rpl = var.selector & SEGMENT_RPL_MASK;
3333
3334         if (var.unusable)
3335                 return true;
3336         if (!var.s)
3337                 return false;
3338         if (!var.present)
3339                 return false;
3340         if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3341                 if (var.dpl < rpl) /* DPL < RPL */
3342                         return false;
3343         }
3344
3345         /* TODO: Add other members to kvm_segment_field to allow checking for other access
3346          * rights flags
3347          */
3348         return true;
3349 }
3350
3351 static bool tr_valid(struct kvm_vcpu *vcpu)
3352 {
3353         struct kvm_segment tr;
3354
3355         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3356
3357         if (tr.unusable)
3358                 return false;
3359         if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
3360                 return false;
3361         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3362                 return false;
3363         if (!tr.present)
3364                 return false;
3365
3366         return true;
3367 }
3368
3369 static bool ldtr_valid(struct kvm_vcpu *vcpu)
3370 {
3371         struct kvm_segment ldtr;
3372
3373         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3374
3375         if (ldtr.unusable)
3376                 return true;
3377         if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
3378                 return false;
3379         if (ldtr.type != 2)
3380                 return false;
3381         if (!ldtr.present)
3382                 return false;
3383
3384         return true;
3385 }
3386
3387 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3388 {
3389         struct kvm_segment cs, ss;
3390
3391         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3392         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3393
3394         return ((cs.selector & SEGMENT_RPL_MASK) ==
3395                  (ss.selector & SEGMENT_RPL_MASK));
3396 }
3397
3398 /*
3399  * Check if guest state is valid. Returns true if valid, false if
3400  * not.
3401  * We assume that registers are always usable
3402  */
3403 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
3404 {
3405         /* real mode guest state checks */
3406         if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3407                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3408                         return false;
3409                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3410                         return false;
3411                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3412                         return false;
3413                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3414                         return false;
3415                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3416                         return false;
3417                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3418                         return false;
3419         } else {
3420         /* protected mode guest state checks */
3421                 if (!cs_ss_rpl_check(vcpu))
3422                         return false;
3423                 if (!code_segment_valid(vcpu))
3424                         return false;
3425                 if (!stack_segment_valid(vcpu))
3426                         return false;
3427                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3428                         return false;
3429                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3430                         return false;
3431                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3432                         return false;
3433                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3434                         return false;
3435                 if (!tr_valid(vcpu))
3436                         return false;
3437                 if (!ldtr_valid(vcpu))
3438                         return false;
3439         }
3440         /* TODO:
3441          * - Add checks on RIP
3442          * - Add checks on RFLAGS
3443          */
3444
3445         return true;
3446 }
3447
3448 static int init_rmode_tss(struct kvm *kvm)
3449 {
3450         gfn_t fn;
3451         u16 data = 0;
3452         int idx, r;
3453
3454         idx = srcu_read_lock(&kvm->srcu);
3455         fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
3456         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3457         if (r < 0)
3458                 goto out;
3459         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3460         r = kvm_write_guest_page(kvm, fn++, &data,
3461                         TSS_IOPB_BASE_OFFSET, sizeof(u16));
3462         if (r < 0)
3463                 goto out;
3464         r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
3465         if (r < 0)
3466                 goto out;
3467         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3468         if (r < 0)
3469                 goto out;
3470         data = ~0;
3471         r = kvm_write_guest_page(kvm, fn, &data,
3472                                  RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
3473                                  sizeof(u8));
3474 out:
3475         srcu_read_unlock(&kvm->srcu, idx);
3476         return r;
3477 }
3478
3479 static int init_rmode_identity_map(struct kvm *kvm)
3480 {
3481         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
3482         int i, r = 0;
3483         kvm_pfn_t identity_map_pfn;
3484         u32 tmp;
3485
3486         /* Protect kvm_vmx->ept_identity_pagetable_done. */
3487         mutex_lock(&kvm->slots_lock);
3488
3489         if (likely(kvm_vmx->ept_identity_pagetable_done))
3490                 goto out;
3491
3492         if (!kvm_vmx->ept_identity_map_addr)
3493                 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3494         identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
3495
3496         r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3497                                     kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
3498         if (r < 0)
3499                 goto out;
3500
3501         r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
3502         if (r < 0)
3503                 goto out;
3504         /* Set up identity-mapping pagetable for EPT in real mode */
3505         for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
3506                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3507                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3508                 r = kvm_write_guest_page(kvm, identity_map_pfn,
3509                                 &tmp, i * sizeof(tmp), sizeof(tmp));
3510                 if (r < 0)
3511                         goto out;
3512         }
3513         kvm_vmx->ept_identity_pagetable_done = true;
3514
3515 out:
3516         mutex_unlock(&kvm->slots_lock);
3517         return r;
3518 }
3519
3520 static void seg_setup(int seg)
3521 {
3522         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3523         unsigned int ar;
3524
3525         vmcs_write16(sf->selector, 0);
3526         vmcs_writel(sf->base, 0);
3527         vmcs_write32(sf->limit, 0xffff);
3528         ar = 0x93;
3529         if (seg == VCPU_SREG_CS)
3530                 ar |= 0x08; /* code segment */
3531
3532         vmcs_write32(sf->ar_bytes, ar);
3533 }
3534
3535 static int alloc_apic_access_page(struct kvm *kvm)
3536 {
3537         struct page *page;
3538         int r = 0;
3539
3540         mutex_lock(&kvm->slots_lock);
3541         if (kvm->arch.apic_access_page_done)
3542                 goto out;
3543         r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
3544                                     APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
3545         if (r)
3546                 goto out;
3547
3548         page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
3549         if (is_error_page(page)) {
3550                 r = -EFAULT;
3551                 goto out;
3552         }
3553
3554         /*
3555          * Do not pin the page in memory, so that memory hot-unplug
3556          * is able to migrate it.
3557          */
3558         put_page(page);
3559         kvm->arch.apic_access_page_done = true;
3560 out:
3561         mutex_unlock(&kvm->slots_lock);
3562         return r;
3563 }
3564
3565 int allocate_vpid(void)
3566 {
3567         int vpid;
3568
3569         if (!enable_vpid)
3570                 return 0;
3571         spin_lock(&vmx_vpid_lock);
3572         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
3573         if (vpid < VMX_NR_VPIDS)
3574                 __set_bit(vpid, vmx_vpid_bitmap);
3575         else
3576                 vpid = 0;
3577         spin_unlock(&vmx_vpid_lock);
3578         return vpid;
3579 }
3580
3581 void free_vpid(int vpid)
3582 {
3583         if (!enable_vpid || vpid == 0)
3584                 return;
3585         spin_lock(&vmx_vpid_lock);
3586         __clear_bit(vpid, vmx_vpid_bitmap);
3587         spin_unlock(&vmx_vpid_lock);
3588 }
3589
3590 static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
3591                                                           u32 msr, int type)
3592 {
3593         int f = sizeof(unsigned long);
3594
3595         if (!cpu_has_vmx_msr_bitmap())
3596                 return;
3597
3598         if (static_branch_unlikely(&enable_evmcs))
3599                 evmcs_touch_msr_bitmap();
3600
3601         /*
3602          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
3603          * have the write-low and read-high bitmap offsets the wrong way round.
3604          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3605          */
3606         if (msr <= 0x1fff) {
3607                 if (type & MSR_TYPE_R)
3608                         /* read-low */
3609                         __clear_bit(msr, msr_bitmap + 0x000 / f);
3610
3611                 if (type & MSR_TYPE_W)
3612                         /* write-low */
3613                         __clear_bit(msr, msr_bitmap + 0x800 / f);
3614
3615         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3616                 msr &= 0x1fff;
3617                 if (type & MSR_TYPE_R)
3618                         /* read-high */
3619                         __clear_bit(msr, msr_bitmap + 0x400 / f);
3620
3621                 if (type & MSR_TYPE_W)
3622                         /* write-high */
3623                         __clear_bit(msr, msr_bitmap + 0xc00 / f);
3624
3625         }
3626 }
3627
3628 static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
3629                                                          u32 msr, int type)
3630 {
3631         int f = sizeof(unsigned long);
3632
3633         if (!cpu_has_vmx_msr_bitmap())
3634                 return;
3635
3636         if (static_branch_unlikely(&enable_evmcs))
3637                 evmcs_touch_msr_bitmap();
3638
3639         /*
3640          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
3641          * have the write-low and read-high bitmap offsets the wrong way round.
3642          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3643          */
3644         if (msr <= 0x1fff) {
3645                 if (type & MSR_TYPE_R)
3646                         /* read-low */
3647                         __set_bit(msr, msr_bitmap + 0x000 / f);
3648
3649                 if (type & MSR_TYPE_W)
3650                         /* write-low */
3651                         __set_bit(msr, msr_bitmap + 0x800 / f);
3652
3653         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3654                 msr &= 0x1fff;
3655                 if (type & MSR_TYPE_R)
3656                         /* read-high */
3657                         __set_bit(msr, msr_bitmap + 0x400 / f);
3658
3659                 if (type & MSR_TYPE_W)
3660                         /* write-high */
3661                         __set_bit(msr, msr_bitmap + 0xc00 / f);
3662
3663         }
3664 }
3665
3666 static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
3667                                                       u32 msr, int type, bool value)
3668 {
3669         if (value)
3670                 vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
3671         else
3672                 vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
3673 }
3674
3675 static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
3676 {
3677         u8 mode = 0;
3678
3679         if (cpu_has_secondary_exec_ctrls() &&
3680             (secondary_exec_controls_get(to_vmx(vcpu)) &
3681              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
3682                 mode |= MSR_BITMAP_MODE_X2APIC;
3683                 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
3684                         mode |= MSR_BITMAP_MODE_X2APIC_APICV;
3685         }
3686
3687         return mode;
3688 }
3689
3690 static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
3691                                          u8 mode)
3692 {
3693         int msr;
3694
3695         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
3696                 unsigned word = msr / BITS_PER_LONG;
3697                 msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
3698                 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
3699         }
3700
3701         if (mode & MSR_BITMAP_MODE_X2APIC) {
3702                 /*
3703                  * TPR reads and writes can be virtualized even if virtual interrupt
3704                  * delivery is not in use.
3705                  */
3706                 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
3707                 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
3708                         vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
3709                         vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
3710                         vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
3711                 }
3712         }
3713 }
3714
3715 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
3716 {
3717         struct vcpu_vmx *vmx = to_vmx(vcpu);
3718         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3719         u8 mode = vmx_msr_bitmap_mode(vcpu);
3720         u8 changed = mode ^ vmx->msr_bitmap_mode;
3721
3722         if (!changed)
3723                 return;
3724
3725         if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
3726                 vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
3727
3728         vmx->msr_bitmap_mode = mode;
3729 }
3730
3731 void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
3732 {
3733         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3734         bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
3735         u32 i;
3736
3737         vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
3738                                                         MSR_TYPE_RW, flag);
3739         vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
3740                                                         MSR_TYPE_RW, flag);
3741         vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
3742                                                         MSR_TYPE_RW, flag);
3743         vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
3744                                                         MSR_TYPE_RW, flag);
3745         for (i = 0; i < vmx->pt_desc.addr_range; i++) {
3746                 vmx_set_intercept_for_msr(msr_bitmap,
3747                         MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
3748                 vmx_set_intercept_for_msr(msr_bitmap,
3749                         MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
3750         }
3751 }
3752
3753 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
3754 {
3755         struct vcpu_vmx *vmx = to_vmx(vcpu);
3756         void *vapic_page;
3757         u32 vppr;
3758         int rvi;
3759
3760         if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
3761                 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
3762                 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
3763                 return false;
3764
3765         rvi = vmx_get_rvi();
3766
3767         vapic_page = vmx->nested.virtual_apic_map.hva;
3768         vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
3769
3770         return ((rvi & 0xf0) > (vppr & 0xf0));
3771 }
3772
3773 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
3774                                                      bool nested)
3775 {
3776 #ifdef CONFIG_SMP
3777         int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
3778
3779         if (vcpu->mode == IN_GUEST_MODE) {
3780                 /*
3781                  * The vector of interrupt to be delivered to vcpu had
3782                  * been set in PIR before this function.
3783                  *
3784                  * Following cases will be reached in this block, and
3785                  * we always send a notification event in all cases as
3786                  * explained below.
3787                  *
3788                  * Case 1: vcpu keeps in non-root mode. Sending a
3789                  * notification event posts the interrupt to vcpu.
3790                  *
3791                  * Case 2: vcpu exits to root mode and is still
3792                  * runnable. PIR will be synced to vIRR before the
3793                  * next vcpu entry. Sending a notification event in
3794                  * this case has no effect, as vcpu is not in root
3795                  * mode.
3796                  *
3797                  * Case 3: vcpu exits to root mode and is blocked.
3798                  * vcpu_block() has already synced PIR to vIRR and
3799                  * never blocks vcpu if vIRR is not cleared. Therefore,
3800                  * a blocked vcpu here does not wait for any requested
3801                  * interrupts in PIR, and sending a notification event
3802                  * which has no effect is safe here.
3803                  */
3804
3805                 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
3806                 return true;
3807         }
3808 #endif
3809         return false;
3810 }
3811
3812 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
3813                                                 int vector)
3814 {
3815         struct vcpu_vmx *vmx = to_vmx(vcpu);
3816
3817         if (is_guest_mode(vcpu) &&
3818             vector == vmx->nested.posted_intr_nv) {
3819                 /*
3820                  * If a posted intr is not recognized by hardware,
3821                  * we will accomplish it in the next vmentry.
3822                  */
3823                 vmx->nested.pi_pending = true;
3824                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3825                 /* the PIR and ON have been set by L1. */
3826                 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
3827                         kvm_vcpu_kick(vcpu);
3828                 return 0;
3829         }
3830         return -1;
3831 }
3832 /*
3833  * Send interrupt to vcpu via posted interrupt way.
3834  * 1. If target vcpu is running(non-root mode), send posted interrupt
3835  * notification to vcpu and hardware will sync PIR to vIRR atomically.
3836  * 2. If target vcpu isn't running(root mode), kick it to pick up the
3837  * interrupt from PIR in next vmentry.
3838  */
3839 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
3840 {
3841         struct vcpu_vmx *vmx = to_vmx(vcpu);
3842         int r;
3843
3844         r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
3845         if (!r)
3846                 return 0;
3847
3848         if (!vcpu->arch.apicv_active)
3849                 return -1;
3850
3851         if (pi_test_and_set_pir(vector, &vmx->pi_desc))
3852                 return 0;
3853
3854         /* If a previous notification has sent the IPI, nothing to do.  */
3855         if (pi_test_and_set_on(&vmx->pi_desc))
3856                 return 0;
3857
3858         if (vcpu != kvm_get_running_vcpu() &&
3859             !kvm_vcpu_trigger_posted_interrupt(vcpu, false))
3860                 kvm_vcpu_kick(vcpu);
3861
3862         return 0;
3863 }
3864
3865 /*
3866  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
3867  * will not change in the lifetime of the guest.
3868  * Note that host-state that does change is set elsewhere. E.g., host-state
3869  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
3870  */
3871 void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
3872 {
3873         u32 low32, high32;
3874         unsigned long tmpl;
3875         unsigned long cr0, cr3, cr4;
3876
3877         cr0 = read_cr0();
3878         WARN_ON(cr0 & X86_CR0_TS);
3879         vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
3880
3881         /*
3882          * Save the most likely value for this task's CR3 in the VMCS.
3883          * We can't use __get_current_cr3_fast() because we're not atomic.
3884          */
3885         cr3 = __read_cr3();
3886         vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
3887         vmx->loaded_vmcs->host_state.cr3 = cr3;
3888
3889         /* Save the most likely value for this task's CR4 in the VMCS. */
3890         cr4 = cr4_read_shadow();
3891         vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
3892         vmx->loaded_vmcs->host_state.cr4 = cr4;
3893
3894         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
3895 #ifdef CONFIG_X86_64
3896         /*
3897          * Load null selectors, so we can avoid reloading them in
3898          * vmx_prepare_switch_to_host(), in case userspace uses
3899          * the null selectors too (the expected case).
3900          */
3901         vmcs_write16(HOST_DS_SELECTOR, 0);
3902         vmcs_write16(HOST_ES_SELECTOR, 0);
3903 #else
3904         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
3905         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
3906 #endif
3907         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
3908         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
3909
3910         vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */
3911
3912         vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
3913
3914         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
3915         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
3916         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
3917         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
3918
3919         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
3920                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
3921                 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
3922         }
3923
3924         if (cpu_has_load_ia32_efer())
3925                 vmcs_write64(HOST_IA32_EFER, host_efer);
3926 }
3927
3928 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
3929 {
3930         vmx->vcpu.arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS;
3931         if (!enable_ept)
3932                 vmx->vcpu.arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;
3933         if (is_guest_mode(&vmx->vcpu))
3934                 vmx->vcpu.arch.cr4_guest_owned_bits &=
3935                         ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
3936         vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
3937 }
3938
3939 u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
3940 {
3941         u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
3942
3943         if (!kvm_vcpu_apicv_active(&vmx->vcpu))
3944                 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
3945
3946         if (!enable_vnmi)
3947                 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
3948
3949         if (!enable_preemption_timer)
3950                 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
3951
3952         return pin_based_exec_ctrl;
3953 }
3954
3955 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
3956 {
3957         struct vcpu_vmx *vmx = to_vmx(vcpu);
3958
3959         pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
3960         if (cpu_has_secondary_exec_ctrls()) {
3961                 if (kvm_vcpu_apicv_active(vcpu))
3962                         secondary_exec_controls_setbit(vmx,
3963                                       SECONDARY_EXEC_APIC_REGISTER_VIRT |
3964                                       SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3965                 else
3966                         secondary_exec_controls_clearbit(vmx,
3967                                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
3968                                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3969         }
3970
3971         if (cpu_has_vmx_msr_bitmap())
3972                 vmx_update_msr_bitmap(vcpu);
3973 }
3974
3975 u32 vmx_exec_control(struct vcpu_vmx *vmx)
3976 {
3977         u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
3978
3979         if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
3980                 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
3981
3982         if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
3983                 exec_control &= ~CPU_BASED_TPR_SHADOW;
3984 #ifdef CONFIG_X86_64
3985                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
3986                                 CPU_BASED_CR8_LOAD_EXITING;
3987 #endif
3988         }
3989         if (!enable_ept)
3990                 exec_control |= CPU_BASED_CR3_STORE_EXITING |
3991                                 CPU_BASED_CR3_LOAD_EXITING  |
3992                                 CPU_BASED_INVLPG_EXITING;
3993         if (kvm_mwait_in_guest(vmx->vcpu.kvm))
3994                 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
3995                                 CPU_BASED_MONITOR_EXITING);
3996         if (kvm_hlt_in_guest(vmx->vcpu.kvm))
3997                 exec_control &= ~CPU_BASED_HLT_EXITING;
3998         return exec_control;
3999 }
4000
4001 /*
4002  * Adjust a single secondary execution control bit to intercept/allow an
4003  * instruction in the guest.  This is usually done based on whether or not a
4004  * feature has been exposed to the guest in order to correctly emulate faults.
4005  */
4006 static inline void
4007 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
4008                                   u32 control, bool enabled, bool exiting)
4009 {
4010         /*
4011          * If the control is for an opt-in feature, clear the control if the
4012          * feature is not exposed to the guest, i.e. not enabled.  If the
4013          * control is opt-out, i.e. an exiting control, clear the control if
4014          * the feature _is_ exposed to the guest, i.e. exiting/interception is
4015          * disabled for the associated instruction.  Note, the caller is
4016          * responsible presetting exec_control to set all supported bits.
4017          */
4018         if (enabled == exiting)
4019                 *exec_control &= ~control;
4020
4021         /*
4022          * Update the nested MSR settings so that a nested VMM can/can't set
4023          * controls for features that are/aren't exposed to the guest.
4024          */
4025         if (nested) {
4026                 if (enabled)
4027                         vmx->nested.msrs.secondary_ctls_high |= control;
4028                 else
4029                         vmx->nested.msrs.secondary_ctls_high &= ~control;
4030         }
4031 }
4032
/*
 * Wrapper macro for the common case of adjusting a secondary execution control
 * based on a single guest CPUID bit, with a dedicated feature bit.  This also
 * verifies that the control is actually supported by KVM and hardware.
 *
 * @vmx:          struct vcpu_vmx pointer; its vcpu's guest CPUID is queried
 * @exec_control: pointer to the u32 control word being adjusted
 * @name:         suffix pasted into cpu_has_vmx_<name>(); nothing is done if
 *                that predicate returns false
 * @feat_name:    suffix pasted into X86_FEATURE_<feat_name>
 * @ctrl_name:    suffix pasted into SECONDARY_EXEC_<ctrl_name>
 * @exiting:      forwarded unchanged to vmx_adjust_secondary_exec_control()
 *
 * Note that @vmx and @exec_control are expanded more than once.
 */
#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
({									 \
	bool __enabled;							 \
									 \
	if (cpu_has_vmx_##name()) {					 \
		__enabled = guest_cpuid_has(&(vmx)->vcpu,		 \
					    X86_FEATURE_##feat_name);	 \
		vmx_adjust_secondary_exec_control(vmx, exec_control,	 \
			SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
	}								 \
})
4049
/*
 * More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls.
 *
 * vmx_adjust_sec_exec_feature() targets SECONDARY_EXEC_ENABLE_<uname> and
 * passes exiting=false; vmx_adjust_sec_exec_exiting() targets
 * SECONDARY_EXEC_<uname>_EXITING and passes exiting=true.  In both, @lname
 * selects cpu_has_vmx_<lname>() and @uname selects X86_FEATURE_<uname>.
 */
#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)

#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
4056
/*
 * Compute the secondary processor-based VM-execution controls for @vmx and
 * cache the result in vmx->secondary_exec_control.  The VMCS itself is not
 * written here.
 *
 * Starts from vmcs_config.cpu_based_2nd_exec_ctrl and clears controls that
 * are not wanted for this vCPU, based on module parameters, per-VM settings
 * and the guest's CPUID.
 *
 * Side effects beyond the cached control word:
 *  - clears the global enable_unrestricted_guest when EPT is disabled;
 *  - updates vcpu->arch.xsaves_enabled when the CPU supports XSAVES;
 *  - clears INVPCID from the guest CPUID when PCID is not exposed;
 *  - via vmx_adjust_secondary_exec_control(), updates
 *    vmx->nested.msrs.secondary_ctls_high when nested is enabled.
 */
static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
{
	struct kvm_vcpu *vcpu = &vmx->vcpu;

	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;

	if (vmx_pt_mode_is_system())
		exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
	if (!cpu_need_virtualize_apic_accesses(vcpu))
		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
	if (vmx->vpid == 0)
		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
	if (!enable_ept) {
		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
		/* Note: this modifies the module-wide setting, not just this vCPU. */
		enable_unrestricted_guest = 0;
	}
	if (!enable_unrestricted_guest)
		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
	if (kvm_pause_in_guest(vmx->vcpu.kvm))
		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
	if (!kvm_vcpu_apicv_active(vcpu))
		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
	/* x2APIC virtualization is never enabled from this function. */
	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;

	/*
	 * SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
	 * in vmx_set_cr4.
	 */
	exec_control &= ~SECONDARY_EXEC_DESC;

	/*
	 * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
	 * (handle_vmptrld).  We can NOT enable shadow_vmcs here because we
	 * don't yet have a current VMCS12.
	 */
	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

	if (!enable_pml)
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

	if (cpu_has_vmx_xsaves()) {
		/* Exposing XSAVES only when XSAVE is exposed */
		bool xsaves_enabled =
			boot_cpu_has(X86_FEATURE_XSAVE) &&
			guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
			guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);

		vcpu->arch.xsaves_enabled = xsaves_enabled;

		vmx_adjust_secondary_exec_control(vmx, &exec_control,
						  SECONDARY_EXEC_XSAVES,
						  xsaves_enabled, false);
	}

	vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);

	/*
	 * Expose INVPCID if and only if PCID is also exposed to the guest.
	 * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
	 * if CR4.PCIDE=0.  Enumerating CPUID.INVPCID=1 would lead to incorrect
	 * behavior from the guest perspective (it would expect #GP or #PF).
	 */
	if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
		guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
	vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);


	/* RDRAND/RDSEED are opt-out ("exiting") controls. */
	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);

	/* WAITPKG's control name does not follow the ENABLE_<feature> pattern. */
	vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
				    ENABLE_USR_WAIT_PAUSE, false);

	vmx->secondary_exec_control = exec_control;
}
4131
4132 static void ept_set_mmio_spte_mask(void)
4133 {
4134         /*
4135          * EPT Misconfigurations can be generated if the value of bits 2:0
4136          * of an EPT paging-structure entry is 110b (write/execute).
4137          */
4138         kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
4139 }
4140
/* Value written to the XSS_EXIT_BITMAP VMCS field by init_vmcs(). */
#define VMX_XSS_EXIT_BITMAP 0
4142
/*
 * Program the control fields and constant host state of the VMCS for @vmx.
 *
 * Note that the guest-state area of the VMCS is initialized separately, in
 * vmx_vcpu_reset().
 *
 * Fields for optional features are only written when the corresponding
 * capability or module parameter check passes.
 */
static void init_vmcs(struct vcpu_vmx *vmx)
{
	if (nested)
		nested_vmx_set_vmcs_shadowing_bitmap();

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));

	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

	/* Control */
	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));

	exec_controls_set(vmx, vmx_exec_control(vmx));

	if (cpu_has_secondary_exec_ctrls()) {
		vmx_compute_secondary_exec_control(vmx);
		secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
	}

	/* APICv state: empty EOI-exit bitmaps and the posted-interrupt setup. */
	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
		vmcs_write64(EOI_EXIT_BITMAP0, 0);
		vmcs_write64(EOI_EXIT_BITMAP1, 0);
		vmcs_write64(EOI_EXIT_BITMAP2, 0);
		vmcs_write64(EOI_EXIT_BITMAP3, 0);

		vmcs_write16(GUEST_INTR_STATUS, 0);

		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
	}

	/*
	 * Pause-loop exiting parameters.  The window is only cached here and
	 * marked dirty; it is not written to the VMCS in this function.
	 */
	if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
		vmcs_write32(PLE_GAP, ple_gap);
		vmx->ple_window = ple_window;
		vmx->ple_window_dirty = true;
	}

	/* Default #PF filter; overridden at the end of this function for EPT. */
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
	vmx_set_constant_host_state(vmx);
	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */

	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

	/* MSR auto-load/store lists start empty; only their addresses are set. */
	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);

	vm_exit_controls_set(vmx, vmx_vmexit_ctrl());

	/* 22.2.1, 20.8.1 */
	vm_entry_controls_set(vmx, vmx_vmentry_ctrl());

	/* Bits not owned by the guest are host-owned, hence the complement. */
	vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);

	set_cr4_guest_host_mask(vmx);

	if (vmx->vpid != 0)
		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);

	if (cpu_has_vmx_xsaves())
		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);

	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
	}

	/* All bits set in the ENCLS-exiting bitmap. */
	if (cpu_has_vmx_encls_vmexit())
		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);

	if (vmx_pt_mode_is_host_guest()) {
		memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
		/* Bit[6~0] are forced to 1, writes are ignored. */
		vmx->pt_desc.guest.output_mask = 0x7F;
		vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
	}

	/*
	 * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
	 * between guest and host.  In that case we only care about present
	 * faults.
	 */
	if (enable_ept) {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK);
	}
}
4248
/*
 * Reset the VMX-specific and guest-state portions of @vcpu.
 *
 * @vcpu:       vCPU being reset
 * @init_event: when false, the APIC base, the SYSENTER fields,
 *              IA32_DEBUGCTL and (with TPR shadow support) the virtual-APIC
 *              page address and TPR threshold are additionally reset.  When
 *              true, those are left untouched and vmx_clear_hlt() is called
 *              at the end.
 *
 * The guest is left with CS selector 0xf000, CS base 0xffff0000 and RIP
 * 0xfff0, CR0 = NW | CD | ET, CR4 = 0 and EFER = 0.
 */
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct msr_data apic_base_msr;
	u64 cr0;

	vmx->rmode.vm86_active = 0;
	vmx->spec_ctrl = 0;

	vmx->msr_ia32_umwait_control = 0;

	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
	vmx->hv_deadline_tsc = -1;
	kvm_set_cr8(vcpu, 0);

	if (!init_event) {
		/* Default APIC base, enabled; flag the reset BSP as such. */
		apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
				     MSR_IA32_APICBASE_ENABLE;
		if (kvm_vcpu_is_reset_bsp(vcpu))
			apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
		apic_base_msr.host_initiated = true;
		kvm_set_apic_base(vcpu, &apic_base_msr);
	}

	vmx_segment_cache_clear(vmx);

	/* CS gets a non-default selector and base on top of seg_setup(). */
	seg_setup(VCPU_SREG_CS);
	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
	vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);

	seg_setup(VCPU_SREG_DS);
	seg_setup(VCPU_SREG_ES);
	seg_setup(VCPU_SREG_FS);
	seg_setup(VCPU_SREG_GS);
	seg_setup(VCPU_SREG_SS);

	vmcs_write16(GUEST_TR_SELECTOR, 0);
	vmcs_writel(GUEST_TR_BASE, 0);
	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
	vmcs_writel(GUEST_LDTR_BASE, 0);
	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

	if (!init_event) {
		vmcs_write32(GUEST_SYSENTER_CS, 0);
		vmcs_writel(GUEST_SYSENTER_ESP, 0);
		vmcs_writel(GUEST_SYSENTER_EIP, 0);
		vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
	}

	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
	kvm_rip_write(vcpu, 0xfff0);

	vmcs_writel(GUEST_GDTR_BASE, 0);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

	vmcs_writel(GUEST_IDTR_BASE, 0);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
	if (kvm_mpx_supported())
		vmcs_write64(GUEST_BNDCFGS, 0);

	setup_msrs(vmx);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */

	if (cpu_has_vmx_tpr_shadow() && !init_event) {
		/* Cleared first; pointed at the APIC register page if needed. */
		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
		if (cpu_need_tpr_shadow(vcpu))
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
				     __pa(vcpu->arch.apic->regs));
		vmcs_write32(TPR_THRESHOLD, 0);
	}

	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

	cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
	vmx->vcpu.arch.cr0 = cr0;
	vmx_set_cr0(vcpu, cr0); /* enter rmode */
	vmx_set_cr4(vcpu, 0);
	vmx_set_efer(vcpu, 0);

	update_exception_bitmap(vcpu);

	vpid_sync_context(vmx->vpid);
	if (init_event)
		vmx_clear_hlt(vcpu);
}
4343
4344 static void enable_irq_window(struct kvm_vcpu *vcpu)
4345 {
4346         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
4347 }
4348
4349 static void enable_nmi_window(struct kvm_vcpu *vcpu)
4350 {
4351         if (!enable_vnmi ||
4352             vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4353                 enable_irq_window(vcpu);
4354                 return;
4355         }
4356
4357         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
4358 }
4359
4360 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
4361 {
4362         struct vcpu_vmx *vmx = to_vmx(vcpu);
4363         uint32_t intr;
4364         int irq = vcpu->arch.interrupt.nr;
4365
4366         trace_kvm_inj_virq(irq);
4367
4368         ++vcpu->stat.irq_injections;
4369         if (vmx->rmode.vm86_active) {
4370                 int inc_eip = 0;
4371                 if (vcpu->arch.interrupt.soft)
4372                         inc_eip = vcpu->arch.event_exit_inst_len;
4373                 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
4374                 return;
4375         }
4376         intr = irq | INTR_INFO_VALID_MASK;
4377         if (vcpu->arch.interrupt.soft) {
4378                 intr |= INTR_TYPE_SOFT_INTR;
4379                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4380                              vmx->vcpu.arch.event_exit_inst_len);
4381         } else
4382                 intr |= INTR_TYPE_EXT_INTR;
4383         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4384
4385         vmx_clear_hlt(vcpu);
4386 }
4387
4388 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4389 {
4390         struct vcpu_vmx *vmx = to_vmx(vcpu);
4391
4392         if (!enable_vnmi) {
4393                 /*
4394                  * Tracking the NMI-blocked state in software is built upon
4395                  * finding the next open IRQ window. This, in turn, depends on
4396                  * well-behaving guests: They have to keep IRQs disabled at
4397                  * least as long as the NMI handler runs. Otherwise we may
4398                  * cause NMI nesting, maybe breaking the guest. But as this is
4399                  * highly unlikely, we can live with the residual risk.
4400                  */
4401                 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
4402                 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4403         }
4404
4405         ++vcpu->stat.nmi_injections;
4406         vmx->loaded_vmcs->nmi_known_unmasked = false;
4407
4408         if (vmx->rmode.vm86_active) {
4409                 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
4410                 return;
4411         }
4412
4413         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4414                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4415
4416         vmx_clear_hlt(vcpu);
4417 }
4418
4419 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4420 {
4421         struct vcpu_vmx *vmx = to_vmx(vcpu);
4422         bool masked;
4423
4424         if (!enable_vnmi)
4425                 return vmx->loaded_vmcs->soft_vnmi_blocked;
4426         if (vmx->loaded_vmcs->nmi_known_unmasked)
4427                 return false;
4428         masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4429         vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4430         return masked;
4431 }
4432
4433 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4434 {
4435         struct vcpu_vmx *vmx = to_vmx(vcpu);
4436
4437         if (!enable_vnmi) {
4438                 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
4439                         vmx->loaded_vmcs->soft_vnmi_blocked = masked;
4440                         vmx->loaded_vmcs->vnmi_blocked_time = 0;
4441                 }
4442         } else {
4443                 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4444                 if (masked)
4445                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
4446                                       GUEST_INTR_STATE_NMI);
4447                 else
4448                         vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
4449                                         GUEST_INTR_STATE_NMI);
4450         }
4451 }
4452
4453 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
4454 {
4455         if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
4456                 return false;
4457
4458         if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
4459                 return true;
4460
4461         return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4462                 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
4463                  GUEST_INTR_STATE_NMI));
4464 }
4465
4466 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4467 {
4468         if (to_vmx(vcpu)->nested.nested_run_pending)
4469                 return -EBUSY;
4470
4471         /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
4472         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
4473                 return -EBUSY;
4474
4475         return !vmx_nmi_blocked(vcpu);
4476 }
4477
4478 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
4479 {
4480         if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
4481                 return false;
4482
4483         return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
4484                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4485                 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
4486 }
4487
4488 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4489 {
4490         if (to_vmx(vcpu)->nested.nested_run_pending)
4491                 return -EBUSY;
4492
4493        /*
4494         * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
4495         * e.g. if the IRQ arrived asynchronously after checking nested events.
4496         */
4497         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
4498                 return -EBUSY;
4499
4500         return !vmx_interrupt_blocked(vcpu);
4501 }
4502
4503 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4504 {
4505         int ret;
4506
4507         if (enable_unrestricted_guest)
4508                 return 0;
4509
4510         mutex_lock(&kvm->slots_lock);
4511         ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
4512                                       PAGE_SIZE * 3);
4513         mutex_unlock(&kvm->slots_lock);
4514
4515         if (ret)
4516                 return ret;
4517         to_kvm_vmx(kvm)->tss_addr = addr;
4518         return init_rmode_tss(kvm);
4519 }
4520
4521 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
4522 {
4523         to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
4524         return 0;
4525 }
4526
4527 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
4528 {
4529         switch (vec) {
4530         case BP_VECTOR:
4531                 /*
4532                  * Update instruction length as we may reinject the exception
4533                  * from user space while in guest debugging mode.
4534                  */
4535                 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
4536                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4537                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
4538                         return false;
4539                 fallthrough;
4540         case DB_VECTOR:
4541                 return !(vcpu->guest_debug &
4542                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
4543         case DE_VECTOR:
4544         case OF_VECTOR:
4545         case BR_VECTOR:
4546         case UD_VECTOR:
4547         case DF_VECTOR:
4548         case SS_VECTOR:
4549         case GP_VECTOR:
4550         case MF_VECTOR:
4551                 return true;
4552         }
4553         return false;
4554 }
4555
4556 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4557                                   int vec, u32 err_code)
4558 {
4559         /*
4560          * Instruction with address size override prefix opcode 0x67
4561          * Cause the #SS fault with 0 error code in VM86 mode.
4562          */
4563         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
4564                 if (kvm_emulate_instruction(vcpu, 0)) {
4565                         if (vcpu->arch.halt_request) {
4566                                 vcpu->arch.halt_request = 0;
4567                                 return kvm_vcpu_halt(vcpu);
4568                         }
4569                         return 1;
4570                 }
4571                 return 0;
4572         }
4573
4574         /*
4575          * Forward all other exceptions that are valid in real mode.
4576          * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4577          *        the required debugging infrastructure rework.
4578          */
4579         kvm_queue_exception(vcpu, vec);
4580         return 1;
4581 }
4582
/*
 * Raise a machine check on the host on behalf of the guest.
 *
 * The MSRs are assumed to have been set up by the CPU already, and we are
 * assumed to still be running on the CPU where the MCE occurred.  The handler
 * is given a fabricated register frame so that the guest is always treated
 * like user space, whatever context it was actually running in.
 *
 * Compiles to nothing when CONFIG_X86_MCE is not set.
 */
static void kvm_machine_check(void)
{
#ifdef CONFIG_X86_MCE
	struct pt_regs fake_regs = {
		/* Fake ring 3 no matter what the guest ran on */
		.cs = 3,
		.flags = X86_EFLAGS_IF,
	};

	do_machine_check(&fake_regs);
#endif
}
4601
/*
 * Exit handler for machine-check exits.  Nothing is done here because the
 * event is handled by vmx_vcpu_run(); always returns 1.
 */
static int handle_machine_check(struct kvm_vcpu *vcpu)
{
	return 1;
}
4607
4608 /*
4609  * If the host has split lock detection disabled, then #AC is
4610  * unconditionally injected into the guest, which is the pre split lock
4611  * detection behaviour.
4612  *
4613  * If the host has split lock detection enabled then #AC is
4614  * only injected into the guest when:
4615  *  - Guest CPL == 3 (user mode)
4616  *  - Guest has #AC detection enabled in CR0
4617  *  - Guest EFLAGS has AC bit set
4618  */
4619 static inline bool guest_inject_ac(struct kvm_vcpu *vcpu)
4620 {
4621         if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
4622                 return true;
4623
4624         return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
4625                (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
4626 }
4627
/*
 * Exit handler for exceptions and NMIs intercepted from the guest.
 *
 * The checks run in this order: machine check/NMI (nothing to do here), #UD,
 * #GP outside vm86 mode (VMware backdoor), an exception that arrived while
 * another event was being delivered, #PF, exceptions in emulated real mode,
 * and finally a per-vector switch for #DB, #BP, #AC and everything else.
 *
 * Returns 1 when the event was dealt with in-kernel (typically by queueing
 * an exception for the guest), or 0 after vcpu->run has been filled in with
 * an exit reason (KVM_EXIT_INTERNAL_ERROR, KVM_EXIT_DEBUG or
 * KVM_EXIT_EXCEPTION).  Some paths return a helper's result directly.
 */
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_run *kvm_run = vcpu->run;
	u32 intr_info, ex_no, error_code;
	unsigned long cr2, rip, dr6;
	u32 vect_info;

	vect_info = vmx->idt_vectoring_info;
	intr_info = vmx_get_intr_info(vcpu);

	if (is_machine_check(intr_info) || is_nmi(intr_info))
		return 1; /* handled by handle_exception_nmi_irqoff() */

	if (is_invalid_opcode(intr_info))
		return handle_ud(vcpu);

	/* The error code is only read when the exit reports one. */
	error_code = 0;
	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);

	if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
		WARN_ON_ONCE(!enable_vmware_backdoor);

		/*
		 * VMware backdoor emulation on #GP interception only handles
		 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
		 * error code on #GP.
		 */
		if (error_code) {
			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
			return 1;
		}
		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
	}

	/*
	 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
	 * MMIO, it is better to report an internal error.
	 * See the comments in vmx_handle_exit.
	 */
	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
		vcpu->run->internal.ndata = 4;
		vcpu->run->internal.data[0] = vect_info;
		vcpu->run->internal.data[1] = intr_info;
		vcpu->run->internal.data[2] = error_code;
		vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	if (is_page_fault(intr_info)) {
		/* For #PF the exit qualification is used as the faulting address. */
		cr2 = vmx_get_exit_qual(vcpu);
		if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
			/*
			 * EPT will cause page fault only if we need to
			 * detect illegal GPAs.
			 */
			kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
			return 1;
		} else
			return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
	}

	ex_no = intr_info & INTR_INFO_VECTOR_MASK;

	if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
		return handle_rmode_exception(vcpu, ex_no, error_code);

	switch (ex_no) {
	case DB_VECTOR:
		/* For #DB the exit qualification is used as the DR6 payload. */
		dr6 = vmx_get_exit_qual(vcpu);
		if (!(vcpu->guest_debug &
		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
			if (is_icebp(intr_info))
				WARN_ON(!skip_emulated_instruction(vcpu));

			kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
			return 1;
		}
		kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
		kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
		fallthrough;
	case BP_VECTOR:
		/*
		 * Update instruction length as we may reinject #BP from
		 * user space while in guest debugging mode. Reading it for
		 * #DB as well causes no harm, it is not used in that case.
		 */
		vmx->vcpu.arch.event_exit_inst_len =
			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
		kvm_run->exit_reason = KVM_EXIT_DEBUG;
		rip = kvm_rip_read(vcpu);
		/* Reported pc is CS base plus RIP. */
		kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
		kvm_run->debug.arch.exception = ex_no;
		break;
	case AC_VECTOR:
		if (guest_inject_ac(vcpu)) {
			kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
			return 1;
		}

		/*
		 * Handle split lock. Depending on detection mode this will
		 * either warn and disable split lock detection for this
		 * task or force SIGBUS on it.
		 */
		if (handle_guest_split_lock(kvm_rip_read(vcpu)))
			return 1;
		fallthrough;
	default:
		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
		kvm_run->ex.exception = ex_no;
		kvm_run->ex.error_code = error_code;
		break;
	}
	return 0;
}
4748
4749 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
4750 {
4751         ++vcpu->stat.irq_exits;
4752         return 1;
4753 }
4754
4755 static int handle_triple_fault(struct kvm_vcpu *vcpu)
4756 {
4757         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
4758         vcpu->mmio_needed = 0;
4759         return 0;
4760 }
4761
4762 static int handle_io(struct kvm_vcpu *vcpu)
4763 {
4764         unsigned long exit_qualification;
4765         int size, in, string;
4766         unsigned port;
4767
4768         exit_qualification = vmx_get_exit_qual(vcpu);
4769         string = (exit_qualification & 16) != 0;
4770
4771         ++vcpu->stat.io_exits;
4772
4773         if (string)
4774                 return kvm_emulate_instruction(vcpu, 0);
4775
4776         port = exit_qualification >> 16;
4777         size = (exit_qualification & 7) + 1;
4778         in = (exit_qualification & 8) != 0;
4779
4780         return kvm_fast_pio(vcpu, size, port, in);
4781 }
4782
/*
 * Write the three-byte VMCALL encoding (0f 01 c1) to the start of
 * @hypercall.  @vcpu is not used.
 */
static void
vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
        static const unsigned char vmcall_insn[] = { 0x0f, 0x01, 0xc1 };
        unsigned int i;

        for (i = 0; i < sizeof(vmcall_insn); i++)
                hypercall[i] = vmcall_insn[i];
}
4793
4794 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4795 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4796 {
4797         if (is_guest_mode(vcpu)) {
4798                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4799                 unsigned long orig_val = val;
4800
4801                 /*
4802                  * We get here when L2 changed cr0 in a way that did not change
4803                  * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
4804                  * but did change L0 shadowed bits. So we first calculate the
4805                  * effective cr0 value that L1 would like to write into the
4806                  * hardware. It consists of the L2-owned bits from the new
4807                  * value combined with the L1-owned bits from L1's guest_cr0.
4808                  */
4809                 val = (val & ~vmcs12->cr0_guest_host_mask) |
4810                         (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
4811
4812                 if (!nested_guest_cr0_valid(vcpu, val))
4813                         return 1;
4814
4815                 if (kvm_set_cr0(vcpu, val))
4816                         return 1;
4817                 vmcs_writel(CR0_READ_SHADOW, orig_val);
4818                 return 0;
4819         } else {
4820                 if (to_vmx(vcpu)->nested.vmxon &&
4821                     !nested_host_cr0_valid(vcpu, val))
4822                         return 1;
4823
4824                 return kvm_set_cr0(vcpu, val);
4825         }
4826 }
4827
4828 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
4829 {
4830         if (is_guest_mode(vcpu)) {
4831                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4832                 unsigned long orig_val = val;
4833
4834                 /* analogously to handle_set_cr0 */
4835                 val = (val & ~vmcs12->cr4_guest_host_mask) |
4836                         (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
4837                 if (kvm_set_cr4(vcpu, val))
4838                         return 1;
4839                 vmcs_writel(CR4_READ_SHADOW, orig_val);
4840                 return 0;
4841         } else
4842                 return kvm_set_cr4(vcpu, val);
4843 }
4844
4845 static int handle_desc(struct kvm_vcpu *vcpu)
4846 {
4847         WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
4848         return kvm_emulate_instruction(vcpu, 0);
4849 }
4850
/*
 * Control-register access exit.
 *
 * The exit qualification is decoded as: bits 3:0 = control register number,
 * bits 5:4 = access type (0 = mov to cr, 1 = mov from cr, 2 = clts,
 * 3 = lmsw), bits 11:8 = general purpose register operand.
 *
 * Combinations that no case handles fall out of the switch to the tail of
 * the function, which sets exit_reason to 0, logs the access via
 * vcpu_unimpl() and returns 0.
 */
static int handle_cr(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification, val;
        int cr;
        int reg;
        int err;
        int ret;

        exit_qualification = vmx_get_exit_qual(vcpu);
        cr = exit_qualification & 15;
        reg = (exit_qualification >> 8) & 15;
        switch ((exit_qualification >> 4) & 3) {
        case 0: /* mov to cr */
                val = kvm_register_readl(vcpu, reg);
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
                        err = handle_set_cr0(vcpu, val);
                        return kvm_complete_insn_gp(vcpu, err);
                case 3:
                        WARN_ON_ONCE(enable_unrestricted_guest);
                        err = kvm_set_cr3(vcpu, val);
                        return kvm_complete_insn_gp(vcpu, err);
                case 4:
                        err = handle_set_cr4(vcpu, val);
                        return kvm_complete_insn_gp(vcpu, err);
                case 8: {
                                u8 cr8_prev = kvm_get_cr8(vcpu);
                                /*
                                 * NOTE(review): val is truncated to 8 bits
                                 * here, so kvm_set_cr8() never sees any
                                 * higher bits the guest wrote - confirm this
                                 * is intended.
                                 */
                                u8 cr8 = (u8)val;
                                err = kvm_set_cr8(vcpu, cr8);
                                ret = kvm_complete_insn_gp(vcpu, err);
                                if (lapic_in_kernel(vcpu))
                                        return ret;
                                /*
                                 * Without an in-kernel lapic, only a write
                                 * that lowers CR8 is reported to userspace.
                                 */
                                if (cr8_prev <= cr8)
                                        return ret;
                                /*
                                 * TODO: we might be squashing a
                                 * KVM_GUESTDBG_SINGLESTEP-triggered
                                 * KVM_EXIT_DEBUG here.
                                 */
                                vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
                                return 0;
                        }
                }
                /* Any other control register: report as unhandled below. */
                break;
        case 2: /* clts */
                WARN_ONCE(1, "Guest should always own CR0.TS");
                vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
                trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
                return kvm_skip_emulated_instruction(vcpu);
        case 1: /* mov from cr */
                switch (cr) {
                case 3:
                        WARN_ON_ONCE(enable_unrestricted_guest);
                        val = kvm_read_cr3(vcpu);
                        kvm_register_write(vcpu, reg, val);
                        trace_kvm_cr_read(cr, val);
                        return kvm_skip_emulated_instruction(vcpu);
                case 8:
                        val = kvm_get_cr8(vcpu);
                        kvm_register_write(vcpu, reg, val);
                        trace_kvm_cr_read(cr, val);
                        return kvm_skip_emulated_instruction(vcpu);
                }
                /* Any other control register: report as unhandled below. */
                break;
        case 3: /* lmsw */
                /* Only the low four bits of the LMSW source are used. */
                val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
                trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
                kvm_lmsw(vcpu, val);

                return kvm_skip_emulated_instruction(vcpu);
        default:
                break;
        }
        /* Unhandled access type / register combination. */
        vcpu->run->exit_reason = 0;
        vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
               (int)(exit_qualification >> 4) & 3, cr);
        return 0;
}
4930
4931 static int handle_dr(struct kvm_vcpu *vcpu)
4932 {
4933         unsigned long exit_qualification;
4934         int dr, dr7, reg;
4935
4936         exit_qualification = vmx_get_exit_qual(vcpu);
4937         dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
4938
4939         /* First, if DR does not exist, trigger UD */
4940         if (!kvm_require_dr(vcpu, dr))
4941                 return 1;
4942
4943         /* Do not handle if the CPL > 0, will trigger GP on re-entry */
4944         if (!kvm_require_cpl(vcpu, 0))
4945                 return 1;
4946         dr7 = vmcs_readl(GUEST_DR7);
4947         if (dr7 & DR7_GD) {
4948                 /*
4949                  * As the vm-exit takes precedence over the debug trap, we
4950                  * need to emulate the latter, either for the host or the
4951                  * guest debugging itself.
4952                  */
4953                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4954                         vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1;
4955                         vcpu->run->debug.arch.dr7 = dr7;
4956                         vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
4957                         vcpu->run->debug.arch.exception = DB_VECTOR;
4958                         vcpu->run->exit_reason = KVM_EXIT_DEBUG;
4959                         return 0;
4960                 } else {
4961                         kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
4962                         return 1;
4963                 }
4964         }
4965
4966         if (vcpu->guest_debug == 0) {
4967                 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
4968
4969                 /*
4970                  * No more DR vmexits; force a reload of the debug registers
4971                  * and reenter on this instruction.  The next vmexit will
4972                  * retrieve the full state of the debug registers.
4973                  */
4974                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
4975                 return 1;
4976         }
4977
4978         reg = DEBUG_REG_ACCESS_REG(exit_qualification);
4979         if (exit_qualification & TYPE_MOV_FROM_DR) {
4980                 unsigned long val;
4981
4982                 if (kvm_get_dr(vcpu, dr, &val))
4983                         return 1;
4984                 kvm_register_write(vcpu, reg, val);
4985         } else
4986                 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
4987                         return 1;
4988
4989         return kvm_skip_emulated_instruction(vcpu);
4990 }
4991
4992 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
4993 {
4994         get_debugreg(vcpu->arch.db[0], 0);
4995         get_debugreg(vcpu->arch.db[1], 1);
4996         get_debugreg(vcpu->arch.db[2], 2);
4997         get_debugreg(vcpu->arch.db[3], 3);
4998         get_debugreg(vcpu->arch.dr6, 6);
4999         vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5000
5001         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5002         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5003 }
5004
5005 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5006 {
5007         vmcs_writel(GUEST_DR7, val);
5008 }
5009
/*
 * TPR-below-threshold exit: call kvm_apic_update_ppr() and resume the
 * guest.
 */
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
        kvm_apic_update_ppr(vcpu);

        return 1;
}
5015
5016 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5017 {
5018         exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5019
5020         kvm_make_request(KVM_REQ_EVENT, vcpu);
5021
5022         ++vcpu->stat.irq_window_exits;
5023         return 1;
5024 }
5025
/* VMCALL exit: handled entirely by kvm_emulate_hypercall(). */
static int handle_vmcall(struct kvm_vcpu *vcpu)
{
        return kvm_emulate_hypercall(vcpu);
}
5030
/* INVD exit: the instruction is treated as a NOP and simply skipped. */
static int handle_invd(struct kvm_vcpu *vcpu)
{
        return kvm_skip_emulated_instruction(vcpu);
}
5036
/*
 * INVLPG exit: pass the exit qualification to kvm_mmu_invlpg() and skip
 * the instruction.
 */
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
        kvm_mmu_invlpg(vcpu, vmx_get_exit_qual(vcpu));

        return kvm_skip_emulated_instruction(vcpu);
}
5044
/*
 * RDPMC exit: run kvm_rdpmc() and let kvm_complete_insn_gp() finish the
 * instruction based on its result.
 */
static int handle_rdpmc(struct kvm_vcpu *vcpu)
{
        return kvm_complete_insn_gp(vcpu, kvm_rdpmc(vcpu));
}
5052
/* WBINVD exit: handled entirely by kvm_emulate_wbinvd(). */
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
        return kvm_emulate_wbinvd(vcpu);
}
5057
5058 static int handle_xsetbv(struct kvm_vcpu *vcpu)
5059 {
5060         u64 new_bv = kvm_read_edx_eax(vcpu);
5061         u32 index = kvm_rcx_read(vcpu);
5062
5063         if (kvm_set_xcr(vcpu, index, new_bv) == 0)
5064                 return kvm_skip_emulated_instruction(vcpu);
5065         return 1;
5066 }
5067
5068 static int handle_apic_access(struct kvm_vcpu *vcpu)
5069 {
5070         if (likely(fasteoi)) {
5071                 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5072                 int access_type, offset;
5073
5074                 access_type = exit_qualification & APIC_ACCESS_TYPE;
5075                 offset = exit_qualification & APIC_ACCESS_OFFSET;
5076                 /*
5077                  * Sane guest uses MOV to write EOI, with written value
5078                  * not cared. So make a short-circuit here by avoiding
5079                  * heavy instruction emulation.
5080                  */
5081                 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5082                     (offset == APIC_EOI)) {
5083                         kvm_lapic_set_eoi(vcpu);
5084                         return kvm_skip_emulated_instruction(vcpu);
5085                 }
5086         }
5087         return kvm_emulate_instruction(vcpu, 0);
5088 }
5089
/*
 * EOI-induced exit.  The vector is the low byte of the exit qualification.
 * The exit is trap-like, so the instruction pointer is left alone.
 */
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
        int vector = vmx_get_exit_qual(vcpu) & 0xff;

        kvm_apic_set_eoi_accelerated(vcpu, vector);

        return 1;
}
5099
5100 static int handle_apic_write(struct kvm_vcpu *vcpu)
5101 {
5102         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5103         u32 offset = exit_qualification & 0xfff;
5104
5105         /* APIC-write VM exit is trap-like and thus no need to adjust IP */
5106         kvm_apic_write_nodecode(vcpu, offset);
5107         return 1;
5108 }
5109
5110 static int handle_task_switch(struct kvm_vcpu *vcpu)
5111 {
5112         struct vcpu_vmx *vmx = to_vmx(vcpu);
5113         unsigned long exit_qualification;
5114         bool has_error_code = false;
5115         u32 error_code = 0;
5116         u16 tss_selector;
5117         int reason, type, idt_v, idt_index;
5118
5119         idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5120         idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5121         type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5122
5123         exit_qualification = vmx_get_exit_qual(vcpu);
5124
5125         reason = (u32)exit_qualification >> 30;
5126         if (reason == TASK_SWITCH_GATE && idt_v) {
5127                 switch (type) {
5128                 case INTR_TYPE_NMI_INTR:
5129                         vcpu->arch.nmi_injected = false;
5130                         vmx_set_nmi_mask(vcpu, true);
5131                         break;
5132                 case INTR_TYPE_EXT_INTR:
5133                 case INTR_TYPE_SOFT_INTR:
5134                         kvm_clear_interrupt_queue(vcpu);
5135                         break;
5136                 case INTR_TYPE_HARD_EXCEPTION:
5137                         if (vmx->idt_vectoring_info &
5138                             VECTORING_INFO_DELIVER_CODE_MASK) {
5139                                 has_error_code = true;
5140                                 error_code =
5141                                         vmcs_read32(IDT_VECTORING_ERROR_CODE);
5142                         }
5143                         fallthrough;
5144                 case INTR_TYPE_SOFT_EXCEPTION:
5145                         kvm_clear_exception_queue(vcpu);
5146                         break;
5147                 default:
5148                         break;
5149                 }
5150         }
5151         tss_selector = exit_qualification;
5152
5153         if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5154                        type != INTR_TYPE_EXT_INTR &&
5155                        type != INTR_TYPE_NMI_INTR))
5156                 WARN_ON(!skip_emulated_instruction(vcpu));
5157
5158         /*
5159          * TODO: What about debug traps on tss switch?
5160          *       Are we supposed to inject them and update dr6?
5161          */
5162         return kvm_task_switch(vcpu, tss_selector,
5163                                type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
5164                                reason, has_error_code, error_code);
5165 }
5166
5167 static int handle_ept_violation(struct kvm_vcpu *vcpu)
5168 {
5169         unsigned long exit_qualification;
5170         gpa_t gpa;
5171         u64 error_code;
5172
5173         exit_qualification = vmx_get_exit_qual(vcpu);
5174
5175         /*
5176          * EPT violation happened while executing iret from NMI,
5177          * "blocked by NMI" bit has to be set before next VM entry.
5178          * There are errata that may cause this bit to not be set:
5179          * AAK134, BY25.
5180          */
5181         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5182                         enable_vnmi &&
5183                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5184                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5185
5186         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5187         trace_kvm_page_fault(gpa, exit_qualification);
5188
5189         /* Is it a read fault? */
5190         error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
5191                      ? PFERR_USER_MASK : 0;
5192         /* Is it a write fault? */
5193         error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
5194                       ? PFERR_WRITE_MASK : 0;
5195         /* Is it a fetch fault? */
5196         error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
5197                       ? PFERR_FETCH_MASK : 0;
5198         /* ept page table entry is present? */
5199         error_code |= (exit_qualification &
5200                        (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
5201                         EPT_VIOLATION_EXECUTABLE))
5202                       ? PFERR_PRESENT_MASK : 0;
5203
5204         error_code |= (exit_qualification & 0x100) != 0 ?
5205                PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
5206
5207         vcpu->arch.exit_qualification = exit_qualification;
5208
5209         /*
5210          * Check that the GPA doesn't exceed physical memory limits, as that is
5211          * a guest page fault.  We have to emulate the instruction here, because
5212          * if the illegal address is that of a paging structure, then
5213          * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
5214          * would also use advanced VM-exit information for EPT violations to
5215          * reconstruct the page fault error code.
5216          */
5217         if (unlikely(kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
5218                 return kvm_emulate_instruction(vcpu, 0);
5219
5220         return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5221 }
5222
5223 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5224 {
5225         gpa_t gpa;
5226
5227         /*
5228          * A nested guest cannot optimize MMIO vmexits, because we have an
5229          * nGPA here instead of the required GPA.
5230          */
5231         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5232         if (!is_guest_mode(vcpu) &&
5233             !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5234                 trace_kvm_fast_mmio(gpa);
5235                 return kvm_skip_emulated_instruction(vcpu);
5236         }
5237
5238         return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
5239 }
5240
5241 static int handle_nmi_window(struct kvm_vcpu *vcpu)
5242 {
5243         WARN_ON_ONCE(!enable_vnmi);
5244         exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5245         ++vcpu->stat.nmi_window_exits;
5246         kvm_make_request(KVM_REQ_EVENT, vcpu);
5247
5248         return 1;
5249 }
5250
5251 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5252 {
5253         struct vcpu_vmx *vmx = to_vmx(vcpu);
5254         bool intr_window_requested;
5255         unsigned count = 130;
5256
5257         intr_window_requested = exec_controls_get(vmx) &
5258                                 CPU_BASED_INTR_WINDOW_EXITING;
5259
5260         while (vmx->emulation_required && count-- != 0) {
5261                 if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
5262                         return handle_interrupt_window(&vmx->vcpu);
5263
5264                 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
5265                         return 1;
5266
5267                 if (!kvm_emulate_instruction(vcpu, 0))
5268                         return 0;
5269
5270                 if (vmx->emulation_required && !vmx->rmode.vm86_active &&
5271                     vcpu->arch.exception.pending) {
5272                         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5273                         vcpu->run->internal.suberror =
5274                                                 KVM_INTERNAL_ERROR_EMULATION;
5275                         vcpu->run->internal.ndata = 0;
5276                         return 0;
5277                 }
5278
5279                 if (vcpu->arch.halt_request) {
5280                         vcpu->arch.halt_request = 0;
5281                         return kvm_vcpu_halt(vcpu);
5282                 }
5283
5284                 /*
5285                  * Note, return 1 and not 0, vcpu_run() will invoke
5286                  * xfer_to_guest_mode() which will create a proper return
5287                  * code.
5288                  */
5289                 if (__xfer_to_guest_mode_work_pending())
5290                         return 1;
5291         }
5292
5293         return 1;
5294 }
5295
5296 static void grow_ple_window(struct kvm_vcpu *vcpu)
5297 {
5298         struct vcpu_vmx *vmx = to_vmx(vcpu);
5299         unsigned int old = vmx->ple_window;
5300
5301         vmx->ple_window = __grow_ple_window(old, ple_window,
5302                                             ple_window_grow,
5303                                             ple_window_max);
5304
5305         if (vmx->ple_window != old) {
5306                 vmx->ple_window_dirty = true;
5307                 trace_kvm_ple_window_update(vcpu->vcpu_id,
5308                                             vmx->ple_window, old);
5309         }
5310 }
5311
5312 static void shrink_ple_window(struct kvm_vcpu *vcpu)
5313 {
5314         struct vcpu_vmx *vmx = to_vmx(vcpu);
5315         unsigned int old = vmx->ple_window;
5316
5317         vmx->ple_window = __shrink_ple_window(old, ple_window,
5318                                               ple_window_shrink,
5319                                               ple_window);
5320
5321         if (vmx->ple_window != old) {
5322                 vmx->ple_window_dirty = true;
5323                 trace_kvm_ple_window_update(vcpu->vcpu_id,
5324                                             vmx->ple_window, old);
5325         }
5326 }
5327
5328 static void vmx_enable_tdp(void)
5329 {
5330         kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
5331                 enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
5332                 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
5333                 0ull, VMX_EPT_EXECUTABLE_MASK,
5334                 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
5335                 VMX_EPT_RWX_MASK, 0ull);
5336
5337         ept_set_mmio_spte_mask();
5338 }
5339
5340 /*
5341  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
5342  * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
5343  */
5344 static int handle_pause(struct kvm_vcpu *vcpu)
5345 {
5346         if (!kvm_pause_in_guest(vcpu->kvm))
5347                 grow_ple_window(vcpu);
5348
5349         /*
5350          * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
5351          * VM-execution control is ignored if CPL > 0. OTOH, KVM
5352          * never set PAUSE_EXITING and just set PLE if supported,
5353          * so the vcpu must be CPL=0 if it gets a PAUSE exit.
5354          */
5355         kvm_vcpu_on_spin(vcpu, true);
5356         return kvm_skip_emulated_instruction(vcpu);
5357 }
5358
/* Treat the exiting instruction as a NOP: skip it and do nothing else. */
static int handle_nop(struct kvm_vcpu *vcpu)
{
        return kvm_skip_emulated_instruction(vcpu);
}
5363
5364 static int handle_mwait(struct kvm_vcpu *vcpu)
5365 {
5366         printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
5367         return handle_nop(vcpu);
5368 }
5369
5370 static int handle_invalid_op(struct kvm_vcpu *vcpu)
5371 {
5372         kvm_queue_exception(vcpu, UD_VECTOR);
5373         return 1;
5374 }
5375
/* Monitor-trap-flag exit: nothing to do here, just resume the guest. */
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
        return 1;
}
5380
5381 static int handle_monitor(struct kvm_vcpu *vcpu)
5382 {
5383         printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
5384         return handle_nop(vcpu);
5385 }
5386
5387 static int handle_invpcid(struct kvm_vcpu *vcpu)
5388 {
5389         u32 vmx_instruction_info;
5390         unsigned long type;
5391         gva_t gva;
5392         struct {
5393                 u64 pcid;
5394                 u64 gla;
5395         } operand;
5396
5397         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
5398                 kvm_queue_exception(vcpu, UD_VECTOR);
5399                 return 1;
5400         }
5401
5402         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5403         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5404
5405         if (type > 3) {
5406                 kvm_inject_gp(vcpu, 0);
5407                 return 1;
5408         }
5409
5410         /* According to the Intel instruction reference, the memory operand
5411          * is read even if it isn't needed (e.g., for type==all)
5412          */
5413         if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5414                                 vmx_instruction_info, false,
5415                                 sizeof(operand), &gva))
5416                 return 1;
5417
5418         return kvm_handle_invpcid(vcpu, type, gva);
5419 }
5420
5421 static int handle_pml_full(struct kvm_vcpu *vcpu)
5422 {
5423         unsigned long exit_qualification;
5424
5425         trace_kvm_pml_full(vcpu->vcpu_id);
5426
5427         exit_qualification = vmx_get_exit_qual(vcpu);
5428
5429         /*
5430          * PML buffer FULL happened while executing iret from NMI,
5431          * "blocked by NMI" bit has to be set before next VM entry.
5432          */
5433         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5434                         enable_vnmi &&
5435                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5436                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5437                                 GUEST_INTR_STATE_NMI);
5438
5439         /*
5440          * PML buffer already flushed at beginning of VMEXIT. Nothing to do
5441          * here.., and there's no userspace involvement needed for PML.
5442          */
5443         return 1;
5444 }
5445
5446 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
5447 {
5448         struct vcpu_vmx *vmx = to_vmx(vcpu);
5449
5450         if (!vmx->req_immediate_exit &&
5451             !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
5452                 kvm_lapic_expired_hv_timer(vcpu);
5453                 return EXIT_FASTPATH_REENTER_GUEST;
5454         }
5455
5456         return EXIT_FASTPATH_NONE;
5457 }
5458
/*
 * Preemption-timer exit (slow path): reuse the fast-path handler for its
 * side effects, ignore its verdict and resume the guest.
 */
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
        handle_fastpath_preemption_timer(vcpu);

        return 1;
}
5464
5465 /*
5466  * When nested=0, all VMX instruction VM Exits filter here.  The handlers
5467  * are overwritten by nested_vmx_setup() when nested=1.
5468  */
5469 static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
5470 {
5471         kvm_queue_exception(vcpu, UD_VECTOR);
5472         return 1;
5473 }
5474
5475 static int handle_encls(struct kvm_vcpu *vcpu)
5476 {
5477         /*
5478          * SGX virtualization is not yet supported.  There is no software
5479          * enable bit for SGX, so we have to trap ENCLS and inject a #UD
5480          * to prevent the guest from executing ENCLS.
5481          */
5482         kvm_queue_exception(vcpu, UD_VECTOR);
5483         return 1;
5484 }
5485
5486 /*
5487  * The exit handlers return 1 if the exit was handled fully and guest execution
5488  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
5489  * to be done to userspace and return 0.
5490  */
5491 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5492         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
5493         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
5494         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
5495         [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
5496         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
5497         [EXIT_REASON_CR_ACCESS]               = handle_cr,
5498         [EXIT_REASON_DR_ACCESS]               = handle_dr,
5499         [EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
5500         [EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
5501         [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
5502         [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
5503         [EXIT_REASON_HLT]                     = kvm_emulate_halt,
5504         [EXIT_REASON_INVD]                    = handle_invd,
5505         [EXIT_REASON_INVLPG]                  = handle_invlpg,
5506         [EXIT_REASON_RDPMC]                   = handle_rdpmc,
5507         [EXIT_REASON_VMCALL]                  = handle_vmcall,
5508         [EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
5509         [EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
5510         [EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
5511         [EXIT_REASON_VMPTRST]                 = handle_vmx_instruction,
5512         [EXIT_REASON_VMREAD]                  = handle_vmx_instruction,
5513         [EXIT_REASON_VMRESUME]                = handle_vmx_instruction,
5514         [EXIT_REASON_VMWRITE]                 = handle_vmx_instruction,
5515         [EXIT_REASON_VMOFF]                   = handle_vmx_instruction,
5516         [EXIT_REASON_VMON]                    = handle_vmx_instruction,
5517         [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
5518         [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
5519         [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
5520         [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
5521         [EXIT_REASON_WBINVD]                  = handle_wbinvd,
5522         [EXIT_REASON_XSETBV]                  = handle_xsetbv,
5523         [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
5524         [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
5525         [EXIT_REASON_GDTR_IDTR]               = handle_desc,
5526         [EXIT_REASON_LDTR_TR]                 = handle_desc,
5527         [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
5528         [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
5529         [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
5530         [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_mwait,
5531         [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
5532         [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
5533         [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
5534         [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
5535         [EXIT_REASON_RDRAND]                  = handle_invalid_op,
5536         [EXIT_REASON_RDSEED]                  = handle_invalid_op,
5537         [EXIT_REASON_PML_FULL]                = handle_pml_full,
5538         [EXIT_REASON_INVPCID]                 = handle_invpcid,
5539         [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
5540         [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
5541         [EXIT_REASON_ENCLS]                   = handle_encls,
5542 };
5543
/*
 * Number of slots in kvm_vmx_exit_handlers[].  vmx_handle_exit() uses this as
 * the upper bound when indexing the table with an exit reason.
 */
static const int kvm_vmx_max_exit_handlers =
        ARRAY_SIZE(kvm_vmx_exit_handlers);
5546
5547 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
5548                               u32 *intr_info, u32 *error_code)
5549 {
5550         struct vcpu_vmx *vmx = to_vmx(vcpu);
5551
5552         *info1 = vmx_get_exit_qual(vcpu);
5553         if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
5554                 *info2 = vmx->idt_vectoring_info;
5555                 *intr_info = vmx_get_intr_info(vcpu);
5556                 if (is_exception_with_error_code(*intr_info))
5557                         *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5558                 else
5559                         *error_code = 0;
5560         } else {
5561                 *info2 = 0;
5562                 *intr_info = 0;
5563                 *error_code = 0;
5564         }
5565 }
5566
5567 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
5568 {
5569         if (vmx->pml_pg) {
5570                 __free_page(vmx->pml_pg);
5571                 vmx->pml_pg = NULL;
5572         }
5573 }
5574
5575 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
5576 {
5577         struct vcpu_vmx *vmx = to_vmx(vcpu);
5578         u64 *pml_buf;
5579         u16 pml_idx;
5580
5581         pml_idx = vmcs_read16(GUEST_PML_INDEX);
5582
5583         /* Do nothing if PML buffer is empty */
5584         if (pml_idx == (PML_ENTITY_NUM - 1))
5585                 return;
5586
5587         /* PML index always points to next available PML buffer entity */
5588         if (pml_idx >= PML_ENTITY_NUM)
5589                 pml_idx = 0;
5590         else
5591                 pml_idx++;
5592
5593         pml_buf = page_address(vmx->pml_pg);
5594         for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
5595                 u64 gpa;
5596
5597                 gpa = pml_buf[pml_idx];
5598                 WARN_ON(gpa & (PAGE_SIZE - 1));
5599                 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
5600         }
5601
5602         /* reset PML index */
5603         vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
5604 }
5605
5606 /*
5607  * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
5608  * Called before reporting dirty_bitmap to userspace.
5609  */
5610 static void kvm_flush_pml_buffers(struct kvm *kvm)
5611 {
5612         int i;
5613         struct kvm_vcpu *vcpu;
5614         /*
5615          * We only need to kick vcpu out of guest mode here, as PML buffer
5616          * is flushed at beginning of all VMEXITs, and it's obvious that only
5617          * vcpus running in guest are possible to have unflushed GPAs in PML
5618          * buffer.
5619          */
5620         kvm_for_each_vcpu(i, vcpu, kvm)
5621                 kvm_vcpu_kick(vcpu);
5622 }
5623
5624 static void vmx_dump_sel(char *name, uint32_t sel)
5625 {
5626         pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
5627                name, vmcs_read16(sel),
5628                vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
5629                vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
5630                vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
5631 }
5632
5633 static void vmx_dump_dtsel(char *name, uint32_t limit)
5634 {
5635         pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
5636                name, vmcs_read32(limit),
5637                vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
5638 }
5639
5640 void dump_vmcs(void)
5641 {
5642         u32 vmentry_ctl, vmexit_ctl;
5643         u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
5644         unsigned long cr4;
5645         u64 efer;
5646
5647         if (!dump_invalid_vmcs) {
5648                 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
5649                 return;
5650         }
5651
5652         vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
5653         vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
5654         cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5655         pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
5656         cr4 = vmcs_readl(GUEST_CR4);
5657         efer = vmcs_read64(GUEST_IA32_EFER);
5658         secondary_exec_control = 0;
5659         if (cpu_has_secondary_exec_ctrls())
5660                 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
5661
5662         pr_err("*** Guest State ***\n");
5663         pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
5664                vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
5665                vmcs_readl(CR0_GUEST_HOST_MASK));
5666         pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
5667                cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
5668         pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
5669         if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
5670             (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
5671         {
5672                 pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
5673                        vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
5674                 pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
5675                        vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
5676         }
5677         pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
5678                vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
5679         pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
5680                vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
5681         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
5682                vmcs_readl(GUEST_SYSENTER_ESP),
5683                vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
5684         vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
5685         vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
5686         vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
5687         vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
5688         vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
5689         vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
5690         vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
5691         vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
5692         vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
5693         vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
5694         if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
5695             (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
5696                 pr_err("EFER =     0x%016llx  PAT = 0x%016llx\n",
5697                        efer, vmcs_read64(GUEST_IA32_PAT));
5698         pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
5699                vmcs_read64(GUEST_IA32_DEBUGCTL),
5700                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
5701         if (cpu_has_load_perf_global_ctrl() &&
5702             vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
5703                 pr_err("PerfGlobCtl = 0x%016llx\n",
5704                        vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
5705         if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
5706                 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
5707         pr_err("Interruptibility = %08x  ActivityState = %08x\n",
5708                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
5709                vmcs_read32(GUEST_ACTIVITY_STATE));
5710         if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
5711                 pr_err("InterruptStatus = %04x\n",
5712                        vmcs_read16(GUEST_INTR_STATUS));
5713
5714         pr_err("*** Host State ***\n");
5715         pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
5716                vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
5717         pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
5718                vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
5719                vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
5720                vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
5721                vmcs_read16(HOST_TR_SELECTOR));
5722         pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
5723                vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
5724                vmcs_readl(HOST_TR_BASE));
5725         pr_err("GDTBase=%016lx IDTBase=%016lx\n",
5726                vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
5727         pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
5728                vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
5729                vmcs_readl(HOST_CR4));
5730         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
5731                vmcs_readl(HOST_IA32_SYSENTER_ESP),
5732                vmcs_read32(HOST_IA32_SYSENTER_CS),
5733                vmcs_readl(HOST_IA32_SYSENTER_EIP));
5734         if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
5735                 pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
5736                        vmcs_read64(HOST_IA32_EFER),
5737                        vmcs_read64(HOST_IA32_PAT));
5738         if (cpu_has_load_perf_global_ctrl() &&
5739             vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
5740                 pr_err("PerfGlobCtl = 0x%016llx\n",
5741                        vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
5742
5743         pr_err("*** Control State ***\n");
5744         pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
5745                pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
5746         pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
5747         pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
5748                vmcs_read32(EXCEPTION_BITMAP),
5749                vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
5750                vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
5751         pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
5752                vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
5753                vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
5754                vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
5755         pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
5756                vmcs_read32(VM_EXIT_INTR_INFO),
5757                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5758                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
5759         pr_err("        reason=%08x qualification=%016lx\n",
5760                vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
5761         pr_err("IDTVectoring: info=%08x errcode=%08x\n",
5762                vmcs_read32(IDT_VECTORING_INFO_FIELD),
5763                vmcs_read32(IDT_VECTORING_ERROR_CODE));
5764         pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
5765         if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
5766                 pr_err("TSC Multiplier = 0x%016llx\n",
5767                        vmcs_read64(TSC_MULTIPLIER));
5768         if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
5769                 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
5770                         u16 status = vmcs_read16(GUEST_INTR_STATUS);
5771                         pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
5772                 }
5773                 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
5774                 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
5775                         pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
5776                 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
5777         }
5778         if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
5779                 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
5780         if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
5781                 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
5782         if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
5783                 pr_err("PLE Gap=%08x Window=%08x\n",
5784                        vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
5785         if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
5786                 pr_err("Virtual processor ID = 0x%04x\n",
5787                        vmcs_read16(VIRTUAL_PROCESSOR_ID));
5788 }
5789
/*
 * The guest has exited.  See if we can fix it or if we need userspace
 * assistance.
 *
 * @vcpu:          the vCPU that exited
 * @exit_fastpath: result of the earlier fast-path handling; any value other
 *                 than EXIT_FASTPATH_NONE short-circuits the handler dispatch
 *
 * Returns 0 on the paths that fill in vcpu->run with an exit/error report
 * (presumably "exit to userspace" -- confirm against the caller), 1 when a
 * nested exit was reflected or the fast path already handled the exit, and
 * otherwise whatever the selected exit handler returns.
 */
static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exit_reason = vmx->exit_reason;
        u32 vectoring_info = vmx->idt_vectoring_info;

        /*
         * Flush the GPAs logged in the PML buffer so that the dirty bitmap is
         * as up to date as possible.  A further benefit: before querying the
         * dirty bitmap, kvm_vm_ioctl_get_dirty_log only needs to kick all
         * vCPUs out of guest mode, because a vCPU in root mode must already
         * have flushed its PML buffer.
         */
        if (enable_pml)
                vmx_flush_pml_buffer(vcpu);

        /*
         * We should never reach this point with a pending nested VM-Enter, and
         * more specifically emulation of L2 due to invalid guest state (see
         * below) should never happen as that means we incorrectly allowed a
         * nested VM-Enter with an invalid vmcs12.
         */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);

        /* If guest state is invalid, start emulating */
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);

        if (is_guest_mode(vcpu)) {
                /*
                 * The host physical addresses of some pages of guest memory
                 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
                 * Page). The CPU may write to these pages via their host
                 * physical address while L2 is running, bypassing any
                 * address-translation-based dirty tracking (e.g. EPT write
                 * protection).
                 *
                 * Mark them dirty on every exit from L2 to prevent them from
                 * getting out of sync with dirty tracking.
                 */
                nested_mark_vmcs12_pages_dirty(vcpu);

                /* Nothing more to do here if the exit was reflected. */
                if (nested_vmx_reflect_vmexit(vcpu))
                        return 1;
        }

        /* Failed VM-Entry: report the raw exit reason in fail_entry. */
        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
                dump_vmcs();
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = exit_reason;
                vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
                return 0;
        }

        /* vmx->fail set: report the VM-instruction error in fail_entry. */
        if (unlikely(vmx->fail)) {
                dump_vmcs();
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = vmcs_read32(VM_INSTRUCTION_ERROR);
                vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
                return 0;
        }

        /*
         * Note:
         * Do not try to fix up EXIT_REASON_EPT_MISCONFIG if it was caused by
         * event delivery, since that indicates the guest is accessing MMIO.
         * The VM-Exit could be triggered again after returning to the guest,
         * which would cause an infinite loop.
         *
         * Any exit that interrupted event delivery, other than the exit
         * reasons listed below, is reported as an internal error.
         */
        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
                        (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
                        exit_reason != EXIT_REASON_EPT_VIOLATION &&
                        exit_reason != EXIT_REASON_PML_FULL &&
                        exit_reason != EXIT_REASON_APIC_ACCESS &&
                        exit_reason != EXIT_REASON_TASK_SWITCH)) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
                vcpu->run->internal.ndata = 3;
                vcpu->run->internal.data[0] = vectoring_info;
                vcpu->run->internal.data[1] = exit_reason;
                vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
                /* For EPT misconfig, also report the guest physical address. */
                if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
                        vcpu->run->internal.ndata++;
                        vcpu->run->internal.data[3] =
                                vmcs_read64(GUEST_PHYSICAL_ADDRESS);
                }
                /* The last data word is always the CPU of the last VM-Entry. */
                vcpu->run->internal.data[vcpu->run->internal.ndata++] =
                        vcpu->arch.last_vmentry_cpu;
                return 0;
        }

        /* Software NMI-blocking bookkeeping, used when vNMI is disabled. */
        if (unlikely(!enable_vnmi &&
                     vmx->loaded_vmcs->soft_vnmi_blocked)) {
                if (!vmx_interrupt_blocked(vcpu)) {
                        vmx->loaded_vmcs->soft_vnmi_blocked = 0;
                } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
                           vcpu->arch.nmi_pending) {
                        /*
                         * This CPU doesn't help us find the end of an
                         * NMI-blocked window if the guest runs with IRQs
                         * disabled. So we pull the trigger after 1 s of
                         * futile waiting, but inform the user about this.
                         */
                        printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
                               "state on VCPU %d after 1 s timeout\n",
                               __func__, vcpu->vcpu_id);
                        vmx->loaded_vmcs->soft_vnmi_blocked = 0;
                }
        }

        /* The fast path already took care of this exit. */
        if (exit_fastpath != EXIT_FASTPATH_NONE)
                return 1;

        if (exit_reason >= kvm_vmx_max_exit_handlers)
                goto unexpected_vmexit;
#ifdef CONFIG_RETPOLINE
        /*
         * With retpolines enabled, call a handful of handlers directly
         * instead of going through the function-pointer table below.
         */
        if (exit_reason == EXIT_REASON_MSR_WRITE)
                return kvm_emulate_wrmsr(vcpu);
        else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
                return handle_preemption_timer(vcpu);
        else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
                return handle_interrupt_window(vcpu);
        else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
                return handle_external_interrupt(vcpu);
        else if (exit_reason == EXIT_REASON_HLT)
                return kvm_emulate_halt(vcpu);
        else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
                return handle_ept_misconfig(vcpu);
#endif

        /* Clamp the index under speculation before using it on the table. */
        exit_reason = array_index_nospec(exit_reason,
                                         kvm_vmx_max_exit_handlers);
        /* In-range exit reasons may still have no handler installed. */
        if (!kvm_vmx_exit_handlers[exit_reason])
                goto unexpected_vmexit;

        return kvm_vmx_exit_handlers[exit_reason](vcpu);

unexpected_vmexit:
        /* No handler: log, dump the VMCS and report an internal error. */
        vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason);
        dump_vmcs();
        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
        vcpu->run->internal.suberror =
                        KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
        vcpu->run->internal.ndata = 2;
        vcpu->run->internal.data[0] = exit_reason;
        vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
        return 0;
}
5944
/*
 * Software based L1D cache flush which is used when microcode providing
 * the cache control MSR is not loaded.
 *
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
 * flush it is required to read in 64 KiB because the replacement algorithm
 * is not exactly LRU. This could be sized at runtime via topology
 * information but as all relevant affected CPUs have 32KiB L1D cache size
 * there is no point in doing so.
 *
 * When the CPU advertises X86_FEATURE_FLUSH_L1D, the flush is instead done
 * with a single write of L1D_FLUSH to MSR_IA32_FLUSH_CMD.
 */
static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
        /* Number of bytes of vmx_l1d_flush_pages walked by the asm below. */
        int size = PAGE_SIZE << L1D_CACHE_ORDER;

        /*
         * This code is only executed when the flush mode is 'cond' or
         * 'always'
         */
        if (static_branch_likely(&vmx_l1d_flush_cond)) {
                bool flush_l1d;

                /*
                 * Clear the per-vcpu flush bit, it gets set again
                 * either from vcpu_run() or from one of the unsafe
                 * VMEXIT handlers.
                 */
                flush_l1d = vcpu->arch.l1tf_flush_l1d;
                vcpu->arch.l1tf_flush_l1d = false;

                /*
                 * Clear the per-cpu flush bit, it gets set again from
                 * the interrupt handlers.
                 */
                flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
                kvm_clear_cpu_l1tf_flush_l1d();

                /* Conditional mode: skip the flush if nobody requested it. */
                if (!flush_l1d)
                        return;
        }

        vcpu->stat.l1d_flush++;

        /* Prefer the MSR-based flush when the CPU supports it. */
        if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
                native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
                return;
        }

        /*
         * Software fallback: first read one byte from every 4096-byte page
         * of the flush buffer, then read one byte every 64 bytes across the
         * same buffer.  EAX is the running byte offset in both loops.
         */
        asm volatile(
                /* First ensure the pages are in the TLB */
                "xorl   %%eax, %%eax\n"
                ".Lpopulate_tlb:\n\t"
                "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
                "addl   $4096, %%eax\n\t"
                "cmpl   %%eax, %[size]\n\t"
                "jne    .Lpopulate_tlb\n\t"
                "xorl   %%eax, %%eax\n\t"
                "cpuid\n\t"
                /* Now fill the cache */
                "xorl   %%eax, %%eax\n"
                ".Lfill_cache:\n"
                "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
                "addl   $64, %%eax\n\t"
                "cmpl   %%eax, %[size]\n\t"
                "jne    .Lfill_cache\n\t"
                "lfence\n"
                :: [flush_pages] "r" (vmx_l1d_flush_pages),
                    [size] "r" (size)
                : "eax", "ebx", "ecx", "edx");
}
6014
6015 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6016 {
6017         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6018         int tpr_threshold;
6019
6020         if (is_guest_mode(vcpu) &&
6021                 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6022                 return;
6023
6024         tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6025         if (is_guest_mode(vcpu))
6026                 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6027         else
6028                 vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6029 }
6030
6031 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6032 {
6033         struct vcpu_vmx *vmx = to_vmx(vcpu);
6034         u32 sec_exec_control;
6035
6036         if (!lapic_in_kernel(vcpu))
6037                 return;
6038
6039         if (!flexpriority_enabled &&
6040             !cpu_has_vmx_virtualize_x2apic_mode())
6041                 return;
6042
6043         /* Postpone execution until vmcs01 is the current VMCS. */
6044         if (is_guest_mode(vcpu)) {
6045                 vmx->nested.change_vmcs01_virtual_apic_mode = true;
6046                 return;
6047         }
6048
6049         sec_exec_control = secondary_exec_controls_get(vmx);
6050         sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6051                               SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6052
6053         switch (kvm_get_apic_mode(vcpu)) {
6054         case LAPIC_MODE_INVALID:
6055                 WARN_ONCE(true, "Invalid local APIC state");
6056         case LAPIC_MODE_DISABLED:
6057                 break;
6058         case LAPIC_MODE_XAPIC:
6059                 if (flexpriority_enabled) {
6060                         sec_exec_control |=
6061                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6062                         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6063
6064                         /*
6065                          * Flush the TLB, reloading the APIC access page will
6066                          * only do so if its physical address has changed, but
6067                          * the guest may have inserted a non-APIC mapping into
6068                          * the TLB while the APIC access page was disabled.
6069                          */
6070                         kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
6071                 }
6072                 break;
6073         case LAPIC_MODE_X2APIC:
6074                 if (cpu_has_vmx_virtualize_x2apic_mode())
6075                         sec_exec_control |=
6076                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6077                 break;
6078         }
6079         secondary_exec_controls_set(vmx, sec_exec_control);
6080
6081         vmx_update_msr_bitmap(vcpu);
6082 }
6083
6084 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
6085 {
6086         struct page *page;
6087
6088         /* Defer reload until vmcs01 is the current VMCS. */
6089         if (is_guest_mode(vcpu)) {
6090                 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
6091                 return;
6092         }
6093
6094         if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
6095             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6096                 return;
6097
6098         page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
6099         if (is_error_page(page))
6100                 return;
6101
6102         vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
6103         vmx_flush_tlb_current(vcpu);
6104
6105         /*
6106          * Do not pin apic access page in memory, the MMU notifier
6107          * will call us again if it is migrated or swapped out.
6108          */
6109         put_page(page);
6110 }
6111
6112 static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
6113 {
6114         u16 status;
6115         u8 old;
6116
6117         if (max_isr == -1)
6118                 max_isr = 0;
6119
6120         status = vmcs_read16(GUEST_INTR_STATUS);
6121         old = status >> 8;
6122         if (max_isr != old) {
6123                 status &= 0xff;
6124                 status |= max_isr << 8;
6125                 vmcs_write16(GUEST_INTR_STATUS, status);
6126         }
6127 }
6128
6129 static void vmx_set_rvi(int vector)
6130 {
6131         u16 status;
6132         u8 old;
6133
6134         if (vector == -1)
6135                 vector = 0;
6136
6137         status = vmcs_read16(GUEST_INTR_STATUS);
6138         old = (u8)status & 0xff;
6139         if ((u8)vector != old) {
6140                 status &= ~0xff;
6141                 status |= (u8)vector;
6142                 vmcs_write16(GUEST_INTR_STATUS, status);
6143         }
6144 }
6145
/* Propagate the highest pending interrupt to RVI, except while L2 runs. */
static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
        /*
         * While L2 is running, RVI only matters if vmcs12 enables
         * virtual-interrupt delivery.  That in turn requires L1 to intercept
         * external interrupts, in which case the interrupt must be
         * intercepted rather than reflected in vmcs02's RVI.  So there is
         * nothing to do in guest mode.
         */
        if (is_guest_mode(vcpu))
                return;

        vmx_set_rvi(max_irr);
}
6159
6160 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6161 {
6162         struct vcpu_vmx *vmx = to_vmx(vcpu);
6163         int max_irr;
6164         bool max_irr_updated;
6165
6166         WARN_ON(!vcpu->arch.apicv_active);
6167         if (pi_test_on(&vmx->pi_desc)) {
6168                 pi_clear_on(&vmx->pi_desc);
6169                 /*
6170                  * IOMMU can write to PID.ON, so the barrier matters even on UP.
6171                  * But on x86 this is just a compiler barrier anyway.
6172                  */
6173                 smp_mb__after_atomic();
6174                 max_irr_updated =
6175                         kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
6176
6177                 /*
6178                  * If we are running L2 and L1 has a new pending interrupt
6179                  * which can be injected, we should re-evaluate
6180                  * what should be done with this new L1 interrupt.
6181                  * If L1 intercepts external-interrupts, we should
6182                  * exit from L2 to L1. Otherwise, interrupt should be
6183                  * delivered directly to L2.
6184                  */
6185                 if (is_guest_mode(vcpu) && max_irr_updated) {
6186                         if (nested_exit_on_intr(vcpu))
6187                                 kvm_vcpu_exiting_guest_mode(vcpu);
6188                         else
6189                                 kvm_make_request(KVM_REQ_EVENT, vcpu);
6190                 }
6191         } else {
6192                 max_irr = kvm_lapic_find_highest_irr(vcpu);
6193         }
6194         vmx_hwapic_irr_update(vcpu, max_irr);
6195         return max_irr;
6196 }
6197
6198 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6199 {
6200         if (!kvm_vcpu_apicv_active(vcpu))
6201                 return;
6202
6203         vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6204         vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6205         vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6206         vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6207 }
6208
6209 static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
6210 {
6211         struct vcpu_vmx *vmx = to_vmx(vcpu);
6212
6213         pi_clear_on(&vmx->pi_desc);
6214         memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
6215 }
6216
/*
 * Invoke the host handler located at @entry.  handle_interrupt_nmi_irqoff()
 * passes the handler address taken from the host IDT.  The definition is not
 * part of this section of the file (presumably an assembly routine -- TODO
 * confirm).
 */
void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
6218
6219 static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
6220 {
6221         unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
6222         gate_desc *desc = (gate_desc *)host_idt_base + vector;
6223
6224         kvm_before_interrupt(vcpu);
6225         vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
6226         kvm_after_interrupt(vcpu);
6227 }
6228
6229 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
6230 {
6231         u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
6232
6233         /* if exit due to PF check for async PF */
6234         if (is_page_fault(intr_info))
6235                 vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
6236         /* Handle machine checks before interrupts are enabled */
6237         else if (is_machine_check(intr_info))
6238                 kvm_machine_check();
6239         /* We need to handle NMIs before interrupts are enabled */
6240         else if (is_nmi(intr_info))
6241                 handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
6242 }
6243
6244 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
6245 {
6246         u32 intr_info = vmx_get_intr_info(vcpu);
6247
6248         if (WARN_ONCE(!is_external_intr(intr_info),
6249             "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
6250                 return;
6251
6252         handle_interrupt_nmi_irqoff(vcpu, intr_info);
6253 }
6254
6255 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
6256 {
6257         struct vcpu_vmx *vmx = to_vmx(vcpu);
6258
6259         if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
6260                 handle_external_interrupt_irqoff(vcpu);
6261         else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
6262                 handle_exception_nmi_irqoff(vmx);
6263 }
6264
6265 static bool vmx_has_emulated_msr(u32 index)
6266 {
6267         switch (index) {
6268         case MSR_IA32_SMBASE:
6269                 /*
6270                  * We cannot do SMM unless we can run the guest in big
6271                  * real mode.
6272                  */
6273                 return enable_unrestricted_guest || emulate_invalid_guest_state;
6274         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
6275                 return nested;
6276         case MSR_AMD64_VIRT_SPEC_CTRL:
6277                 /* This is AMD only.  */
6278                 return false;
6279         default:
6280                 return true;
6281         }
6282 }
6283
6284 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6285 {
6286         u32 exit_intr_info;
6287         bool unblock_nmi;
6288         u8 vector;
6289         bool idtv_info_valid;
6290
6291         idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6292
6293         if (enable_vnmi) {
6294                 if (vmx->loaded_vmcs->nmi_known_unmasked)
6295                         return;
6296
6297                 exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
6298                 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
6299                 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
6300                 /*
6301                  * SDM 3: 27.7.1.2 (September 2008)
6302                  * Re-set bit "block by NMI" before VM entry if vmexit caused by
6303                  * a guest IRET fault.
6304                  * SDM 3: 23.2.2 (September 2008)
6305                  * Bit 12 is undefined in any of the following cases:
6306                  *  If the VM exit sets the valid bit in the IDT-vectoring
6307                  *   information field.
6308                  *  If the VM exit is due to a double fault.
6309                  */
6310                 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
6311                     vector != DF_VECTOR && !idtv_info_valid)
6312                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6313                                       GUEST_INTR_STATE_NMI);
6314                 else
6315                         vmx->loaded_vmcs->nmi_known_unmasked =
6316                                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
6317                                   & GUEST_INTR_STATE_NMI);
6318         } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
6319                 vmx->loaded_vmcs->vnmi_blocked_time +=
6320                         ktime_to_ns(ktime_sub(ktime_get(),
6321                                               vmx->loaded_vmcs->entry_time));
6322 }
6323
6324 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
6325                                       u32 idt_vectoring_info,
6326                                       int instr_len_field,
6327                                       int error_code_field)
6328 {
6329         u8 vector;
6330         int type;
6331         bool idtv_info_valid;
6332
6333         idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6334
6335         vcpu->arch.nmi_injected = false;
6336         kvm_clear_exception_queue(vcpu);
6337         kvm_clear_interrupt_queue(vcpu);
6338
6339         if (!idtv_info_valid)
6340                 return;
6341
6342         kvm_make_request(KVM_REQ_EVENT, vcpu);
6343
6344         vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
6345         type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
6346
6347         switch (type) {
6348         case INTR_TYPE_NMI_INTR:
6349                 vcpu->arch.nmi_injected = true;
6350                 /*
6351                  * SDM 3: 27.7.1.2 (September 2008)
6352                  * Clear bit "block by NMI" before VM entry if a NMI
6353                  * delivery faulted.
6354                  */
6355                 vmx_set_nmi_mask(vcpu, false);
6356                 break;
6357         case INTR_TYPE_SOFT_EXCEPTION:
6358                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6359                 fallthrough;
6360         case INTR_TYPE_HARD_EXCEPTION:
6361                 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
6362                         u32 err = vmcs_read32(error_code_field);
6363                         kvm_requeue_exception_e(vcpu, vector, err);
6364                 } else
6365                         kvm_requeue_exception(vcpu, vector);
6366                 break;
6367         case INTR_TYPE_SOFT_INTR:
6368                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6369                 fallthrough;
6370         case INTR_TYPE_EXT_INTR:
6371                 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
6372                 break;
6373         default:
6374                 break;
6375         }
6376 }
6377
6378 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
6379 {
6380         __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
6381                                   VM_EXIT_INSTRUCTION_LEN,
6382                                   IDT_VECTORING_ERROR_CODE);
6383 }
6384
6385 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
6386 {
6387         __vmx_complete_interrupts(vcpu,
6388                                   vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6389                                   VM_ENTRY_INSTRUCTION_LEN,
6390                                   VM_ENTRY_EXCEPTION_ERROR_CODE);
6391
6392         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
6393 }
6394
6395 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
6396 {
6397         int i, nr_msrs;
6398         struct perf_guest_switch_msr *msrs;
6399
6400         msrs = perf_guest_get_msrs(&nr_msrs);
6401
6402         if (!msrs)
6403                 return;
6404
6405         for (i = 0; i < nr_msrs; i++)
6406                 if (msrs[i].host == msrs[i].guest)
6407                         clear_atomic_switch_msr(vmx, msrs[i].msr);
6408                 else
6409                         add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
6410                                         msrs[i].host, false);
6411 }
6412
6413 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
6414 {
6415         struct vcpu_vmx *vmx = to_vmx(vcpu);
6416         u64 tscl;
6417         u32 delta_tsc;
6418
6419         if (vmx->req_immediate_exit) {
6420                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
6421                 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
6422         } else if (vmx->hv_deadline_tsc != -1) {
6423                 tscl = rdtsc();
6424                 if (vmx->hv_deadline_tsc > tscl)
6425                         /* set_hv_timer ensures the delta fits in 32-bits */
6426                         delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
6427                                 cpu_preemption_timer_multi);
6428                 else
6429                         delta_tsc = 0;
6430
6431                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
6432                 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
6433         } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
6434                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
6435                 vmx->loaded_vmcs->hv_timer_soft_disabled = true;
6436         }
6437 }
6438
6439 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
6440 {
6441         if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
6442                 vmx->loaded_vmcs->host_state.rsp = host_rsp;
6443                 vmcs_writel(HOST_RSP, host_rsp);
6444         }
6445 }
6446
6447 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
6448 {
6449         switch (to_vmx(vcpu)->exit_reason) {
6450         case EXIT_REASON_MSR_WRITE:
6451                 return handle_fastpath_set_msr_irqoff(vcpu);
6452         case EXIT_REASON_PREEMPTION_TIMER:
6453                 return handle_fastpath_preemption_timer(vcpu);
6454         default:
6455                 return EXIT_FASTPATH_NONE;
6456         }
6457 }
6458
/*
 * Low-level VM-entry/VM-exit routine, defined outside this section.  Its
 * return value is stored in vmx->fail by vmx_vcpu_enter_exit().
 */
bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);

/*
 * Enter the guest and come back out again, together with the tracing,
 * lockdep and context-tracking transitions that must bracket the switch.
 *
 * In order, this function: announces the "interrupts on" state to tracing
 * and lockdep, calls guest_enter_irqoff(), applies the L1D flush or MDS
 * buffer clear when the corresponding static branch is enabled, loads the
 * guest's CR2 if it differs from the hardware value, runs __vmx_vcpu_run(),
 * saves CR2 back into vcpu->arch.cr2, and finally reverses the lockdep,
 * context-tracking and tracing transitions.
 *
 * The statement order is significant; see the comments inside the body.
 */
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
                                        struct vcpu_vmx *vmx)
{
        /*
         * VMENTER enables interrupts (host state), but the kernel state is
         * interrupts disabled when this is invoked. Also tell RCU about
         * it. This is the same logic as for exit_to_user_mode().
         *
         * This ensures that e.g. latency analysis on the host observes
         * guest mode as interrupt enabled.
         *
         * guest_enter_irqoff() informs context tracking about the
         * transition to guest mode and if enabled adjusts RCU state
         * accordingly.
         */
        instrumentation_begin();
        trace_hardirqs_on_prepare();
        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        instrumentation_end();

        guest_enter_irqoff();
        lockdep_hardirqs_on(CALLER_ADDR0);

        /* L1D Flush includes CPU buffer clear to mitigate MDS */
        if (static_branch_unlikely(&vmx_l1d_should_flush))
                vmx_l1d_flush(vcpu);
        else if (static_branch_unlikely(&mds_user_clear))
                mds_clear_cpu_buffers();

        /* Only write CR2 when the guest's value differs from the current one. */
        if (vcpu->arch.cr2 != native_read_cr2())
                native_write_cr2(vcpu->arch.cr2);

        vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
                                   vmx->loaded_vmcs->launched);

        /* Capture CR2 as it stands right after the run returns. */
        vcpu->arch.cr2 = native_read_cr2();

        /*
         * VMEXIT disables interrupts (host state), but tracing and lockdep
         * have them in state 'on' as recorded before entering guest mode.
         * Same as enter_from_user_mode().
         *
         * guest_exit_irqoff() restores host context and reinstates RCU if
         * enabled and required.
         *
         * This needs to be done before the below as native_read_msr()
         * contains a tracepoint and x86_spec_ctrl_restore_host() calls
         * into world and some more.
         */
        lockdep_hardirqs_off(CALLER_ADDR0);
        guest_exit_irqoff();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        instrumentation_end();
}
6517
/*
 * Run the vCPU once: prepare the VMCS and host/guest state, enter the guest
 * through vmx_vcpu_enter_exit(), then restore host state and decode the exit.
 *
 * Returns EXIT_FASTPATH_NONE when
 *  - emulation is required (the guest is not entered at all),
 *  - __vmx_vcpu_run() reported failure (exit_reason is set to 0xdead),
 *  - the exit reason carries VMX_EXIT_REASONS_FAILED_VMENTRY, or
 *  - the vCPU is in guest mode after the exit.
 * Otherwise returns the verdict of vmx_exit_handlers_fastpath().  If that
 * verdict is EXIT_FASTPATH_REENTER_GUEST, the guest is re-entered directly
 * unless kvm_vcpu_exit_request() is true, in which case
 * EXIT_FASTPATH_EXIT_HANDLED is returned instead.
 */
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
        fastpath_t exit_fastpath;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long cr3, cr4;

reenter_guest:
        /* Record the guest's net vcpu time for enforced NMI injections. */
        if (unlikely(!enable_vnmi &&
                     vmx->loaded_vmcs->soft_vnmi_blocked))
                vmx->loaded_vmcs->entry_time = ktime_get();

        /* Don't enter VMX if guest state is invalid, let the exit handler
           start emulation until we arrive back to a valid state */
        if (vmx->emulation_required)
                return EXIT_FASTPATH_NONE;

        /* Flush a pending PLE window update into the VMCS. */
        if (vmx->ple_window_dirty) {
                vmx->ple_window_dirty = false;
                vmcs_write32(PLE_WINDOW, vmx->ple_window);
        }

        /*
         * We did this in prepare_switch_to_guest, because it needs to
         * be within srcu_read_lock.
         */
        WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);

        /* Write back RSP/RIP only if the cached register copy is dirty. */
        if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
                vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
        if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

        /*
         * Refresh HOST_CR3 and HOST_CR4 only when the current values differ
         * from the copies cached in the loaded VMCS's host state.
         */
        cr3 = __get_current_cr3_fast();
        if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
                vmcs_writel(HOST_CR3, cr3);
                vmx->loaded_vmcs->host_state.cr3 = cr3;
        }

        cr4 = cr4_read_shadow();
        if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
                vmcs_writel(HOST_CR4, cr4);
                vmx->loaded_vmcs->host_state.cr4 = cr4;
        }

        /* When single-stepping over STI and MOV SS, we must clear the
         * corresponding interruptibility bits in the guest state. Otherwise
         * vmentry fails as it then expects bit 14 (BS) in pending debug
         * exceptions being set, but that's not correct for the guest debugging
         * case. */
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                vmx_set_interrupt_shadow(vcpu, 0);

        kvm_load_guest_xsave_state(vcpu);

        pt_guest_enter(vmx);

        atomic_switch_perf_msrs(vmx);

        if (enable_preemption_timer)
                vmx_update_hv_timer(vcpu);

        kvm_wait_lapic_expire(vcpu);

        /*
         * If this vCPU has touched SPEC_CTRL, restore the guest's value if
         * it's non-zero. Since vmentry is serialising on affected CPUs, there
         * is no need to worry about the conditional branch over the wrmsr
         * being speculatively taken.
         */
        x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);

        /* The actual VMENTER/EXIT is in the .noinstr.text section. */
        vmx_vcpu_enter_exit(vcpu, vmx);

        /*
         * We do not use IBRS in the kernel. If this vCPU has used the
         * SPEC_CTRL MSR it may have left it on; save the value and
         * turn it off. This is much more efficient than blindly adding
         * it to the atomic save/restore list. Especially as the former
         * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
         *
         * For non-nested case:
         * If the L01 MSR bitmap does not intercept the MSR, then we need to
         * save it.
         *
         * For nested case:
         * If the L02 MSR bitmap does not intercept the MSR, then we need to
         * save it.
         */
        if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
                vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);

        x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);

        /* All fields are clean at this point */
        if (static_branch_unlikely(&enable_evmcs))
                current_evmcs->hv_clean_fields |=
                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

        /* Also publish this vCPU's Hyper-V VP index in the enlightened VMCS. */
        if (static_branch_unlikely(&enable_evmcs))
                current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;

        /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
        if (vmx->host_debugctlmsr)
                update_debugctlmsr(vmx->host_debugctlmsr);

#ifndef CONFIG_X86_64
        /*
         * The sysexit path does not restore ds/es, so we must set them to
         * a reasonable value ourselves.
         *
         * We can't defer this to vmx_prepare_switch_to_host() since that
         * function may be executed in interrupt context, which saves and
         * restore segments around it, nullifying its effect.
         */
        loadsegment(ds, __USER_DS);
        loadsegment(es, __USER_DS);
#endif

        vmx_register_cache_reset(vcpu);

        pt_guest_exit(vmx);

        kvm_load_host_xsave_state(vcpu);

        /*
         * Reset per-run state before decoding the exit; idt_vectoring_info
         * is re-read from the VMCS further down on a successful exit.
         */
        vmx->nested.nested_run_pending = 0;
        vmx->idt_vectoring_info = 0;

        /*
         * __vmx_vcpu_run() reported failure: record a sentinel exit reason
         * and skip all further exit processing.
         */
        if (unlikely(vmx->fail)) {
                vmx->exit_reason = 0xdead;
                return EXIT_FASTPATH_NONE;
        }

        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
        /* Only the low 16 bits are compared against the basic exit reason. */
        if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY))
                kvm_machine_check();

        trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);

        /* A failed VM-entry skips the launched/vectoring bookkeeping below. */
        if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
                return EXIT_FASTPATH_NONE;

        vmx->loaded_vmcs->launched = 1;
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

        vmx_recover_nmi_blocking(vmx);
        vmx_complete_interrupts(vmx);

        /* No fastpath handling while the vCPU is in guest mode. */
        if (is_guest_mode(vcpu))
                return EXIT_FASTPATH_NONE;

        exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
        if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) {
                if (!kvm_vcpu_exit_request(vcpu)) {
                        /*
                         * FIXME: this goto should be a loop in vcpu_enter_guest,
                         * but it would incur the cost of a retpoline for now.
                         * Revisit once static calls are available.
                         */
                        if (vcpu->arch.apicv_active)
                                vmx_sync_pir_to_irr(vcpu);
                        goto reenter_guest;
                }
                /* An exit request is pending: report the exit as handled. */
                exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
        }

        return exit_fastpath;
}
6687
6688 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
6689 {
6690         struct vcpu_vmx *vmx = to_vmx(vcpu);
6691
6692         if (enable_pml)
6693                 vmx_destroy_pml_buffer(vmx);
6694         free_vpid(vmx->vpid);
6695         nested_vmx_free_vcpu(vcpu);
6696         free_loaded_vmcs(vmx->loaded_vmcs);
6697 }
6698
/*
 * Set up the VMX-specific part of a new vCPU.
 *
 * Allocates a VPID, the PML page (when PML is enabled) and vmcs01, builds
 * the list of user-return MSRs, opens up a fixed set of MSRs in vmcs01's
 * MSR bitmap, initialises the VMCS and fills in nested and posted-interrupt
 * defaults.
 *
 * Returns 0 on success.  On failure returns -ENOMEM (PML page allocation)
 * or the error from the failing helper, after releasing what had been
 * acquired up to that point.
 */
static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx;
        unsigned long *msr_bitmap;
        int i, cpu, err;

        /* to_vmx() relies on the kvm_vcpu being the first member of vcpu_vmx. */
        BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
        vmx = to_vmx(vcpu);

        /* Error code used if the PML page allocation below fails. */
        err = -ENOMEM;

        vmx->vpid = allocate_vpid();

        /*
         * If PML is turned on, failure on enabling PML just results in failure
         * of creating the vcpu, therefore we can simplify PML logic (by
         * avoiding dealing with cases, such as enabling PML partially on vcpus
         * for the guest), etc.
         */
        if (enable_pml) {
                vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
                if (!vmx->pml_pg)
                        goto free_vpid;
        }

        BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != MAX_NR_USER_RETURN_MSRS);

        /*
         * Probe every candidate MSR: only those that can be read and then
         * written back with the same value get a guest_uret_msrs[] entry.
         */
        for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
                u32 index = vmx_msr_index[i];
                u32 data_low, data_high;
                int j = vmx->nr_uret_msrs;

                if (rdmsr_safe(index, &data_low, &data_high) < 0)
                        continue;
                if (wrmsr_safe(index, data_low, data_high) < 0)
                        continue;

                /* .index stores the position in vmx_msr_index[], not the MSR number. */
                vmx->guest_uret_msrs[j].index = i;
                vmx->guest_uret_msrs[j].data = 0;
                switch (index) {
                case MSR_IA32_TSX_CTRL:
                        /*
                         * No need to pass TSX_CTRL_CPUID_CLEAR through, so
                         * let's avoid changing CPUID bits under the host
                         * kernel's feet.
                         */
                        vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
                        break;
                default:
                        vmx->guest_uret_msrs[j].mask = -1ull;
                        break;
                }
                ++vmx->nr_uret_msrs;
        }

        err = alloc_loaded_vmcs(&vmx->vmcs01);
        if (err < 0)
                goto free_pml;

        /* Stop intercepting the MSRs below in vmcs01's MSR bitmap. */
        msr_bitmap = vmx->vmcs01.msr_bitmap;
        vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
        vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
        /* The C-state residency MSRs are opened for reads only when C-states are exposed to the guest. */
        if (kvm_cstate_in_guest(vcpu->kvm)) {
                vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
                vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
                vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
                vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
        }
        vmx->msr_bitmap_mode = 0;

        /*
         * Initialise the VMCS with the vCPU loaded on the current CPU;
         * get_cpu()/put_cpu() bracket the load/init/put sequence.
         */
        vmx->loaded_vmcs = &vmx->vmcs01;
        cpu = get_cpu();
        vmx_vcpu_load(vcpu, cpu);
        vcpu->cpu = cpu;
        init_vmcs(vmx);
        vmx_vcpu_put(vcpu);
        put_cpu();
        if (cpu_need_virtualize_apic_accesses(vcpu)) {
                err = alloc_apic_access_page(vcpu->kvm);
                if (err)
                        goto free_vmcs;
        }

        if (enable_ept && !enable_unrestricted_guest) {
                err = init_rmode_identity_map(vcpu->kvm);
                if (err)
                        goto free_vmcs;
        }

        /* Nested VMX MSRs start from the global config, or all-zero if nested is off. */
        if (nested)
                memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
        else
                memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));

        vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;

        vcpu->arch.microcode_version = 0x100000000ULL;
        vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;

        /*
         * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
         * or POSTED_INTR_WAKEUP_VECTOR.
         */
        vmx->pi_desc.nv = POSTED_INTR_VECTOR;
        vmx->pi_desc.sn = 1;

        vmx->ept_pointer = INVALID_PAGE;

        return 0;

        /* Error unwinding: each label releases one resource and falls through. */
free_vmcs:
        free_loaded_vmcs(vmx->loaded_vmcs);
free_pml:
        vmx_destroy_pml_buffer(vmx);
free_vpid:
        free_vpid(vmx->vpid);
        return err;
}
6823
6824 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
6825 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
6826
6827 static int vmx_vm_init(struct kvm *kvm)
6828 {
6829         spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
6830
6831         if (!ple_gap)
6832                 kvm->arch.pause_in_guest = true;
6833
6834         if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
6835                 switch (l1tf_mitigation) {
6836                 case L1TF_MITIGATION_OFF:
6837                 case L1TF_MITIGATION_FLUSH_NOWARN:
6838                         /* 'I explicitly don't care' is set */
6839                         break;
6840                 case L1TF_MITIGATION_FLUSH:
6841                 case L1TF_MITIGATION_FLUSH_NOSMT:
6842                 case L1TF_MITIGATION_FULL:
6843                         /*
6844                          * Warn upon starting the first VM in a potentially
6845                          * insecure environment.
6846                          */
6847                         if (sched_smt_active())
6848                                 pr_warn_once(L1TF_MSG_SMT);
6849                         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
6850                                 pr_warn_once(L1TF_MSG_L1D);
6851                         break;
6852                 case L1TF_MITIGATION_FULL_FORCE:
6853                         /* Flush is enforced */
6854                         break;
6855                 }
6856         }
6857         kvm_apicv_init(kvm, enable_apicv);
6858         return 0;
6859 }
6860
6861 static int __init vmx_check_processor_compat(void)
6862 {
6863         struct vmcs_config vmcs_conf;
6864         struct vmx_capability vmx_cap;
6865
6866         if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
6867             !this_cpu_has(X86_FEATURE_VMX)) {
6868                 pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
6869                 return -EIO;
6870         }
6871
6872         if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
6873                 return -EIO;
6874         if (nested)
6875                 nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
6876         if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
6877                 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
6878                                 smp_processor_id());
6879                 return -EIO;
6880         }
6881         return 0;
6882 }
6883
6884 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
6885 {
6886         u8 cache;
6887         u64 ipat = 0;
6888
6889         /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
6890          * memory aliases with conflicting memory types and sometimes MCEs.
6891          * We have to be careful as to what are honored and when.
6892          *
6893          * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
6894          * UC.  The effective memory type is UC or WC depending on guest PAT.
6895          * This was historically the source of MCEs and we want to be
6896          * conservative.
6897          *
6898          * When there is no need to deal with noncoherent DMA (e.g., no VT-d
6899          * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
6900          * EPT memory type is set to WB.  The effective memory type is forced
6901          * WB.
6902          *
6903          * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
6904          * EPT memory type is used to emulate guest CD/MTRR.
6905          */
6906
6907         if (is_mmio) {
6908                 cache = MTRR_TYPE_UNCACHABLE;
6909                 goto exit;
6910         }
6911
6912         if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
6913                 ipat = VMX_EPT_IPAT_BIT;
6914                 cache = MTRR_TYPE_WRBACK;
6915                 goto exit;
6916         }
6917
6918         if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
6919                 ipat = VMX_EPT_IPAT_BIT;
6920                 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
6921                         cache = MTRR_TYPE_WRBACK;
6922                 else
6923                         cache = MTRR_TYPE_UNCACHABLE;
6924                 goto exit;
6925         }
6926
6927         cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
6928
6929 exit:
6930         return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
6931 }
6932
6933 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
6934 {
6935         /*
6936          * These bits in the secondary execution controls field
6937          * are dynamic, the others are mostly based on the hypervisor
6938          * architecture and the guest's CPUID.  Do not touch the
6939          * dynamic bits.
6940          */
6941         u32 mask =
6942                 SECONDARY_EXEC_SHADOW_VMCS |
6943                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
6944                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6945                 SECONDARY_EXEC_DESC;
6946
6947         u32 new_ctl = vmx->secondary_exec_control;
6948         u32 cur_ctl = secondary_exec_controls_get(vmx);
6949
6950         secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
6951 }
6952
6953 /*
6954  * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
6955  * (indicating "allowed-1") if they are supported in the guest's CPUID.
6956  */
6957 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
6958 {
6959         struct vcpu_vmx *vmx = to_vmx(vcpu);
6960         struct kvm_cpuid_entry2 *entry;
6961
6962         vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
6963         vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
6964
6965 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {            \
6966         if (entry && (entry->_reg & (_cpuid_mask)))                     \
6967                 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);     \
6968 } while (0)
6969
6970         entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
6971         cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
6972         cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
6973         cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
6974         cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
6975         cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
6976         cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
6977         cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
6978         cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
6979         cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
6980         cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
6981         cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
6982         cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
6983         cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
6984         cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));
6985
6986         entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
6987         cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
6988         cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
6989         cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
6990         cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
6991         cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
6992         cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
6993
6994 #undef cr4_fixed1_update
6995 }
6996
6997 static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
6998 {
6999         struct vcpu_vmx *vmx = to_vmx(vcpu);
7000
7001         if (kvm_mpx_supported()) {
7002                 bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
7003
7004                 if (mpx_enabled) {
7005                         vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
7006                         vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
7007                 } else {
7008                         vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
7009                         vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
7010                 }
7011         }
7012 }
7013
7014 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7015 {
7016         struct vcpu_vmx *vmx = to_vmx(vcpu);
7017         struct kvm_cpuid_entry2 *best = NULL;
7018         int i;
7019
7020         for (i = 0; i < PT_CPUID_LEAVES; i++) {
7021                 best = kvm_find_cpuid_entry(vcpu, 0x14, i);
7022                 if (!best)
7023                         return;
7024                 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7025                 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7026                 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7027                 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7028         }
7029
7030         /* Get the number of configurable Address Ranges for filtering */
7031         vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
7032                                                 PT_CAP_num_address_ranges);
7033
7034         /* Initialize and clear the no dependency bits */
7035         vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7036                         RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
7037
7038         /*
7039          * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
7040          * will inject an #GP
7041          */
7042         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7043                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7044
7045         /*
7046          * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7047          * PSBFreq can be set
7048          */
7049         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7050                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7051                                 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7052
7053         /*
7054          * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
7055          * MTCFreq can be set
7056          */
7057         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7058                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7059                                 RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
7060
7061         /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7062         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7063                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7064                                                         RTIT_CTL_PTW_EN);
7065
7066         /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7067         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7068                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7069
7070         /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7071         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7072                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7073
7074         /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */
7075         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7076                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7077
7078         /* unmask address range configure area */
7079         for (i = 0; i < vmx->pt_desc.addr_range; i++)
7080                 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
7081 }
7082
7083 static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
7084 {
7085         struct vcpu_vmx *vmx = to_vmx(vcpu);
7086
7087         /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
7088         vcpu->arch.xsaves_enabled = false;
7089
7090         if (cpu_has_secondary_exec_ctrls()) {
7091                 vmx_compute_secondary_exec_control(vmx);
7092                 vmcs_set_secondary_exec_control(vmx);
7093         }
7094
7095         if (nested_vmx_allowed(vcpu))
7096                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
7097                         FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7098                         FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
7099         else
7100                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
7101                         ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7102                           FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
7103
7104         if (nested_vmx_allowed(vcpu)) {
7105                 nested_vmx_cr_fixed1_bits_update(vcpu);
7106                 nested_vmx_entry_exit_ctls_update(vcpu);
7107         }
7108
7109         if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7110                         guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7111                 update_intel_pt_cfg(vcpu);
7112
7113         if (boot_cpu_has(X86_FEATURE_RTM)) {
7114                 struct vmx_uret_msr *msr;
7115                 msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
7116                 if (msr) {
7117                         bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7118                         vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
7119                 }
7120         }
7121 }
7122
/*
 * Adjust KVM's CPU capability set for VMX.  The common kvm_set_cpu_caps()
 * runs first; individual features are then set or cleared depending on the
 * "nested" module parameter and on VMX capability checks.
 */
static __init void vmx_set_cpu_caps(void)
{
        kvm_set_cpu_caps();

        /* CPUID 0x1 */
        if (nested)
                kvm_cpu_cap_set(X86_FEATURE_VMX);

        /* CPUID 0x7 */
        if (kvm_mpx_supported())
                kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
        if (cpu_has_vmx_invpcid())
                kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
        if (vmx_pt_mode_is_host_guest())
                kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);

        /* UMIP is set unconditionally (no "check") when it can be emulated. */
        if (vmx_umip_emulated())
                kvm_cpu_cap_set(X86_FEATURE_UMIP);

        /* CPUID 0xD.1 */
        supported_xss = 0;
        if (!cpu_has_vmx_xsaves())
                kvm_cpu_cap_clear(X86_FEATURE_XSAVES);

        /* CPUID 0x80000001 */
        if (!cpu_has_vmx_rdtscp())
                kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);

        if (cpu_has_vmx_waitpkg())
                kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}
7154
7155 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
7156 {
7157         to_vmx(vcpu)->req_immediate_exit = true;
7158 }
7159
7160 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
7161                                   struct x86_instruction_info *info)
7162 {
7163         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7164         unsigned short port;
7165         bool intercept;
7166         int size;
7167
7168         if (info->intercept == x86_intercept_in ||
7169             info->intercept == x86_intercept_ins) {
7170                 port = info->src_val;
7171                 size = info->dst_bytes;
7172         } else {
7173                 port = info->dst_val;
7174                 size = info->src_bytes;
7175         }
7176
7177         /*
7178          * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
7179          * VM-exits depend on the 'unconditional IO exiting' VM-execution
7180          * control.
7181          *
7182          * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
7183          */
7184         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7185                 intercept = nested_cpu_has(vmcs12,
7186                                            CPU_BASED_UNCOND_IO_EXITING);
7187         else
7188                 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
7189
7190         /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
7191         return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
7192 }
7193
7194 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7195                                struct x86_instruction_info *info,
7196                                enum x86_intercept_stage stage,
7197                                struct x86_exception *exception)
7198 {
7199         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7200
7201         switch (info->intercept) {
7202         /*
7203          * RDPID causes #UD if disabled through secondary execution controls.
7204          * Because it is marked as EmulateOnUD, we need to intercept it here.
7205          */
7206         case x86_intercept_rdtscp:
7207                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
7208                         exception->vector = UD_VECTOR;
7209                         exception->error_code_valid = false;
7210                         return X86EMUL_PROPAGATE_FAULT;
7211                 }
7212                 break;
7213
7214         case x86_intercept_in:
7215         case x86_intercept_ins:
7216         case x86_intercept_out:
7217         case x86_intercept_outs:
7218                 return vmx_check_intercept_io(vcpu, info);
7219
7220         case x86_intercept_lgdt:
7221         case x86_intercept_lidt:
7222         case x86_intercept_lldt:
7223         case x86_intercept_ltr:
7224         case x86_intercept_sgdt:
7225         case x86_intercept_sidt:
7226         case x86_intercept_sldt:
7227         case x86_intercept_str:
7228                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
7229                         return X86EMUL_CONTINUE;
7230
7231                 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
7232                 break;
7233
7234         /* TODO: check more intercepts... */
7235         default:
7236                 break;
7237         }
7238
7239         return X86EMUL_UNHANDLEABLE;
7240 }
7241
7242 #ifdef CONFIG_X86_64
7243 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */
7244 static inline int u64_shl_div_u64(u64 a, unsigned int shift,
7245                                   u64 divisor, u64 *result)
7246 {
7247         u64 low = a << shift, high = a >> (64 - shift);
7248
7249         /* To avoid the overflow on divq */
7250         if (high >= divisor)
7251                 return 1;
7252
7253         /* Low hold the result, high hold rem which is discarded */
7254         asm("divq %2\n\t" : "=a" (low), "=d" (high) :
7255             "rm" (divisor), "0" (low), "1" (high));
7256         *result = low;
7257
7258         return 0;
7259 }
7260
/*
 * Compute the host TSC deadline for a guest deadline and store it in
 * vmx->hv_deadline_tsc.
 *
 * @vcpu:               the vCPU whose timer is being armed
 * @guest_deadline_tsc: deadline expressed in guest (L1) TSC units
 * @expired:            set to true when the adjusted delta is zero
 *
 * Returns 0 on success, or -ERANGE when the delta cannot be converted to
 * host cycles or does not fit the range checked below; in that case
 * neither hv_deadline_tsc nor *expired is written.
 */
static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
                            bool *expired)
{
        struct vcpu_vmx *vmx;
        u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
        struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;

        vmx = to_vmx(vcpu);
        tscl = rdtsc();
        guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
        /* Clamp to zero when the deadline is already in the past. */
        delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
        lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
                                                    ktimer->timer_advance_ns);

        /* Fire earlier by the timer-advance amount, without going negative. */
        if (delta_tsc > lapic_timer_advance_cycles)
                delta_tsc -= lapic_timer_advance_cycles;
        else
                delta_tsc = 0;

        /* Convert to host delta tsc if tsc scaling is enabled */
        if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
            delta_tsc && u64_shl_div_u64(delta_tsc,
                                kvm_tsc_scaling_ratio_frac_bits,
                                vcpu->arch.tsc_scaling_ratio, &delta_tsc))
                return -ERANGE;

        /*
         * If the delta tsc can't fit in the 32 bit after the multi shift,
         * we can't use the preemption timer.
         * It's possible that it fits on later vmentries, but checking
         * on every vmentry is costly so we just use an hrtimer.
         */
        if (delta_tsc >> (cpu_preemption_timer_multi + 32))
                return -ERANGE;

        /* The deadline is relative to the host TSC value sampled above. */
        vmx->hv_deadline_tsc = tscl + delta_tsc;
        *expired = !delta_tsc;
        return 0;
}
7300
7301 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
7302 {
7303         to_vmx(vcpu)->hv_deadline_tsc = -1;
7304 }
7305 #endif
7306
7307 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
7308 {
7309         if (!kvm_pause_in_guest(vcpu->kvm))
7310                 shrink_ple_window(vcpu);
7311 }
7312
7313 static void vmx_slot_enable_log_dirty(struct kvm *kvm,
7314                                      struct kvm_memory_slot *slot)
7315 {
7316         if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
7317                 kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
7318         kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
7319 }
7320
/* Disable dirty logging for a memory slot by delegating to kvm_mmu_slot_set_dirty(). */
static void vmx_slot_disable_log_dirty(struct kvm *kvm,
                                       struct kvm_memory_slot *slot)
{
        kvm_mmu_slot_set_dirty(kvm, slot);
}
7326
/* Flush the VM's PML buffers via kvm_flush_pml_buffers(). */
static void vmx_flush_log_dirty(struct kvm *kvm)
{
        kvm_flush_pml_buffers(kvm);
}
7331
/*
 * Re-arm dirty logging for the pages selected by @mask, relative to
 * @offset within @memslot, by delegating to kvm_mmu_clear_dirty_pt_masked().
 */
static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
                                           struct kvm_memory_slot *memslot,
                                           gfn_t offset, unsigned long mask)
{
        kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}
7338
/*
 * Called before the vCPU blocks.  Returns 1 when pi_pre_block() returns
 * nonzero; otherwise moves an in-use hv timer over to the software timer
 * and returns 0.
 */
static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
        if (!pi_pre_block(vcpu)) {
                if (kvm_lapic_hv_timer_in_use(vcpu))
                        kvm_lapic_switch_to_sw_timer(vcpu);

                return 0;
        }

        return 1;
}
7349
/*
 * Called after the vCPU stops blocking: switch the local APIC timer back
 * to the hv timer when that callback is available, then run the
 * posted-interrupt post-block handling.
 */
static void vmx_post_block(struct kvm_vcpu *vcpu)
{
        /* set_hv_timer is NULL when the hv timer is not in use by KVM. */
        if (kvm_x86_ops.set_hv_timer)
                kvm_lapic_switch_to_hv_timer(vcpu);

        pi_post_block(vcpu);
}
7357
7358 static void vmx_setup_mce(struct kvm_vcpu *vcpu)
7359 {
7360         if (vcpu->arch.mcg_cap & MCG_LMCE_P)
7361                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
7362                         FEAT_CTL_LMCE_ENABLED;
7363         else
7364                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
7365                         ~FEAT_CTL_LMCE_ENABLED;
7366 }
7367
7368 static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
7369 {
7370         /* we need a nested vmexit to enter SMM, postpone if run is pending */
7371         if (to_vmx(vcpu)->nested.nested_run_pending)
7372                 return -EBUSY;
7373         return !is_smm(vcpu);
7374 }
7375
/*
 * Prepare for SMM entry: remember whether the vCPU was in guest mode and
 * whether VMXON was active, leave guest mode if needed, and clear VMXON.
 * The saved values are consumed by vmx_pre_leave_smm().  Always returns 0.
 */
static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /* Record guest mode before the nested vmexit below changes it. */
        vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
        if (vmx->nested.smm.guest_mode)
                nested_vmx_vmexit(vcpu, -1, 0, 0);

        vmx->nested.smm.vmxon = vmx->nested.vmxon;
        vmx->nested.vmxon = false;
        vmx_clear_hlt(vcpu);
        return 0;
}
7389
7390 static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
7391 {
7392         struct vcpu_vmx *vmx = to_vmx(vcpu);
7393         int ret;
7394
7395         if (vmx->nested.smm.vmxon) {
7396                 vmx->nested.vmxon = true;
7397                 vmx->nested.smm.vmxon = false;
7398         }
7399
7400         if (vmx->nested.smm.guest_mode) {
7401                 ret = nested_vmx_enter_non_root_mode(vcpu, false);
7402                 if (ret)
7403                         return ret;
7404
7405                 vmx->nested.smm.guest_mode = false;
7406         }
7407         return 0;
7408 }
7409
/* Intentionally empty: nothing needs to be armed to open an SMI window. */
static void enable_smi_window(struct kvm_vcpu *vcpu)
{
        /* RSM will cause a vmexit anyway.  */
}
7414
/* INIT signals are reported as blocked while the vCPU has VMXON active. */
static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
        return to_vmx(vcpu)->nested.vmxon;
}
7419
7420 static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
7421 {
7422         if (is_guest_mode(vcpu)) {
7423                 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
7424
7425                 if (hrtimer_try_to_cancel(timer) == 1)
7426                         hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
7427         }
7428 }
7429
/*
 * Tear down what hardware_setup() established: the nested VMX state (only
 * when the "nested" module parameter is set) and the KVM area allocated by
 * alloc_kvm_area().
 */
static void hardware_unsetup(void)
{
        if (nested)
                nested_vmx_hardware_unsetup();

        free_kvm_area();
}
7437
7438 static bool vmx_check_apicv_inhibit_reasons(ulong bit)
7439 {
7440         ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
7441                           BIT(APICV_INHIBIT_REASON_HYPERV);
7442
7443         return supported & BIT(bit);
7444 }
7445
/*
 * VMX implementation of the kvm_x86_ops callback table.
 *
 * The table is __initdata and is handed to KVM through
 * vmx_init_ops.runtime_ops.  hardware_setup() clears or replaces some of
 * the callbacks below when the corresponding hardware feature is missing
 * or disabled.
 */
static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .hardware_unsetup = hardware_unsetup,

        .hardware_enable = hardware_enable,
        .hardware_disable = hardware_disable,
        .cpu_has_accelerated_tpr = report_flexpriority,
        .has_emulated_msr = vmx_has_emulated_msr,

        .vm_size = sizeof(struct kvm_vmx),
        .vm_init = vmx_vm_init,

        .vcpu_create = vmx_create_vcpu,
        .vcpu_free = vmx_free_vcpu,
        .vcpu_reset = vmx_vcpu_reset,

        .prepare_guest_switch = vmx_prepare_switch_to_guest,
        .vcpu_load = vmx_vcpu_load,
        .vcpu_put = vmx_vcpu_put,

        .update_exception_bitmap = update_exception_bitmap,
        .get_msr_feature = vmx_get_msr_feature,
        .get_msr = vmx_get_msr,
        .set_msr = vmx_set_msr,
        .get_segment_base = vmx_get_segment_base,
        .get_segment = vmx_get_segment,
        .set_segment = vmx_set_segment,
        .get_cpl = vmx_get_cpl,
        .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
        .set_cr0 = vmx_set_cr0,
        .set_cr4 = vmx_set_cr4,
        .set_efer = vmx_set_efer,
        .get_idt = vmx_get_idt,
        .set_idt = vmx_set_idt,
        .get_gdt = vmx_get_gdt,
        .set_gdt = vmx_set_gdt,
        .set_dr7 = vmx_set_dr7,
        .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
        .cache_reg = vmx_cache_reg,
        .get_rflags = vmx_get_rflags,
        .set_rflags = vmx_set_rflags,

        .tlb_flush_all = vmx_flush_tlb_all,
        .tlb_flush_current = vmx_flush_tlb_current,
        .tlb_flush_gva = vmx_flush_tlb_gva,
        .tlb_flush_guest = vmx_flush_tlb_guest,

        .run = vmx_vcpu_run,
        .handle_exit = vmx_handle_exit,
        .skip_emulated_instruction = vmx_skip_emulated_instruction,
        .update_emulated_instruction = vmx_update_emulated_instruction,
        .set_interrupt_shadow = vmx_set_interrupt_shadow,
        .get_interrupt_shadow = vmx_get_interrupt_shadow,
        .patch_hypercall = vmx_patch_hypercall,
        .set_irq = vmx_inject_irq,
        .set_nmi = vmx_inject_nmi,
        .queue_exception = vmx_queue_exception,
        .cancel_injection = vmx_cancel_injection,
        .interrupt_allowed = vmx_interrupt_allowed,
        .nmi_allowed = vmx_nmi_allowed,
        .get_nmi_mask = vmx_get_nmi_mask,
        .set_nmi_mask = vmx_set_nmi_mask,
        .enable_nmi_window = enable_nmi_window,
        .enable_irq_window = enable_irq_window,
        /* Cleared by hardware_setup() without TPR shadow support. */
        .update_cr8_intercept = update_cr8_intercept,
        .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
        /* Cleared by hardware_setup() when flexpriority is disabled. */
        .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
        .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
        .load_eoi_exitmap = vmx_load_eoi_exitmap,
        .apicv_post_state_restore = vmx_apicv_post_state_restore,
        .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
        .hwapic_irr_update = vmx_hwapic_irr_update,
        .hwapic_isr_update = vmx_hwapic_isr_update,
        .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
        /* Cleared by hardware_setup() without APICv support. */
        .sync_pir_to_irr = vmx_sync_pir_to_irr,
        .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
        .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,

        .set_tss_addr = vmx_set_tss_addr,
        .set_identity_map_addr = vmx_set_identity_map_addr,
        .get_mt_mask = vmx_get_mt_mask,

        .get_exit_info = vmx_get_exit_info,

        .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,

        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,

        .write_l1_tsc_offset = vmx_write_l1_tsc_offset,

        .load_mmu_pgd = vmx_load_mmu_pgd,

        .check_intercept = vmx_check_intercept,
        .handle_exit_irqoff = vmx_handle_exit_irqoff,

        /* Replaced by hardware_setup() when the preemption timer is unused. */
        .request_immediate_exit = vmx_request_immediate_exit,

        .sched_in = vmx_sched_in,

        /* The four dirty-log callbacks are cleared by hardware_setup() without PML. */
        .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
        .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
        .flush_log_dirty = vmx_flush_log_dirty,
        .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,

        .pre_block = vmx_pre_block,
        .post_block = vmx_post_block,

        .pmu_ops = &intel_pmu_ops,
        .nested_ops = &vmx_nested_ops,

        .update_pi_irte = pi_update_irte,

#ifdef CONFIG_X86_64
        /* Cleared by hardware_setup() when the preemption timer is unused. */
        .set_hv_timer = vmx_set_hv_timer,
        .cancel_hv_timer = vmx_cancel_hv_timer,
#endif

        .setup_mce = vmx_setup_mce,

        .smi_allowed = vmx_smi_allowed,
        .pre_enter_smm = vmx_pre_enter_smm,
        .pre_leave_smm = vmx_pre_leave_smm,
        .enable_smi_window = enable_smi_window,

        .can_emulate_instruction = vmx_can_emulate_instruction,
        .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
        .migrate_timers = vmx_migrate_timers,
};
7573
7574 static __init int hardware_setup(void)
7575 {
7576         unsigned long host_bndcfgs;
7577         struct desc_ptr dt;
7578         int r, i, ept_lpage_level;
7579
7580         store_idt(&dt);
7581         host_idt_base = dt.address;
7582
7583         for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
7584                 kvm_define_user_return_msr(i, vmx_msr_index[i]);
7585
7586         if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
7587                 return -EIO;
7588
7589         if (boot_cpu_has(X86_FEATURE_NX))
7590                 kvm_enable_efer_bits(EFER_NX);
7591
7592         if (boot_cpu_has(X86_FEATURE_MPX)) {
7593                 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
7594                 WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
7595         }
7596
7597         if (!cpu_has_vmx_mpx())
7598                 supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
7599                                     XFEATURE_MASK_BNDCSR);
7600
7601         if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
7602             !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
7603                 enable_vpid = 0;
7604
7605         if (!cpu_has_vmx_ept() ||
7606             !cpu_has_vmx_ept_4levels() ||
7607             !cpu_has_vmx_ept_mt_wb() ||
7608             !cpu_has_vmx_invept_global())
7609                 enable_ept = 0;
7610
7611         if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
7612                 enable_ept_ad_bits = 0;
7613
7614         if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
7615                 enable_unrestricted_guest = 0;
7616
7617         if (!cpu_has_vmx_flexpriority())
7618                 flexpriority_enabled = 0;
7619
7620         if (!cpu_has_virtual_nmis())
7621                 enable_vnmi = 0;
7622
7623         /*
7624          * set_apic_access_page_addr() is used to reload apic access
7625          * page upon invalidation.  No need to do anything if not
7626          * using the APIC_ACCESS_ADDR VMCS field.
7627          */
7628         if (!flexpriority_enabled)
7629                 vmx_x86_ops.set_apic_access_page_addr = NULL;
7630
7631         if (!cpu_has_vmx_tpr_shadow())
7632                 vmx_x86_ops.update_cr8_intercept = NULL;
7633
7634 #if IS_ENABLED(CONFIG_HYPERV)
7635         if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
7636             && enable_ept) {
7637                 vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
7638                 vmx_x86_ops.tlb_remote_flush_with_range =
7639                                 hv_remote_flush_tlb_with_range;
7640         }
7641 #endif
7642
7643         if (!cpu_has_vmx_ple()) {
7644                 ple_gap = 0;
7645                 ple_window = 0;
7646                 ple_window_grow = 0;
7647                 ple_window_max = 0;
7648                 ple_window_shrink = 0;
7649         }
7650
7651         if (!cpu_has_vmx_apicv()) {
7652                 enable_apicv = 0;
7653                 vmx_x86_ops.sync_pir_to_irr = NULL;
7654         }
7655
7656         if (cpu_has_vmx_tsc_scaling()) {
7657                 kvm_has_tsc_control = true;
7658                 kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
7659                 kvm_tsc_scaling_ratio_frac_bits = 48;
7660         }
7661
7662         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
7663
7664         if (enable_ept)
7665                 vmx_enable_tdp();
7666
7667         if (!enable_ept)
7668                 ept_lpage_level = 0;
7669         else if (cpu_has_vmx_ept_1g_page())
7670                 ept_lpage_level = PG_LEVEL_1G;
7671         else if (cpu_has_vmx_ept_2m_page())
7672                 ept_lpage_level = PG_LEVEL_2M;
7673         else
7674                 ept_lpage_level = PG_LEVEL_4K;
7675         kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
7676
7677         /*
7678          * Only enable PML when hardware supports PML feature, and both EPT
7679          * and EPT A/D bit features are enabled -- PML depends on them to work.
7680          */
7681         if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
7682                 enable_pml = 0;
7683
7684         if (!enable_pml) {
7685                 vmx_x86_ops.slot_enable_log_dirty = NULL;
7686                 vmx_x86_ops.slot_disable_log_dirty = NULL;
7687                 vmx_x86_ops.flush_log_dirty = NULL;
7688                 vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
7689         }
7690
7691         if (!cpu_has_vmx_preemption_timer())
7692                 enable_preemption_timer = false;
7693
7694         if (enable_preemption_timer) {
7695                 u64 use_timer_freq = 5000ULL * 1000 * 1000;
7696                 u64 vmx_msr;
7697
7698                 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
7699                 cpu_preemption_timer_multi =
7700                         vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
7701
7702                 if (tsc_khz)
7703                         use_timer_freq = (u64)tsc_khz * 1000;
7704                 use_timer_freq >>= cpu_preemption_timer_multi;
7705
7706                 /*
7707                  * KVM "disables" the preemption timer by setting it to its max
7708                  * value.  Don't use the timer if it might cause spurious exits
7709                  * at a rate faster than 0.1 Hz (of uninterrupted guest time).
7710                  */
7711                 if (use_timer_freq > 0xffffffffu / 10)
7712                         enable_preemption_timer = false;
7713         }
7714
7715         if (!enable_preemption_timer) {
7716                 vmx_x86_ops.set_hv_timer = NULL;
7717                 vmx_x86_ops.cancel_hv_timer = NULL;
7718                 vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
7719         }
7720
7721         kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
7722
7723         kvm_mce_cap_supported |= MCG_LMCE_P;
7724
7725         if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
7726                 return -EINVAL;
7727         if (!enable_ept || !cpu_has_vmx_intel_pt())
7728                 pt_mode = PT_MODE_SYSTEM;
7729
7730         if (nested) {
7731                 nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
7732                                            vmx_capability.ept);
7733
7734                 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
7735                 if (r)
7736                         return r;
7737         }
7738
7739         vmx_set_cpu_caps();
7740
7741         r = alloc_kvm_area();
7742         if (r)
7743                 nested_vmx_hardware_unsetup();
7744         return r;
7745 }
7746
/*
 * Init-time operations passed to kvm_init(): support and compatibility
 * checks, the one-time hardware_setup() hook, and a pointer to the runtime
 * callback table.
 */
static struct kvm_x86_init_ops vmx_init_ops __initdata = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
        .check_processor_compatibility = vmx_check_processor_compat,
        .hardware_setup = hardware_setup,

        .runtime_ops = &vmx_x86_ops,
};
7755
/*
 * Free the L1D flush pages, if any were allocated, and reset the L1TF
 * mitigation state to VMENTER_L1D_FLUSH_AUTO.
 */
static void vmx_cleanup_l1d_flush(void)
{
        if (vmx_l1d_flush_pages) {
                free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
                /* Clear the pointer so a repeated cleanup does not free twice. */
                vmx_l1d_flush_pages = NULL;
        }
        /* Restore state so sysfs ignores VMX */
        l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
7765
/*
 * Module exit: unhook the crash-time VMCS clear callback, shut KVM down,
 * reset Hyper-V enlightened VMCS state on every online CPU, and release
 * the L1D flush resources.
 */
static void vmx_exit(void)
{
#ifdef CONFIG_KEXEC_CORE
        /* Clear the callback pointer, then wait for an RCU grace period. */
        RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
        synchronize_rcu();
#endif

        kvm_exit();

#if IS_ENABLED(CONFIG_HYPERV)
        if (static_branch_unlikely(&enable_evmcs)) {
                int cpu;
                struct hv_vp_assist_page *vp_ap;
                /*
                 * Reset everything to support using non-enlightened VMCS
                 * access later (e.g. when we reload the module with
                 * enlightened_vmcs=0)
                 */
                for_each_online_cpu(cpu) {
                        vp_ap = hv_get_vp_assist_page(cpu);

                        /* CPUs without an assist page have nothing to reset. */
                        if (!vp_ap)
                                continue;

                        vp_ap->nested_control.features.directhypercall = 0;
                        vp_ap->current_nested_vmcs = 0;
                        vp_ap->enlighten_vmentry = 0;
                }

                static_branch_disable(&enable_evmcs);
        }
#endif
        vmx_cleanup_l1d_flush();
}
module_exit(vmx_exit);
7801
7802 static int __init vmx_init(void)
7803 {
7804         int r, cpu;
7805
7806 #if IS_ENABLED(CONFIG_HYPERV)
7807         /*
7808          * Enlightened VMCS usage should be recommended and the host needs
7809          * to support eVMCS v1 or above. We can also disable eVMCS support
7810          * with module parameter.
7811          */
7812         if (enlightened_vmcs &&
7813             ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
7814             (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
7815             KVM_EVMCS_VERSION) {
7816                 int cpu;
7817
7818                 /* Check that we have assist pages on all online CPUs */
7819                 for_each_online_cpu(cpu) {
7820                         if (!hv_get_vp_assist_page(cpu)) {
7821                                 enlightened_vmcs = false;
7822                                 break;
7823                         }
7824                 }
7825
7826                 if (enlightened_vmcs) {
7827                         pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
7828                         static_branch_enable(&enable_evmcs);
7829                 }
7830
7831                 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
7832                         vmx_x86_ops.enable_direct_tlbflush
7833                                 = hv_enable_direct_tlbflush;
7834
7835         } else {
7836                 enlightened_vmcs = false;
7837         }
7838 #endif
7839
7840         r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
7841                      __alignof__(struct vcpu_vmx), THIS_MODULE);
7842         if (r)
7843                 return r;
7844
7845         /*
7846          * Must be called after kvm_init() so enable_ept is properly set
7847          * up. Hand the parameter mitigation value in which was stored in
7848          * the pre module init parser. If no parameter was given, it will
7849          * contain 'auto' which will be turned into the default 'cond'
7850          * mitigation mode.
7851          */
7852         r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
7853         if (r) {
7854                 vmx_exit();
7855                 return r;
7856         }
7857
7858         for_each_possible_cpu(cpu) {
7859                 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
7860
7861                 pi_init(cpu);
7862         }
7863
7864 #ifdef CONFIG_KEXEC_CORE
7865         rcu_assign_pointer(crash_vmclear_loaded_vmcss,
7866                            crash_vmclear_local_loaded_vmcss);
7867 #endif
7868         vmx_check_vmcs12_offsets();
7869
7870         /*
7871          * Intel processors don't have problems with
7872          * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable
7873          * it for VMX by default
7874          */
7875         allow_smaller_maxphyaddr = true;
7876
7877         return 0;
7878 }
7879 module_init(vmx_init);