Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
author    Linus Torvalds <torvalds@linux-foundation.org>
          Sun, 26 Aug 2018 17:13:21 +0000 (10:13 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Sun, 26 Aug 2018 17:13:21 +0000 (10:13 -0700)
Pull x86 fixes from Thomas Gleixner:

 - Correct the L1TF fallout on 32bit and the off-by-one in the 'too much
   RAM for protection' calculation (see the sketch after the shortlog
   below).

 - Add a helpful kernel message for the 'too much RAM' case

 - Unbreak the VDSO in case the compiler decides to use indirect
   jumps/calls and emits retpolines which cannot be resolved because the
   kernel uses its own thunks, which do not work for the VDSO. Make it
   use the builtin thunks.

 - Re-export start_thread() which was unexported when the 32/64bit
   implementation was unified. start_thread() is required by modular
   binfmt handlers.

 - Trivial cleanups

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/speculation/l1tf: Suggest what to do on systems with too much RAM
  x86/speculation/l1tf: Fix off-by-one error when warning that system has too much RAM
  x86/kvm/vmx: Remove duplicate l1d flush definitions
  x86/speculation/l1tf: Fix overflow in l1tf_pfn_limit() on 32bit
  x86/process: Re-export start_thread()
  x86/mce: Add notifier_block forward declaration
  x86/vdso: Fix vDSO build if a retpoline is emitted
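
A minimal sketch of the 32bit overflow class addressed by the
l1tf_pfn_limit() fix above (illustrative only, not the upstream patch;
'cache_bits' stands in for boot_cpu_data.x86_cache_bits):

/*
 * Illustrative sketch, not the upstream patch.  BIT(n) is computed in
 * unsigned long, which is 32 bits wide on a 32bit build, so a shift
 * count derived from the CPU's cache address bits can exceed the type
 * width.  BIT_ULL(n) keeps the arithmetic in 64 bits.  Build with -m32
 * to reproduce the narrow unsigned long.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define BIT(n)		(1UL << (n))	/* width of unsigned long */
#define BIT_ULL(n)	(1ULL << (n))	/* always 64 bits wide */

static unsigned long long pfn_limit_bad(unsigned int cache_bits)
{
	/* shift count is cache_bits - 13; >= 32 overflows on 32bit */
	return BIT(cache_bits - 1 - PAGE_SHIFT);
}

static unsigned long long pfn_limit_good(unsigned int cache_bits)
{
	return BIT_ULL(cache_bits - 1 - PAGE_SHIFT);
}

int main(void)
{
	unsigned int cache_bits = 46;	/* typical on recent CPUs */

	printf("bad:  %#llx\n", pfn_limit_bad(cache_bits));
	printf("good: %#llx\n", pfn_limit_good(cache_bits));
	return 0;
}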

Makefile
arch/x86/kvm/vmx.c
arch/x86/mm/init.c

diff --combined Makefile
+++ b/Makefile
@@@ -440,7 -440,7 +440,7 @@@ KBUILD_CFLAGS_KERNEL :
  KBUILD_AFLAGS_MODULE  := -DMODULE
  KBUILD_CFLAGS_MODULE  := -DMODULE
  KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds
 -LDFLAGS :=
 +KBUILD_LDFLAGS :=
  GCC_PLUGINS_CFLAGS :=
  
  export ARCH SRCARCH CONFIG_SHELL HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE AS LD CC
@@@ -448,7 -448,7 +448,7 @@@ export CPP AR NM STRIP OBJCOPY OBJDUMP 
  export MAKE LEX YACC AWK GENKSYMS INSTALLKERNEL PERL PYTHON PYTHON2 PYTHON3 UTS_MACHINE
  export HOSTCXX KBUILD_HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
  
 -export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
 +export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS KBUILD_LDFLAGS
  export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE
  export CFLAGS_KASAN CFLAGS_KASAN_NOSANITIZE CFLAGS_UBSAN
  export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
@@@ -507,9 -507,13 +507,13 @@@ KBUILD_AFLAGS += $(call cc-option, -no-
  endif
  
  RETPOLINE_CFLAGS_GCC := -mindirect-branch=thunk-extern -mindirect-branch-register
+ RETPOLINE_VDSO_CFLAGS_GCC := -mindirect-branch=thunk-inline -mindirect-branch-register
  RETPOLINE_CFLAGS_CLANG := -mretpoline-external-thunk
+ RETPOLINE_VDSO_CFLAGS_CLANG := -mretpoline
  RETPOLINE_CFLAGS := $(call cc-option,$(RETPOLINE_CFLAGS_GCC),$(call cc-option,$(RETPOLINE_CFLAGS_CLANG)))
+ RETPOLINE_VDSO_CFLAGS := $(call cc-option,$(RETPOLINE_VDSO_CFLAGS_GCC),$(call cc-option,$(RETPOLINE_VDSO_CFLAGS_CLANG)))
  export RETPOLINE_CFLAGS
+ export RETPOLINE_VDSO_CFLAGS
  
  KBUILD_CFLAGS += $(call cc-option,-fno-PIE)
  KBUILD_AFLAGS += $(call cc-option,-fno-PIE)
@@@ -754,28 -758,12 +758,28 @@@ ifdef CONFIG_FUNCTION_TRACE
  ifndef CC_FLAGS_FTRACE
  CC_FLAGS_FTRACE := -pg
  endif
 -export CC_FLAGS_FTRACE
 +ifdef CONFIG_FTRACE_MCOUNT_RECORD
 +  # gcc 5 supports generating the mcount tables directly
 +  ifeq ($(call cc-option-yn,-mrecord-mcount),y)
 +    CC_FLAGS_FTRACE   += -mrecord-mcount
 +    export CC_USING_RECORD_MCOUNT := 1
 +  endif
 +  ifdef CONFIG_HAVE_NOP_MCOUNT
 +    ifeq ($(call cc-option-yn, -mnop-mcount),y)
 +      CC_FLAGS_FTRACE += -mnop-mcount
 +      CC_FLAGS_USING  += -DCC_USING_NOP_MCOUNT
 +    endif
 +  endif
 +endif
  ifdef CONFIG_HAVE_FENTRY
 -CC_USING_FENTRY       := $(call cc-option, -mfentry -DCC_USING_FENTRY)
 +  ifeq ($(call cc-option-yn, -mfentry),y)
 +    CC_FLAGS_FTRACE   += -mfentry
 +    CC_FLAGS_USING    += -DCC_USING_FENTRY
 +  endif
  endif
 -KBUILD_CFLAGS += $(CC_FLAGS_FTRACE) $(CC_USING_FENTRY)
 -KBUILD_AFLAGS += $(CC_USING_FENTRY)
 +export CC_FLAGS_FTRACE
 +KBUILD_CFLAGS += $(CC_FLAGS_FTRACE) $(CC_FLAGS_USING)
 +KBUILD_AFLAGS += $(CC_FLAGS_USING)
  ifdef CONFIG_DYNAMIC_FTRACE
        ifdef CONFIG_HAVE_C_RECORDMCOUNT
                BUILD_C_RECORDMCOUNT := y
@@@ -790,8 -778,8 +794,8 @@@ KBUILD_CFLAGS += $(call cc-option, -fno
  endif
  
  ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
 -KBUILD_CFLAGS_KERNEL  += $(call cc-option,-ffunction-sections,)
 -KBUILD_CFLAGS_KERNEL  += $(call cc-option,-fdata-sections,)
 +KBUILD_CFLAGS_KERNEL += -ffunction-sections -fdata-sections
 +LDFLAGS_vmlinux += --gc-sections
  endif
  
  # arch Makefile may override CC so keep this after arch Makefile is included
@@@ -857,6 -845,10 +861,6 @@@ LDFLAGS_BUILD_ID := $(call ld-option, -
  KBUILD_LDFLAGS_MODULE += $(LDFLAGS_BUILD_ID)
  LDFLAGS_vmlinux += $(LDFLAGS_BUILD_ID)
  
 -ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
 -LDFLAGS_vmlinux       += $(call ld-option, --gc-sections,)
 -endif
 -
  ifeq ($(CONFIG_STRIP_ASM_SYMS),y)
  LDFLAGS_vmlinux       += $(call ld-option, -X,)
  endif
@@@ -1020,7 -1012,7 +1024,7 @@@ ARCH_POSTLINK := $(wildcard $(srctree)/
  
  # Final link of vmlinux with optional arch pass after final link
  cmd_link-vmlinux =                                                 \
 -      $(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux) ;    \
 +      $(CONFIG_SHELL) $< $(LD) $(KBUILD_LDFLAGS) $(LDFLAGS_vmlinux) ;    \
        $(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true)
  
  vmlinux: scripts/link-vmlinux.sh autoksyms_recursive $(vmlinux-deps) FORCE
@@@ -1350,12 -1342,16 +1354,12 @@@ distclean: mrprope
  
  # Packaging of the kernel to various formats
  # ---------------------------------------------------------------------------
 -# rpm target kept for backward compatibility
  package-dir   := scripts/package
  
  %src-pkg: FORCE
        $(Q)$(MAKE) $(build)=$(package-dir) $@
  %pkg: include/config/kernel.release FORCE
        $(Q)$(MAKE) $(build)=$(package-dir) $@
 -rpm: rpm-pkg
 -      @echo "  WARNING: \"rpm\" target will be removed after Linux 4.18"
 -      @echo "           Please use \"rpm-pkg\" instead."
  
  
  # Brief documentation of the typical targets used
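
The RETPOLINE_VDSO_CFLAGS hunk above is what the vDSO fix in the
shortlog relies on: with -mindirect-branch=thunk-extern every indirect
branch becomes a call to an out-of-line __x86_indirect_thunk_* symbol
that only the kernel image provides, which the separately linked vDSO
cannot resolve.  A hypothetical sketch (vdso_dispatch and its function
pointer type are made up for illustration):

/*
 * Hypothetical vDSO-style code, for illustration only.
 *
 * Compiled with -mindirect-branch=thunk-extern, the indirect call below
 * becomes "call __x86_indirect_thunk_rax" (or another register variant).
 * That thunk lives in the kernel image, not in the vDSO, so the vDSO
 * link cannot resolve it.  The thunk-inline / -mretpoline variants from
 * the hunk above keep the retpoline self-contained in the object,
 * needing no symbol from the kernel image.
 */
typedef long (*vdso_clock_fn)(long clkid, void *ts);

long vdso_dispatch(vdso_clock_fn fn, long clkid, void *ts)
{
	return fn(clkid, ts);	/* indirect call -> retpoline */
}
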
diff --combined arch/x86/kvm/vmx.c
@@@ -38,7 -38,6 +38,7 @@@
  #include "kvm_cache_regs.h"
  #include "x86.h"
  
 +#include <asm/asm.h>
  #include <asm/cpu.h>
  #include <asm/io.h>
  #include <asm/desc.h>
@@@ -198,14 -197,12 +198,14 @@@ static enum vmx_l1d_flush_state __read_
  
  static const struct {
        const char *option;
 -      enum vmx_l1d_flush_state cmd;
 +      bool for_parse;
  } vmentry_l1d_param[] = {
 -      {"auto",        VMENTER_L1D_FLUSH_AUTO},
 -      {"never",       VMENTER_L1D_FLUSH_NEVER},
 -      {"cond",        VMENTER_L1D_FLUSH_COND},
 -      {"always",      VMENTER_L1D_FLUSH_ALWAYS},
 +      [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
 +      [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
 +      [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
 +      [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
 +      [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
 +      [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
  };
  
  #define L1D_CACHE_ORDER 4
@@@ -221,15 -218,15 +221,15 @@@ static int vmx_setup_l1d_flush(enum vmx
                return 0;
        }
  
 -       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
 -             u64 msr;
 +      if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
 +              u64 msr;
  
 -             rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
 -             if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
 -                     l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
 -                     return 0;
 -             }
 -       }
 +              rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
 +              if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
 +                      l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
 +                      return 0;
 +              }
 +      }
  
        /* If set to auto use the default l1tf mitigation method */
        if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
@@@ -289,9 -286,8 +289,9 @@@ static int vmentry_l1d_flush_parse(cons
  
        if (s) {
                for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
 -                      if (sysfs_streq(s, vmentry_l1d_param[i].option))
 -                              return vmentry_l1d_param[i].cmd;
 +                      if (vmentry_l1d_param[i].for_parse &&
 +                          sysfs_streq(s, vmentry_l1d_param[i].option))
 +                              return i;
                }
        }
        return -EINVAL;
@@@ -301,13 -297,13 +301,13 @@@ static int vmentry_l1d_flush_set(const 
  {
        int l1tf, ret;
  
 -      if (!boot_cpu_has(X86_BUG_L1TF))
 -              return 0;
 -
        l1tf = vmentry_l1d_flush_parse(s);
        if (l1tf < 0)
                return l1tf;
  
 +      if (!boot_cpu_has(X86_BUG_L1TF))
 +              return 0;
 +
        /*
         * Has vmx_init() run already? If not then this is the pre init
         * parameter parsing. In that case just store the value and let
  
  static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
  {
 +      if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
 +              return sprintf(s, "???\n");
 +
        return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
  }
  
@@@ -339,53 -332,22 +339,53 @@@ static const struct kernel_param_ops vm
  };
  module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
  
 +enum ept_pointers_status {
 +      EPT_POINTERS_CHECK = 0,
 +      EPT_POINTERS_MATCH = 1,
 +      EPT_POINTERS_MISMATCH = 2
 +};
 +
  struct kvm_vmx {
        struct kvm kvm;
  
        unsigned int tss_addr;
        bool ept_identity_pagetable_done;
        gpa_t ept_identity_map_addr;
 +
 +      enum ept_pointers_status ept_pointers_match;
 +      spinlock_t ept_pointer_lock;
  };
  
  #define NR_AUTOLOAD_MSRS 8
  
 +struct vmcs_hdr {
 +      u32 revision_id:31;
 +      u32 shadow_vmcs:1;
 +};
 +
  struct vmcs {
 -      u32 revision_id;
 +      struct vmcs_hdr hdr;
        u32 abort;
        char data[0];
  };
  
 +/*
 + * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
 + * and whose values change infrequently, but are not constant.  I.e. this is
 + * used as a write-through cache of the corresponding VMCS fields.
 + */
 +struct vmcs_host_state {
 +      unsigned long cr3;      /* May not match real cr3 */
 +      unsigned long cr4;      /* May not match real cr4 */
 +      unsigned long gs_base;
 +      unsigned long fs_base;
 +
 +      u16           fs_sel, gs_sel, ldt_sel;
 +#ifdef CONFIG_X86_64
 +      u16           ds_sel, es_sel;
 +#endif
 +};
 +
  /*
   * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
   * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
@@@ -397,13 -359,14 +397,13 @@@ struct loaded_vmcs 
        int cpu;
        bool launched;
        bool nmi_known_unmasked;
 -      unsigned long vmcs_host_cr3;    /* May not match real cr3 */
 -      unsigned long vmcs_host_cr4;    /* May not match real cr4 */
        /* Support for vnmi-less CPUs */
        int soft_vnmi_blocked;
        ktime_t entry_time;
        s64 vnmi_blocked_time;
        unsigned long *msr_bitmap;
        struct list_head loaded_vmcss_on_cpu_link;
 +      struct vmcs_host_state host_state;
  };
  
  struct shared_msr_entry {
@@@ -434,7 -397,7 +434,7 @@@ struct __packed vmcs12 
        /* According to the Intel spec, a VMCS region must start with the
         * following two fields. Then follow implementation-specific data.
         */
 -      u32 revision_id;
 +      struct vmcs_hdr hdr;
        u32 abort;
  
        u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
                "Offset of " #field " in struct vmcs12 has changed.")
  
  static inline void vmx_check_vmcs12_offsets(void) {
 -      CHECK_OFFSET(revision_id, 0);
 +      CHECK_OFFSET(hdr, 0);
        CHECK_OFFSET(abort, 4);
        CHECK_OFFSET(launch_state, 8);
        CHECK_OFFSET(io_bitmap_a, 40);
@@@ -820,12 -783,6 +820,12 @@@ struct nested_vmx 
         * memory during VMCLEAR and VMPTRLD.
         */
        struct vmcs12 *cached_vmcs12;
 +      /*
 +       * Cache of the guest's shadow VMCS, existing outside of guest
 +       * memory. Loaded from guest memory during VM entry. Flushed
 +       * to guest memory during VM exit.
 +       */
 +      struct vmcs12 *cached_shadow_vmcs12;
        /*
         * Indicates if the shadow vmcs must be updated with the
         * data hold by vmcs12
@@@ -976,20 -933,25 +976,20 @@@ struct vcpu_vmx 
        /*
         * loaded_vmcs points to the VMCS currently used in this vcpu. For a
         * non-nested (L1) guest, it always points to vmcs01. For a nested
 -       * guest (L2), it points to a different VMCS.
 +       * guest (L2), it points to a different VMCS.  loaded_cpu_state points
 +       * to the VMCS whose state is loaded into the CPU registers that only
 +       * need to be switched when transitioning to/from the kernel; a NULL
 +       * value indicates that host state is loaded.
         */
        struct loaded_vmcs    vmcs01;
        struct loaded_vmcs   *loaded_vmcs;
 +      struct loaded_vmcs   *loaded_cpu_state;
        bool                  __launched; /* temporary, used in vmx_vcpu_run */
        struct msr_autoload {
                struct vmx_msrs guest;
                struct vmx_msrs host;
        } msr_autoload;
 -      struct {
 -              int           loaded;
 -              u16           fs_sel, gs_sel, ldt_sel;
 -#ifdef CONFIG_X86_64
 -              u16           ds_sel, es_sel;
 -#endif
 -              int           gs_ldt_reload_needed;
 -              int           fs_reload_needed;
 -              u64           msr_host_bndcfgs;
 -      } host_state;
 +
        struct {
                int vm86_active;
                ulong save_rflags;
         */
        u64 msr_ia32_feature_control;
        u64 msr_ia32_feature_control_valid_bits;
 +      u64 ept_pointer;
  };
  
  enum segment_cache_field {
@@@ -1259,11 -1220,6 +1259,11 @@@ static inline struct vmcs12 *get_vmcs12
        return to_vmx(vcpu)->nested.cached_vmcs12;
  }
  
 +static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
 +{
 +      return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
 +}
 +
  static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
  static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
  static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
@@@ -1534,48 -1490,6 +1534,48 @@@ static void evmcs_sanitize_exec_ctrls(s
         *      GUEST_IA32_RTIT_CTL             = 0x00002814,
         */
  }
 +
 +/* check_ept_pointer() should be under protection of ept_pointer_lock. */
 +static void check_ept_pointer_match(struct kvm *kvm)
 +{
 +      struct kvm_vcpu *vcpu;
 +      u64 tmp_eptp = INVALID_PAGE;
 +      int i;
 +
 +      kvm_for_each_vcpu(i, vcpu, kvm) {
 +              if (!VALID_PAGE(tmp_eptp)) {
 +                      tmp_eptp = to_vmx(vcpu)->ept_pointer;
 +              } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
 +                      to_kvm_vmx(kvm)->ept_pointers_match
 +                              = EPT_POINTERS_MISMATCH;
 +                      return;
 +              }
 +      }
 +
 +      to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
 +}
 +
 +static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
 +{
 +      int ret;
 +
 +      spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
 +
 +      if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
 +              check_ept_pointer_match(kvm);
 +
 +      if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
 +              ret = -ENOTSUPP;
 +              goto out;
 +      }
 +
 +      ret = hyperv_flush_guest_mapping(
 +                      to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer);
 +
 +out:
 +      spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
 +      return ret;
 +}
  #else /* !IS_ENABLED(CONFIG_HYPERV) */
  static inline void evmcs_write64(unsigned long field, u64 value) {}
  static inline void evmcs_write32(unsigned long field, u32 value) {}
@@@ -1690,12 -1604,6 +1690,12 @@@ static inline bool cpu_has_vmx_virtual_
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
  }
  
 +static inline bool cpu_has_vmx_encls_vmexit(void)
 +{
 +      return vmcs_config.cpu_based_2nd_exec_ctrl &
 +              SECONDARY_EXEC_ENCLS_EXITING;
 +}
 +
  /*
   * Comment's format: document - errata name - stepping - processor name.
   * Refer from
@@@ -1956,12 -1864,6 +1956,12 @@@ static inline bool nested_cpu_supports_
                        CPU_BASED_MONITOR_TRAP_FLAG;
  }
  
 +static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
 +{
 +      return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
 +              SECONDARY_EXEC_SHADOW_VMCS;
 +}
 +
  static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
  {
        return vmcs12->cpu_based_vm_exec_control & bit;
@@@ -2042,11 -1944,6 +2042,11 @@@ static inline bool nested_cpu_has_eptp_
                 VMX_VMFUNC_EPTP_SWITCHING);
  }
  
 +static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
 +{
 +      return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
 +}
 +
  static inline bool is_nmi(u32 intr_info)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@@ -2077,12 -1974,11 +2077,12 @@@ static inline void __invvpid(int ext, u
        u64 rsvd : 48;
        u64 gva;
      } operand = { vpid, 0, gva };
 +    bool error;
  
 -    asm volatile (__ex(ASM_VMX_INVVPID)
 -                /* CF==1 or ZF==1 --> rc = -1 */
 -                "; ja 1f ; ud2 ; 1:"
 -                : : "a"(&operand), "c"(ext) : "cc", "memory");
 +    asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na)
 +                : CC_OUT(na) (error) : "a"(&operand), "c"(ext)
 +                : "memory");
 +    BUG_ON(error);
  }
  
  static inline void __invept(int ext, u64 eptp, gpa_t gpa)
        struct {
                u64 eptp, gpa;
        } operand = {eptp, gpa};
 +      bool error;
  
 -      asm volatile (__ex(ASM_VMX_INVEPT)
 -                      /* CF==1 or ZF==1 --> rc = -1 */
 -                      "; ja 1f ; ud2 ; 1:\n"
 -                      : : "a" (&operand), "c" (ext) : "cc", "memory");
 +      asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na)
 +                    : CC_OUT(na) (error) : "a" (&operand), "c" (ext)
 +                    : "memory");
 +      BUG_ON(error);
  }
  
  static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
  static void vmcs_clear(struct vmcs *vmcs)
  {
        u64 phys_addr = __pa(vmcs);
 -      u8 error;
 +      bool error;
  
 -      asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
 -                    : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 -                    : "cc", "memory");
 -      if (error)
 +      asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na)
 +                    : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
 +                    : "memory");
 +      if (unlikely(error))
                printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
                       vmcs, phys_addr);
  }
@@@ -2133,15 -2028,15 +2133,15 @@@ static inline void loaded_vmcs_init(str
  static void vmcs_load(struct vmcs *vmcs)
  {
        u64 phys_addr = __pa(vmcs);
 -      u8 error;
 +      bool error;
  
        if (static_branch_unlikely(&enable_evmcs))
                return evmcs_load(phys_addr);
  
 -      asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
 -                      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 -                      : "cc", "memory");
 -      if (error)
 +      asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na)
 +                    : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
 +                    : "memory");
 +      if (unlikely(error))
                printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
                       vmcs, phys_addr);
  }
@@@ -2219,19 -2114,6 +2219,19 @@@ static void loaded_vmcs_clear(struct lo
                         __loaded_vmcs_clear, loaded_vmcs, 1);
  }
  
 +static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
 +{
 +      if (vpid == 0)
 +              return true;
 +
 +      if (cpu_has_vmx_invvpid_individual_addr()) {
 +              __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
 +              return true;
 +      }
 +
 +      return false;
 +}
 +
  static inline void vpid_sync_vcpu_single(int vpid)
  {
        if (vpid == 0)
@@@ -2366,10 -2248,10 +2366,10 @@@ static noinline void vmwrite_error(unsi
  
  static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
  {
 -      u8 error;
 +      bool error;
  
 -      asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
 -                     : "=q"(error) : "a"(value), "d"(field) : "cc");
 +      asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na)
 +                    : CC_OUT(na) (error) : "a"(value), "d"(field));
        if (unlikely(error))
                vmwrite_error(field, value);
  }
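
The CC_SET()/CC_OUT() conversions in the hunks above and below replace
the old '"; setna %0"' pattern plus "cc" clobber with the compiler's
flag-output operands where available.  A standalone sketch of the
underlying constraint, independent of the kernel's asm.h macros:

/*
 * Standalone sketch of the asm flag-output feature that CC_SET()/CC_OUT()
 * build on (GCC 6+ and clang on x86).  "=@ccna" lets the compiler read
 * the "not above" condition (CF=1 || ZF=1) directly from EFLAGS after
 * the asm, instead of an explicit SETNA instruction plus a "cc" clobber.
 */
#include <stdbool.h>
#include <stdio.h>

static bool below_or_equal(unsigned long a, unsigned long b)
{
	bool na;

	asm ("cmp %2, %1"		/* sets flags from a - b */
	     : "=@ccna" (na)
	     : "r" (a), "r" (b));
	return na;			/* true when a <= b (unsigned) */
}

int main(void)
{
	printf("%d %d\n", below_or_equal(1, 2), below_or_equal(3, 2));
	return 0;
}
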
@@@ -2853,150 -2735,121 +2853,150 @@@ static unsigned long segment_base(u16 s
  }
  #endif
  
 -static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 +static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 +      struct vmcs_host_state *host_state;
  #ifdef CONFIG_X86_64
        int cpu = raw_smp_processor_id();
 -      unsigned long fs_base, kernel_gs_base;
  #endif
 +      unsigned long fs_base, gs_base;
 +      u16 fs_sel, gs_sel;
        int i;
  
 -      if (vmx->host_state.loaded)
 +      if (vmx->loaded_cpu_state)
                return;
  
 -      vmx->host_state.loaded = 1;
 +      vmx->loaded_cpu_state = vmx->loaded_vmcs;
 +      host_state = &vmx->loaded_cpu_state->host_state;
 +
        /*
         * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
         * allow segment selectors with cpl > 0 or ti == 1.
         */
 -      vmx->host_state.ldt_sel = kvm_read_ldt();
 -      vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
 +      host_state->ldt_sel = kvm_read_ldt();
  
  #ifdef CONFIG_X86_64
 +      savesegment(ds, host_state->ds_sel);
 +      savesegment(es, host_state->es_sel);
 +
 +      gs_base = cpu_kernelmode_gs_base(cpu);
        if (likely(is_64bit_mm(current->mm))) {
                save_fsgs_for_kvm();
 -              vmx->host_state.fs_sel = current->thread.fsindex;
 -              vmx->host_state.gs_sel = current->thread.gsindex;
 +              fs_sel = current->thread.fsindex;
 +              gs_sel = current->thread.gsindex;
                fs_base = current->thread.fsbase;
 -              kernel_gs_base = current->thread.gsbase;
 +              vmx->msr_host_kernel_gs_base = current->thread.gsbase;
        } else {
 -#endif
 -              savesegment(fs, vmx->host_state.fs_sel);
 -              savesegment(gs, vmx->host_state.gs_sel);
 -#ifdef CONFIG_X86_64
 +              savesegment(fs, fs_sel);
 +              savesegment(gs, gs_sel);
                fs_base = read_msr(MSR_FS_BASE);
 -              kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
 -      }
 -#endif
 -      if (!(vmx->host_state.fs_sel & 7)) {
 -              vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
 -              vmx->host_state.fs_reload_needed = 0;
 -      } else {
 -              vmcs_write16(HOST_FS_SELECTOR, 0);
 -              vmx->host_state.fs_reload_needed = 1;
 -      }
 -      if (!(vmx->host_state.gs_sel & 7))
 -              vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
 -      else {
 -              vmcs_write16(HOST_GS_SELECTOR, 0);
 -              vmx->host_state.gs_ldt_reload_needed = 1;
 +              vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
        }
  
 -#ifdef CONFIG_X86_64
 -      savesegment(ds, vmx->host_state.ds_sel);
 -      savesegment(es, vmx->host_state.es_sel);
 -
 -      vmcs_writel(HOST_FS_BASE, fs_base);
 -      vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
 -
 -      vmx->msr_host_kernel_gs_base = kernel_gs_base;
        if (is_long_mode(&vmx->vcpu))
                wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
  #else
 -      vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
 -      vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
 +      savesegment(fs, fs_sel);
 +      savesegment(gs, gs_sel);
 +      fs_base = segment_base(fs_sel);
 +      gs_base = segment_base(gs_sel);
  #endif
 -      if (boot_cpu_has(X86_FEATURE_MPX))
 -              rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
 +
 +      if (unlikely(fs_sel != host_state->fs_sel)) {
 +              if (!(fs_sel & 7))
 +                      vmcs_write16(HOST_FS_SELECTOR, fs_sel);
 +              else
 +                      vmcs_write16(HOST_FS_SELECTOR, 0);
 +              host_state->fs_sel = fs_sel;
 +      }
 +      if (unlikely(gs_sel != host_state->gs_sel)) {
 +              if (!(gs_sel & 7))
 +                      vmcs_write16(HOST_GS_SELECTOR, gs_sel);
 +              else
 +                      vmcs_write16(HOST_GS_SELECTOR, 0);
 +              host_state->gs_sel = gs_sel;
 +      }
 +      if (unlikely(fs_base != host_state->fs_base)) {
 +              vmcs_writel(HOST_FS_BASE, fs_base);
 +              host_state->fs_base = fs_base;
 +      }
 +      if (unlikely(gs_base != host_state->gs_base)) {
 +              vmcs_writel(HOST_GS_BASE, gs_base);
 +              host_state->gs_base = gs_base;
 +      }
 +
        for (i = 0; i < vmx->save_nmsrs; ++i)
                kvm_set_shared_msr(vmx->guest_msrs[i].index,
                                   vmx->guest_msrs[i].data,
                                   vmx->guest_msrs[i].mask);
  }
  
 -static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 +static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
  {
 -      if (!vmx->host_state.loaded)
 +      struct vmcs_host_state *host_state;
 +
 +      if (!vmx->loaded_cpu_state)
                return;
  
 +      WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
 +      host_state = &vmx->loaded_cpu_state->host_state;
 +
        ++vmx->vcpu.stat.host_state_reload;
 -      vmx->host_state.loaded = 0;
 +      vmx->loaded_cpu_state = NULL;
 +
  #ifdef CONFIG_X86_64
        if (is_long_mode(&vmx->vcpu))
                rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
  #endif
 -      if (vmx->host_state.gs_ldt_reload_needed) {
 -              kvm_load_ldt(vmx->host_state.ldt_sel);
 +      if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
 +              kvm_load_ldt(host_state->ldt_sel);
  #ifdef CONFIG_X86_64
 -              load_gs_index(vmx->host_state.gs_sel);
 +              load_gs_index(host_state->gs_sel);
  #else
 -              loadsegment(gs, vmx->host_state.gs_sel);
 +              loadsegment(gs, host_state->gs_sel);
  #endif
        }
 -      if (vmx->host_state.fs_reload_needed)
 -              loadsegment(fs, vmx->host_state.fs_sel);
 +      if (host_state->fs_sel & 7)
 +              loadsegment(fs, host_state->fs_sel);
  #ifdef CONFIG_X86_64
 -      if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
 -              loadsegment(ds, vmx->host_state.ds_sel);
 -              loadsegment(es, vmx->host_state.es_sel);
 +      if (unlikely(host_state->ds_sel | host_state->es_sel)) {
 +              loadsegment(ds, host_state->ds_sel);
 +              loadsegment(es, host_state->es_sel);
        }
  #endif
        invalidate_tss_limit();
  #ifdef CONFIG_X86_64
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
  #endif
 -      if (vmx->host_state.msr_host_bndcfgs)
 -              wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
        load_fixmap_gdt(raw_smp_processor_id());
  }
  
 -static void vmx_load_host_state(struct vcpu_vmx *vmx)
 +#ifdef CONFIG_X86_64
 +static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
  {
 -      preempt_disable();
 -      __vmx_load_host_state(vmx);
 -      preempt_enable();
 +      if (is_long_mode(&vmx->vcpu)) {
 +              preempt_disable();
 +              if (vmx->loaded_cpu_state)
 +                      rdmsrl(MSR_KERNEL_GS_BASE,
 +                             vmx->msr_guest_kernel_gs_base);
 +              preempt_enable();
 +      }
 +      return vmx->msr_guest_kernel_gs_base;
  }
  
 +static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
 +{
 +      if (is_long_mode(&vmx->vcpu)) {
 +              preempt_disable();
 +              if (vmx->loaded_cpu_state)
 +                      wrmsrl(MSR_KERNEL_GS_BASE, data);
 +              preempt_enable();
 +      }
 +      vmx->msr_guest_kernel_gs_base = data;
 +}
 +#endif
 +
  static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
  {
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@@ -3138,7 -2991,7 +3138,7 @@@ static void vmx_vcpu_put(struct kvm_vcp
  {
        vmx_vcpu_pi_put(vcpu);
  
 -      __vmx_load_host_state(to_vmx(vcpu));
 +      vmx_prepare_switch_to_host(to_vmx(vcpu));
  }
  
  static bool emulation_required(struct kvm_vcpu *vcpu)
@@@ -3359,7 -3212,7 +3359,7 @@@ static bool vmx_rdtscp_supported(void
  
  static bool vmx_invpcid_supported(void)
  {
 -      return cpu_has_vmx_invpcid() && enable_ept;
 +      return cpu_has_vmx_invpcid();
  }
  
  /*
@@@ -3602,12 -3455,6 +3602,12 @@@ static void nested_vmx_setup_ctls_msrs(
                SECONDARY_EXEC_APIC_REGISTER_VIRT |
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                SECONDARY_EXEC_WBINVD_EXITING;
 +      /*
 +       * We can emulate "VMCS shadowing," even if the hardware
 +       * doesn't support it.
 +       */
 +      msrs->secondary_ctls_high |=
 +              SECONDARY_EXEC_SHADOW_VMCS;
  
        if (enable_ept) {
                /* nested EPT: emulate EPT also to L1 */
@@@ -4075,7 -3922,8 +4075,7 @@@ static int vmx_get_msr(struct kvm_vcpu 
                msr_info->data = vmcs_readl(GUEST_GS_BASE);
                break;
        case MSR_KERNEL_GS_BASE:
 -              vmx_load_host_state(vmx);
 -              msr_info->data = vmx->msr_guest_kernel_gs_base;
 +              msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
                break;
  #endif
        case MSR_EFER:
@@@ -4175,7 -4023,8 +4175,7 @@@ static int vmx_set_msr(struct kvm_vcpu 
                vmcs_writel(GUEST_GS_BASE, data);
                break;
        case MSR_KERNEL_GS_BASE:
 -              vmx_load_host_state(vmx);
 -              vmx->msr_guest_kernel_gs_base = data;
 +              vmx_write_guest_kernel_gs_base(vmx, data);
                break;
  #endif
        case MSR_IA32_SYSENTER_CS:
@@@ -4563,8 -4412,7 +4563,8 @@@ static __init int setup_vmcs_config(str
                        SECONDARY_EXEC_RDRAND_EXITING |
                        SECONDARY_EXEC_ENABLE_PML |
                        SECONDARY_EXEC_TSC_SCALING |
 -                      SECONDARY_EXEC_ENABLE_VMFUNC;
 +                      SECONDARY_EXEC_ENABLE_VMFUNC |
 +                      SECONDARY_EXEC_ENCLS_EXITING;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
        return 0;
  }
  
 -static struct vmcs *alloc_vmcs_cpu(int cpu)
 +static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
  {
        int node = cpu_to_node(cpu);
        struct page *pages;
  
        /* KVM supports Enlightened VMCS v1 only */
        if (static_branch_unlikely(&enable_evmcs))
 -              vmcs->revision_id = KVM_EVMCS_VERSION;
 +              vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
        else
 -              vmcs->revision_id = vmcs_config.revision_id;
 +              vmcs->hdr.revision_id = vmcs_config.revision_id;
  
 +      if (shadow)
 +              vmcs->hdr.shadow_vmcs = 1;
        return vmcs;
  }
  
@@@ -4754,14 -4600,14 +4754,14 @@@ static void free_loaded_vmcs(struct loa
        WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
  }
  
 -static struct vmcs *alloc_vmcs(void)
 +static struct vmcs *alloc_vmcs(bool shadow)
  {
 -      return alloc_vmcs_cpu(raw_smp_processor_id());
 +      return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
  }
  
  static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
  {
 -      loaded_vmcs->vmcs = alloc_vmcs();
 +      loaded_vmcs->vmcs = alloc_vmcs(false);
        if (!loaded_vmcs->vmcs)
                return -ENOMEM;
  
                        evmcs->hv_enlightenments_control.msr_bitmap = 1;
                }
        }
 +
 +      memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
 +
        return 0;
  
  out_vmcs:
@@@ -4895,7 -4738,7 +4895,7 @@@ static __init int alloc_kvm_area(void
        for_each_possible_cpu(cpu) {
                struct vmcs *vmcs;
  
 -              vmcs = alloc_vmcs_cpu(cpu);
 +              vmcs = alloc_vmcs_cpu(false, cpu);
                if (!vmcs) {
                        free_kvm_area();
                        return -ENOMEM;
                 * physical CPU.
                 */
                if (static_branch_unlikely(&enable_evmcs))
 -                      vmcs->revision_id = vmcs_config.revision_id;
 +                      vmcs->hdr.revision_id = vmcs_config.revision_id;
  
                per_cpu(vmxarea, cpu) = vmcs;
        }
@@@ -5069,18 -4912,10 +5069,18 @@@ static void vmx_set_efer(struct kvm_vcp
                return;
  
        /*
 -       * Force kernel_gs_base reloading before EFER changes, as control
 -       * of this msr depends on is_long_mode().
 +       * MSR_KERNEL_GS_BASE is not intercepted when the guest is in
 +       * 64-bit mode as a 64-bit kernel may frequently access the
 +       * MSR.  This means we need to manually save/restore the MSR
 +       * when switching between guest and host state, but only if
 +       * the guest is in 64-bit mode.  Sync our cached value if the
 +       * guest is transitioning to 32-bit mode and the CPU contains
 +       * guest state, i.e. the cache is stale.
         */
 -      vmx_load_host_state(to_vmx(vcpu));
 +#ifdef CONFIG_X86_64
 +      if (!(efer & EFER_LMA))
 +              (void)vmx_read_guest_kernel_gs_base(vmx);
 +#endif
        vcpu->arch.efer = efer;
        if (efer & EFER_LMA) {
                vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
@@@ -5137,20 -4972,6 +5137,20 @@@ static void vmx_flush_tlb(struct kvm_vc
        __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
  }
  
 +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
 +{
 +      int vpid = to_vmx(vcpu)->vpid;
 +
 +      if (!vpid_sync_vcpu_addr(vpid, addr))
 +              vpid_sync_context(vpid);
 +
 +      /*
 +       * If VPIDs are not supported or enabled, then the above is a no-op.
 +       * But we don't really need a TLB flush in that case anyway, because
 +       * each VM entry/exit includes an implicit flush when VPID is 0.
 +       */
 +}
 +
  static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
  {
        ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@@ -5332,7 -5153,6 +5332,7 @@@ static u64 construct_eptp(struct kvm_vc
  
  static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
  {
 +      struct kvm *kvm = vcpu->kvm;
        unsigned long guest_cr3;
        u64 eptp;
  
        if (enable_ept) {
                eptp = construct_eptp(vcpu, cr3);
                vmcs_write64(EPT_POINTER, eptp);
 +
 +              if (kvm_x86_ops->tlb_remote_flush) {
 +                      spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
 +                      to_vmx(vcpu)->ept_pointer = eptp;
 +                      to_kvm_vmx(kvm)->ept_pointers_match
 +                              = EPT_POINTERS_CHECK;
 +                      spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
 +              }
 +
                if (enable_unrestricted_guest || is_paging(vcpu) ||
                    is_guest_mode(vcpu))
                        guest_cr3 = kvm_read_cr3(vcpu);
                else
 -                      guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr;
 +                      guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
                ept_load_pdptrs(vcpu);
        }
  
 -      vmx_flush_tlb(vcpu, true);
        vmcs_writel(GUEST_CR3, guest_cr3);
  }
  
@@@ -6292,19 -6104,19 +6292,19 @@@ static void vmx_set_constant_host_state
         */
        cr3 = __read_cr3();
        vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
 -      vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
 +      vmx->loaded_vmcs->host_state.cr3 = cr3;
  
        /* Save the most likely value for this task's CR4 in the VMCS. */
        cr4 = cr4_read_shadow();
        vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
 -      vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
 +      vmx->loaded_vmcs->host_state.cr4 = cr4;
  
        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
  #ifdef CONFIG_X86_64
        /*
         * Load null selectors, so we can avoid reloading them in
 -       * __vmx_load_host_state(), in case userspace uses the null selectors
 -       * too (the expected case).
 +       * vmx_prepare_switch_to_host(), in case userspace uses
 +       * the null selectors too (the expected case).
         */
        vmcs_write16(HOST_DS_SELECTOR, 0);
        vmcs_write16(HOST_ES_SELECTOR, 0);
@@@ -6429,6 -6241,8 +6429,6 @@@ static void vmx_compute_secondary_exec_
        if (!enable_ept) {
                exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
                enable_unrestricted_guest = 0;
 -              /* Enable INVPCID for non-ept guests may cause performance regression. */
 -              exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
        }
        if (!enable_unrestricted_guest)
                exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@@ -6557,6 -6371,9 +6557,6 @@@ static void ept_set_mmio_spte_mask(void
   */
  static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
  {
 -#ifdef CONFIG_X86_64
 -      unsigned long a;
 -#endif
        int i;
  
        if (enable_shadow_vmcs) {
        vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
        vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
        vmx_set_constant_host_state(vmx);
 -#ifdef CONFIG_X86_64
 -      rdmsrl(MSR_FS_BASE, a);
 -      vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
 -      rdmsrl(MSR_GS_BASE, a);
 -      vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
 -#else
        vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
        vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
 -#endif
  
        if (cpu_has_vmx_vmfunc())
                vmcs_write64(VM_FUNCTION_CONTROL, 0);
                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }
 +
 +      if (cpu_has_vmx_encls_vmexit())
 +              vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
  }
  
  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@@ -7849,7 -7670,6 +7849,7 @@@ static void vmx_enable_tdp(void
  
  static __init int hardware_setup(void)
  {
 +      unsigned long host_bndcfgs;
        int r = -ENOMEM, i;
  
        rdmsrl_safe(MSR_EFER, &host_efer);
        if (boot_cpu_has(X86_FEATURE_NX))
                kvm_enable_efer_bits(EFER_NX);
  
 +      if (boot_cpu_has(X86_FEATURE_MPX)) {
 +              rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
 +              WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
 +      }
 +
        if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
                !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
                enable_vpid = 0;
        if (enable_ept && !cpu_has_vmx_ept_2m_page())
                kvm_disable_largepages();
  
 +#if IS_ENABLED(CONFIG_HYPERV)
 +      if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
 +          && enable_ept)
 +              kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb;
 +#endif
 +
        if (!cpu_has_vmx_ple()) {
                ple_gap = 0;
                ple_window = 0;
        else
                kvm_disable_tdp();
  
 +      if (!nested) {
 +              kvm_x86_ops->get_nested_state = NULL;
 +              kvm_x86_ops->set_nested_state = NULL;
 +      }
 +
        /*
         * Only enable PML when hardware supports PML feature, and both EPT
         * and EPT A/D bit features are enabled -- PML depends on them to work.
@@@ -8228,35 -8032,10 +8228,35 @@@ static int nested_vmx_get_vmptr(struct 
        return 0;
  }
  
 +/*
 + * Allocate a shadow VMCS and associate it with the currently loaded
 + * VMCS, unless such a shadow VMCS already exists. The newly allocated
 + * VMCS is also VMCLEARed, so that it is ready for use.
 + */
 +static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
 +{
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +      struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
 +
 +      /*
 +       * We should allocate a shadow vmcs for vmcs01 only when L1
 +       * executes VMXON and free it when L1 executes VMXOFF.
 +       * As it is invalid to execute VMXON twice, we shouldn't reach
 +       * here when vmcs01 already have an allocated shadow vmcs.
 +       */
 +      WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
 +
 +      if (!loaded_vmcs->shadow_vmcs) {
 +              loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
 +              if (loaded_vmcs->shadow_vmcs)
 +                      vmcs_clear(loaded_vmcs->shadow_vmcs);
 +      }
 +      return loaded_vmcs->shadow_vmcs;
 +}
 +
  static int enter_vmx_operation(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 -      struct vmcs *shadow_vmcs;
        int r;
  
        r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
        if (!vmx->nested.cached_vmcs12)
                goto out_cached_vmcs12;
  
 -      if (enable_shadow_vmcs) {
 -              shadow_vmcs = alloc_vmcs();
 -              if (!shadow_vmcs)
 -                      goto out_shadow_vmcs;
 -              /* mark vmcs as shadow */
 -              shadow_vmcs->revision_id |= (1u << 31);
 -              /* init shadow vmcs */
 -              vmcs_clear(shadow_vmcs);
 -              vmx->vmcs01.shadow_vmcs = shadow_vmcs;
 -      }
 +      vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
 +      if (!vmx->nested.cached_shadow_vmcs12)
 +              goto out_cached_shadow_vmcs12;
 +
 +      if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
 +              goto out_shadow_vmcs;
  
        hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
                     HRTIMER_MODE_REL_PINNED);
        return 0;
  
  out_shadow_vmcs:
 +      kfree(vmx->nested.cached_shadow_vmcs12);
 +
 +out_cached_shadow_vmcs12:
        kfree(vmx->nested.cached_vmcs12);
  
  out_cached_vmcs12:
@@@ -8329,7 -8109,7 +8329,7 @@@ static int handle_vmon(struct kvm_vcpu 
  
        /* CPL=0 must be checked manually. */
        if (vmx_get_cpl(vcpu)) {
 -              kvm_queue_exception(vcpu, UD_VECTOR);
 +              kvm_inject_gp(vcpu, 0);
                return 1;
        }
  
   */
  static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
  {
 -      if (vmx_get_cpl(vcpu)) {
 +      if (!to_vmx(vcpu)->nested.vmxon) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 0;
        }
  
 -      if (!to_vmx(vcpu)->nested.vmxon) {
 -              kvm_queue_exception(vcpu, UD_VECTOR);
 +      if (vmx_get_cpl(vcpu)) {
 +              kvm_inject_gp(vcpu, 0);
                return 0;
        }
 +
        return 1;
  }
  
@@@ -8454,7 -8233,6 +8454,7 @@@ static void free_nested(struct vcpu_vm
                vmx->vmcs01.shadow_vmcs = NULL;
        }
        kfree(vmx->nested.cached_vmcs12);
 +      kfree(vmx->nested.cached_shadow_vmcs12);
        /* Unpin physical memory we referred to in the vmcs02 */
        if (vmx->nested.apic_access_page) {
                kvm_release_page_dirty(vmx->nested.apic_access_page);
@@@ -8540,7 -8318,7 +8540,7 @@@ static int handle_vmresume(struct kvm_v
   * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
   * 64-bit fields are to be returned).
   */
 -static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
 +static inline int vmcs12_read_any(struct vmcs12 *vmcs12,
                                  unsigned long field, u64 *ret)
  {
        short offset = vmcs_field_to_offset(field);
        if (offset < 0)
                return offset;
  
 -      p = ((char *)(get_vmcs12(vcpu))) + offset;
 +      p = (char *)vmcs12 + offset;
  
        switch (vmcs_field_width(field)) {
        case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
  }
  
  
 -static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
 +static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
                                   unsigned long field, u64 field_value){
        short offset = vmcs_field_to_offset(field);
 -      char *p = ((char *) get_vmcs12(vcpu)) + offset;
 +      char *p = (char *)vmcs12 + offset;
        if (offset < 0)
                return offset;
  
@@@ -8627,7 -8405,7 +8627,7 @@@ static void copy_shadow_to_vmcs12(struc
                for (i = 0; i < max_fields[q]; i++) {
                        field = fields[q][i];
                        field_value = __vmcs_readl(field);
 -                      vmcs12_write_any(&vmx->vcpu, field, field_value);
 +                      vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
                }
                /*
                 * Skip the VM-exit information fields if they are read-only.
@@@ -8662,7 -8440,7 +8662,7 @@@ static void copy_vmcs12_to_shadow(struc
        for (q = 0; q < ARRAY_SIZE(fields); q++) {
                for (i = 0; i < max_fields[q]; i++) {
                        field = fields[q][i];
 -                      vmcs12_read_any(&vmx->vcpu, field, &field_value);
 +                      vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
                        __vmcs_writel(field, field_value);
                }
        }
@@@ -8692,7 -8470,6 +8692,7 @@@ static int handle_vmread(struct kvm_vcp
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
        gva_t gva = 0;
 +      struct vmcs12 *vmcs12;
  
        if (!nested_vmx_check_permission(vcpu))
                return 1;
        if (!nested_vmx_check_vmcs12(vcpu))
                return kvm_skip_emulated_instruction(vcpu);
  
 +      if (!is_guest_mode(vcpu))
 +              vmcs12 = get_vmcs12(vcpu);
 +      else {
 +              /*
 +               * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
 +               * to shadowed-field sets the ALU flags for VMfailInvalid.
 +               */
 +              if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
 +                      nested_vmx_failInvalid(vcpu);
 +                      return kvm_skip_emulated_instruction(vcpu);
 +              }
 +              vmcs12 = get_shadow_vmcs12(vcpu);
 +      }
 +
        /* Decode instruction info and find the field to read */
        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
        /* Read the field, zero-extended to a u64 field_value */
 -      if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
 +      if (vmcs12_read_any(vmcs12, field, &field_value) < 0) {
                nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
                return kvm_skip_emulated_instruction(vcpu);
        }
@@@ -8759,7 -8522,6 +8759,7 @@@ static int handle_vmwrite(struct kvm_vc
         */
        u64 field_value = 0;
        struct x86_exception e;
 +      struct vmcs12 *vmcs12;
  
        if (!nested_vmx_check_permission(vcpu))
                return 1;
                return kvm_skip_emulated_instruction(vcpu);
        }
  
 -      if (vmcs12_write_any(vcpu, field, field_value) < 0) {
 +      if (!is_guest_mode(vcpu))
 +              vmcs12 = get_vmcs12(vcpu);
 +      else {
 +              /*
 +               * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
 +               * to shadowed-field sets the ALU flags for VMfailInvalid.
 +               */
 +              if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
 +                      nested_vmx_failInvalid(vcpu);
 +                      return kvm_skip_emulated_instruction(vcpu);
 +              }
 +              vmcs12 = get_shadow_vmcs12(vcpu);
 +
 +      }
 +
 +      if (vmcs12_write_any(vmcs12, field, field_value) < 0) {
                nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
                return kvm_skip_emulated_instruction(vcpu);
        }
  
 -      switch (field) {
 +      /*
 +       * Do not track vmcs12 dirty-state if in guest-mode
 +       * as we actually dirty shadow vmcs12 instead of vmcs12.
 +       */
 +      if (!is_guest_mode(vcpu)) {
 +              switch (field) {
  #define SHADOW_FIELD_RW(x) case x:
  #include "vmx_shadow_fields.h"
 -              /*
 -               * The fields that can be updated by L1 without a vmexit are
 -               * always updated in the vmcs02, the others go down the slow
 -               * path of prepare_vmcs02.
 -               */
 -              break;
 -      default:
 -              vmx->nested.dirty_vmcs12 = true;
 -              break;
 +                      /*
 +                       * The fields that can be updated by L1 without a vmexit are
 +                       * always updated in the vmcs02, the others go down the slow
 +                       * path of prepare_vmcs02.
 +                       */
 +                      break;
 +              default:
 +                      vmx->nested.dirty_vmcs12 = true;
 +                      break;
 +              }
        }
  
        nested_vmx_succeed(vcpu);
@@@ -8882,9 -8623,7 +8882,9 @@@ static int handle_vmptrld(struct kvm_vc
                        return kvm_skip_emulated_instruction(vcpu);
                }
                new_vmcs12 = kmap(page);
 -              if (new_vmcs12->revision_id != VMCS12_REVISION) {
 +              if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
 +                  (new_vmcs12->hdr.shadow_vmcs &&
 +                   !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
                        kunmap(page);
                        kvm_release_page_clean(page);
                        nested_vmx_failValid(vcpu,
@@@ -9082,105 -8821,6 +9082,105 @@@ static int handle_invvpid(struct kvm_vc
        return kvm_skip_emulated_instruction(vcpu);
  }
  
 +static int handle_invpcid(struct kvm_vcpu *vcpu)
 +{
 +      u32 vmx_instruction_info;
 +      unsigned long type;
 +      bool pcid_enabled;
 +      gva_t gva;
 +      struct x86_exception e;
 +      unsigned i;
 +      unsigned long roots_to_free = 0;
 +      struct {
 +              u64 pcid;
 +              u64 gla;
 +      } operand;
 +
 +      if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
 +              kvm_queue_exception(vcpu, UD_VECTOR);
 +              return 1;
 +      }
 +
 +      vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 +      type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
 +
 +      if (type > 3) {
 +              kvm_inject_gp(vcpu, 0);
 +              return 1;
 +      }
 +
 +      /* According to the Intel instruction reference, the memory operand
 +       * is read even if it isn't needed (e.g., for type==all)
 +       */
 +      if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
 +                              vmx_instruction_info, false, &gva))
 +              return 1;
 +
 +      if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
 +              kvm_inject_page_fault(vcpu, &e);
 +              return 1;
 +      }
 +
 +      if (operand.pcid >> 12 != 0) {
 +              kvm_inject_gp(vcpu, 0);
 +              return 1;
 +      }
 +
 +      pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
 +
 +      switch (type) {
 +      case INVPCID_TYPE_INDIV_ADDR:
 +              if ((!pcid_enabled && (operand.pcid != 0)) ||
 +                  is_noncanonical_address(operand.gla, vcpu)) {
 +                      kvm_inject_gp(vcpu, 0);
 +                      return 1;
 +              }
 +              kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
 +              return kvm_skip_emulated_instruction(vcpu);
 +
 +      case INVPCID_TYPE_SINGLE_CTXT:
 +              if (!pcid_enabled && (operand.pcid != 0)) {
 +                      kvm_inject_gp(vcpu, 0);
 +                      return 1;
 +              }
 +
 +              if (kvm_get_active_pcid(vcpu) == operand.pcid) {
 +                      kvm_mmu_sync_roots(vcpu);
 +                      kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 +              }
 +
 +              for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 +                      if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
 +                          == operand.pcid)
 +                              roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 +
 +              kvm_mmu_free_roots(vcpu, roots_to_free);
 +              /*
 +               * If neither the current cr3 nor any of the prev_roots use the
 +               * given PCID, then nothing needs to be done here because a
 +               * resync will happen anyway before switching to any other CR3.
 +               */
 +
 +              return kvm_skip_emulated_instruction(vcpu);
 +
 +      case INVPCID_TYPE_ALL_NON_GLOBAL:
 +              /*
 +               * Currently, KVM doesn't mark global entries in the shadow
 +               * page tables, so a non-global flush just degenerates to a
 +               * global flush. If needed, we could optimize this later by
 +               * keeping track of global entries in shadow page tables.
 +               */
 +
 +              /* fall-through */
 +      case INVPCID_TYPE_ALL_INCL_GLOBAL:
 +              kvm_mmu_unload(vcpu);
 +              return kvm_skip_emulated_instruction(vcpu);
 +
 +      default:
 +              BUG(); /* We have already checked above that type <= 3 */
 +      }
 +}
 +
  static int handle_pml_full(struct kvm_vcpu *vcpu)
  {
        unsigned long exit_qualification;
@@@ -9330,17 -8970,6 +9330,17 @@@ fail
        return 1;
  }
  
 +static int handle_encls(struct kvm_vcpu *vcpu)
 +{
 +      /*
 +       * SGX virtualization is not yet supported.  There is no software
 +       * enable bit for SGX, so we have to trap ENCLS and inject a #UD
 +       * to prevent the guest from executing ENCLS.
 +       */
 +      kvm_queue_exception(vcpu, UD_VECTOR);
 +      return 1;
 +}
 +
  /*
   * The exit handlers return 1 if the exit was handled fully and guest execution
   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@@ -9395,10 -9024,8 +9395,10 @@@ static int (*const kvm_vmx_exit_handler
        [EXIT_REASON_XSAVES]                  = handle_xsaves,
        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
 +      [EXIT_REASON_INVPCID]                 = handle_invpcid,
        [EXIT_REASON_VMFUNC]                  = handle_vmfunc,
        [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
 +      [EXIT_REASON_ENCLS]                   = handle_encls,
  };
  
  static const int kvm_vmx_max_exit_handlers =
@@@ -9569,30 -9196,6 +9569,30 @@@ static bool nested_vmx_exit_handled_cr(
        return false;
  }
  
 +static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
 +      struct vmcs12 *vmcs12, gpa_t bitmap)
 +{
 +      u32 vmx_instruction_info;
 +      unsigned long field;
 +      u8 b;
 +
 +      if (!nested_cpu_has_shadow_vmcs(vmcs12))
 +              return true;
 +
 +      /* Decode instruction info and find the field to access */
 +      vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 +      field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
 +
 +      /* Out-of-range fields always cause a VM exit from L2 to L1 */
 +      if (field >> 15)
 +              return true;
 +
 +      if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
 +              return true;
 +
 +      return 1 & (b >> (field & 7));
 +}
 +
  /*
   * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
   * should handle it ourselves in L0 (and then continue L2). Only call this
@@@ -9677,15 -9280,10 +9677,15 @@@ static bool nested_vmx_exit_reflected(s
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
        case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
                return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
 +      case EXIT_REASON_VMREAD:
 +              return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
 +                      vmcs12->vmread_bitmap);
 +      case EXIT_REASON_VMWRITE:
 +              return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
 +                      vmcs12->vmwrite_bitmap);
        case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
        case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
 -      case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
 -      case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
 +      case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
        case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
        case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
                /*
        case EXIT_REASON_VMFUNC:
                /* VM functions are emulated through L2->L0 vmexits. */
                return false;
 +      case EXIT_REASON_ENCLS:
 +              /* SGX is never exposed to L1 */
 +              return false;
        default:
                return true;
        }
@@@ -10131,9 -9726,6 +10131,6 @@@ static int vmx_handle_exit(struct kvm_v
   * information but as all relevant affected CPUs have 32KiB L1D cache size
   * there is no point in doing so.
   */
- #define L1D_CACHE_ORDER 4
- static void *vmx_l1d_flush_pages;
  static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
  {
        int size = PAGE_SIZE << L1D_CACHE_ORDER;
@@@ -10649,15 -10241,15 +10646,15 @@@ static void __noclone vmx_vcpu_run(stru
                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
  
        cr3 = __get_current_cr3_fast();
 -      if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
 +      if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
                vmcs_writel(HOST_CR3, cr3);
 -              vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
 +              vmx->loaded_vmcs->host_state.cr3 = cr3;
        }
  
        cr4 = cr4_read_shadow();
 -      if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
 +      if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
                vmcs_writel(HOST_CR4, cr4);
 -              vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
 +              vmx->loaded_vmcs->host_state.cr4 = cr4;
        }
  
        /* When single-stepping over STI and MOV SS, we must clear the
         * The sysexit path does not restore ds/es, so we must set them to
         * a reasonable value ourselves.
         *
 -       * We can't defer this to vmx_load_host_state() since that function
 -       * may be executed in interrupt context, which saves and restore segments
 -       * around it, nullifying its effect.
 +       * We can't defer this to vmx_prepare_switch_to_host() since that
 +       * function may be executed in interrupt context, which saves and
 +       * restores segments around it, nullifying its effect.
         */
        loadsegment(ds, __USER_DS);
        loadsegment(es, __USER_DS);
@@@ -10916,8 -10508,8 +10913,8 @@@ static void vmx_switch_vmcs(struct kvm_
                return;
  
        cpu = get_cpu();
 -      vmx->loaded_vmcs = vmcs;
        vmx_vcpu_put(vcpu);
 +      vmx->loaded_vmcs = vmcs;
        vmx_vcpu_load(vcpu, cpu);
        put_cpu();
  }
@@@ -11057,8 -10649,6 +11054,8 @@@ free_vcpu
  
  static int vmx_vm_init(struct kvm *kvm)
  {
 +      spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
 +
        if (!ple_gap)
                kvm->arch.pause_in_guest = true;
  
@@@ -11283,11 -10873,11 +11280,11 @@@ static int nested_ept_init_mmu_context(
        if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
                return 1;
  
 -      kvm_mmu_unload(vcpu);
        kvm_init_shadow_ept_mmu(vcpu,
                        to_vmx(vcpu)->nested.msrs.ept_caps &
                        VMX_EPT_EXECUTE_ONLY_BIT,
 -                      nested_ept_ad_enabled(vcpu));
 +                      nested_ept_ad_enabled(vcpu),
 +                      nested_ept_get_cr3(vcpu));
        vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
        vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
        vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@@ -11335,9 -10925,9 +11332,9 @@@ static void vmx_inject_page_fault_neste
  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
                                                 struct vmcs12 *vmcs12);
  
 -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
 -                                      struct vmcs12 *vmcs12)
 +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
  {
 +      struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct page *page;
        u64 hpa;
@@@ -11578,38 -11168,6 +11575,38 @@@ static inline bool nested_vmx_prepare_m
        return true;
  }
  
 +static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
 +                                     struct vmcs12 *vmcs12)
 +{
 +      struct vmcs12 *shadow;
 +      struct page *page;
 +
 +      if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
 +          vmcs12->vmcs_link_pointer == -1ull)
 +              return;
 +
 +      shadow = get_shadow_vmcs12(vcpu);
 +      page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
 +
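 +      /* Copy the vmcs12 referenced by the link pointer into the local cache. */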
 +      memcpy(shadow, kmap(page), VMCS12_SIZE);
 +
 +      kunmap(page);
 +      kvm_release_page_clean(page);
 +}
 +
 +static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
 +                                            struct vmcs12 *vmcs12)
 +{
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +
 +      if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
 +          vmcs12->vmcs_link_pointer == -1ull)
 +              return;
 +
 +      kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
 +                      get_shadow_vmcs12(vcpu), VMCS12_SIZE);
 +}
 +
  static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
                                          struct vmcs12 *vmcs12)
  {
@@@ -11667,12 -11225,11 +11664,12 @@@ static int nested_vmx_check_msr_switch(
                                       unsigned long count_field,
                                       unsigned long addr_field)
  {
 +      struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        int maxphyaddr;
        u64 count, addr;
  
 -      if (vmcs12_read_any(vcpu, count_field, &count) ||
 -          vmcs12_read_any(vcpu, addr_field, &addr)) {
 +      if (vmcs12_read_any(vmcs12, count_field, &count) ||
 +          vmcs12_read_any(vmcs12, addr_field, &addr)) {
                WARN_ON(1);
                return -EINVAL;
        }
@@@ -11722,19 -11279,6 +11719,19 @@@ static int nested_vmx_check_pml_control
        return 0;
  }
  
 +static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
 +                                               struct vmcs12 *vmcs12)
 +{
 +      if (!nested_cpu_has_shadow_vmcs(vmcs12))
 +              return 0;
 +
 +      if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
 +          !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
 +              return -EINVAL;
 +
 +      return 0;
 +}
 +
  static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
                                       struct vmx_msr_entry *e)
  {
@@@ -11884,16 -11428,12 +11881,16 @@@ static int nested_vmx_load_cr3(struct k
                                return 1;
                        }
                }
 -
 -              vcpu->arch.cr3 = cr3;
 -              __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
        }
  
 -      kvm_mmu_reset_context(vcpu);
 +      if (!nested_ept)
 +              kvm_mmu_new_cr3(vcpu, cr3, false);
 +
 +      vcpu->arch.cr3 = cr3;
 +      __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 +
 +      kvm_init_mmu(vcpu, false);
 +
        return 0;
  }
  
@@@ -11980,8 -11520,7 +11977,8 @@@ static void prepare_vmcs02_full(struct 
         * Set host-state according to L0's settings (vmcs12 is irrelevant here)
         * Some constant fields are set here by vmx_set_constant_host_state().
         * Other fields are different per CPU, and will be set later when
 -       * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
 +       * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest()
 +       * is called.
         */
        vmx_set_constant_host_state(vmx);
  
@@@ -12053,6 -11592,11 +12050,6 @@@ static int prepare_vmcs02(struct kvm_vc
        vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
        vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
  
 -      /*
 -       * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR,
 -       * HOST_FS_BASE, HOST_GS_BASE.
 -       */
 -
        if (vmx->nested.nested_run_pending &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
                kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
                        exec_control |= vmcs12_exec_ctrl;
                }
  
 +              /* VMCS shadowing for L2 is emulated for now */
 +              exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
 +
                if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
                        vmcs_write16(GUEST_INTR_STATUS,
                                vmcs12->guest_intr_status);
                if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
                        vmcs_write64(APIC_ACCESS_ADDR, -1ull);
  
 +              if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
 +                      vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
 +
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
        }
  
@@@ -12342,9 -11880,6 +12339,9 @@@ static int check_vmentry_prereqs(struc
        if (nested_vmx_check_pml_controls(vcpu, vmcs12))
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
  
 +      if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12))
 +              return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 +
        if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
                                vmx->nested.msrs.procbased_ctls_low,
                                vmx->nested.msrs.procbased_ctls_high) ||
        return 0;
  }
  
 +static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
 +                                        struct vmcs12 *vmcs12)
 +{
 +      int r;
 +      struct page *page;
 +      struct vmcs12 *shadow;
 +
 +      if (vmcs12->vmcs_link_pointer == -1ull)
 +              return 0;
 +
 +      if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
 +              return -EINVAL;
 +
 +      page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
 +      if (is_error_page(page))
 +              return -EINVAL;
 +
 +      r = 0;
 +      shadow = kmap(page);
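 +      /* The linked VMCS must have the right revision and shadow-VMCS setting. */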
 +      if (shadow->hdr.revision_id != VMCS12_REVISION ||
 +          shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
 +              r = -EINVAL;
 +      kunmap(page);
 +      kvm_release_page_clean(page);
 +      return r;
 +}
 +
  static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                                  u32 *exit_qual)
  {
            !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
                return 1;
  
 -      if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) &&
 -          vmcs12->vmcs_link_pointer != -1ull) {
 +      if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
                *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
                return 1;
        }
        return 0;
  }
  
 -static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
 +/*
 + * If exit_qual is NULL, this is being called from state restore (either RSM
 + * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
 + */
 +static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 -      u32 exit_qual;
 -      int r;
 +      bool from_vmentry = !!exit_qual;
 +      u32 dummy_exit_qual;
 +      int r = 0;
  
        enter_guest_mode(vcpu);
  
                vcpu->arch.tsc_offset += vmcs12->tsc_offset;
  
        r = EXIT_REASON_INVALID_STATE;
 -      if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
 +      if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual))
                goto fail;
  
 -      nested_get_vmcs12_pages(vcpu, vmcs12);
 +      if (from_vmentry) {
 +              nested_get_vmcs12_pages(vcpu);
  
 -      r = EXIT_REASON_MSR_LOAD_FAIL;
 -      exit_qual = nested_vmx_load_msr(vcpu,
 -                                      vmcs12->vm_entry_msr_load_addr,
 -                                      vmcs12->vm_entry_msr_load_count);
 -      if (exit_qual)
 -              goto fail;
 +              r = EXIT_REASON_MSR_LOAD_FAIL;
 +              *exit_qual = nested_vmx_load_msr(vcpu,
 +                                               vmcs12->vm_entry_msr_load_addr,
 +                                               vmcs12->vm_entry_msr_load_count);
 +              if (*exit_qual)
 +                      goto fail;
 +      } else {
 +              /*
 +               * The MMU is not initialized to point at the right entities yet and
 +               * "get pages" would need to read data from the guest (i.e. we will
 +               * need to perform gpa to hpa translation). Request a call
 +               * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
 +               * have already been set at vmentry time and should not be reset.
 +               */
 +              kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
 +      }
  
        /*
         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@@ -12590,7 -12083,8 +12587,7 @@@ fail
                vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
        leave_guest_mode(vcpu);
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 -      nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual);
 -      return 1;
 +      return r;
  }
  
  /*
@@@ -12613,17 -12107,6 +12610,17 @@@ static int nested_vmx_run(struct kvm_vc
  
        vmcs12 = get_vmcs12(vcpu);
  
 +      /*
 +       * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
 +       * that there *is* a valid VMCS pointer, RFLAGS.CF is set
 +       * rather than RFLAGS.ZF, and no error number is stored to the
 +       * VM-instruction error field.
 +       */
 +      if (vmcs12->hdr.shadow_vmcs) {
 +              nested_vmx_failInvalid(vcpu);
 +              goto out;
 +      }
 +
        if (enable_shadow_vmcs)
                copy_shadow_to_vmcs12(vmx);
  
         */
  
        vmx->nested.nested_run_pending = 1;
 -      ret = enter_vmx_non_root_mode(vcpu);
 +      ret = enter_vmx_non_root_mode(vcpu, &exit_qual);
        if (ret) {
 +              nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual);
                vmx->nested.nested_run_pending = 0;
 -              return ret;
 +              return 1;
        }
  
        /* Hide L1D cache contents from the nested guest.  */
        vmx->vcpu.arch.l1tf_flush_l1d = true;
  
 +      /*
 +       * Must happen outside of enter_vmx_non_root_mode() as it will
 +       * also be used as part of restoring nVMX state for
 +       * snapshot restore (migration).
 +       *
 +       * In this flow, it is assumed that the vmcs12 cache was
 +       * transferred as part of the captured nVMX state and should
 +       * therefore not be read from guest memory (which may not
 +       * exist on destination host yet).
 +       */
 +      nested_cache_shadow_vmcs12(vcpu, vmcs12);
 +
        /*
         * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
         * by event injection, halt vcpu.
@@@ -13209,17 -12679,6 +13206,17 @@@ static void nested_vmx_vmexit(struct kv
                        prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
                                       exit_qualification);
  
 +              /*
 +               * Must happen outside of sync_vmcs12() as it will
 +               * also be used to capture the vmcs12 cache as part of
 +               * capturing nVMX state for snapshot (migration).
 +               *
 +               * Otherwise, this flush will dirty guest memory at a
 +               * point where it is already assumed by user-space to be
 +               * immutable.
 +               */
 +              nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
 +
                if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
                                         vmcs12->vm_exit_msr_store_count))
                        nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
@@@ -13794,7 -13253,7 +13791,7 @@@ static int vmx_pre_leave_smm(struct kvm
  
        if (vmx->nested.smm.guest_mode) {
                vcpu->arch.hflags &= ~HF_SMM_MASK;
 -              ret = enter_vmx_non_root_mode(vcpu);
 +              ret = enter_vmx_non_root_mode(vcpu, NULL);
                vcpu->arch.hflags |= HF_SMM_MASK;
                if (ret)
                        return ret;
@@@ -13809,199 -13268,6 +13806,199 @@@ static int enable_smi_window(struct kvm
        return 0;
  }
  
 +static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 +                              struct kvm_nested_state __user *user_kvm_nested_state,
 +                              u32 user_data_size)
 +{
 +      struct vcpu_vmx *vmx;
 +      struct vmcs12 *vmcs12;
 +      struct kvm_nested_state kvm_state = {
 +              .flags = 0,
 +              .format = 0,
 +              .size = sizeof(kvm_state),
 +              .vmx.vmxon_pa = -1ull,
 +              .vmx.vmcs_pa = -1ull,
 +      };
 +
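 +      /* A NULL vcpu means the caller only wants the maximum state size. */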
 +      if (!vcpu)
 +              return kvm_state.size + 2 * VMCS12_SIZE;
 +
 +      vmx = to_vmx(vcpu);
 +      vmcs12 = get_vmcs12(vcpu);
 +      if (nested_vmx_allowed(vcpu) &&
 +          (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
 +              kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
 +              kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
 +
 +              if (vmx->nested.current_vmptr != -1ull) {
 +                      kvm_state.size += VMCS12_SIZE;
 +
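 +                      /* Room for the cached shadow vmcs12, copied out below. */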
 +                      if (is_guest_mode(vcpu) &&
 +                          nested_cpu_has_shadow_vmcs(vmcs12) &&
 +                          vmcs12->vmcs_link_pointer != -1ull)
 +                              kvm_state.size += VMCS12_SIZE;
 +              }
 +
 +              if (vmx->nested.smm.vmxon)
 +                      kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
 +
 +              if (vmx->nested.smm.guest_mode)
 +                      kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
 +
 +              if (is_guest_mode(vcpu)) {
 +                      kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
 +
 +                      if (vmx->nested.nested_run_pending)
 +                              kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
 +              }
 +      }
 +
 +      if (user_data_size < kvm_state.size)
 +              goto out;
 +
 +      if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
 +              return -EFAULT;
 +
 +      if (vmx->nested.current_vmptr == -1ull)
 +              goto out;
 +
 +      /*
 +       * When running L2, the authoritative vmcs12 state is in the
 +       * vmcs02. When running L1, the authoritative vmcs12 state is
 +       * in the shadow vmcs linked to vmcs01, unless
 +       * sync_shadow_vmcs is set, in which case the authoritative
 +       * vmcs12 state is in the vmcs12 already.
 +       */
 +      if (is_guest_mode(vcpu))
 +              sync_vmcs12(vcpu, vmcs12);
 +      else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
 +              copy_shadow_to_vmcs12(vmx);
 +
 +      if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
 +              return -EFAULT;
 +
 +      if (nested_cpu_has_shadow_vmcs(vmcs12) &&
 +          vmcs12->vmcs_link_pointer != -1ull) {
 +              if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
 +                               get_shadow_vmcs12(vcpu), sizeof(*vmcs12)))
 +                      return -EFAULT;
 +      }
 +
 +out:
 +      return kvm_state.size;
 +}
 +
 +static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 +                              struct kvm_nested_state __user *user_kvm_nested_state,
 +                              struct kvm_nested_state *kvm_state)
 +{
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +      struct vmcs12 *vmcs12;
 +      u32 exit_qual;
 +      int ret;
 +
 +      if (kvm_state->format != 0)
 +              return -EINVAL;
 +
 +      if (!nested_vmx_allowed(vcpu))
 +              return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
 +
 +      if (kvm_state->vmx.vmxon_pa == -1ull) {
 +              if (kvm_state->vmx.smm.flags)
 +                      return -EINVAL;
 +
 +              if (kvm_state->vmx.vmcs_pa != -1ull)
 +                      return -EINVAL;
 +
 +              vmx_leave_nested(vcpu);
 +              return 0;
 +      }
 +
 +      if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
 +              return -EINVAL;
 +
 +      if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
 +              return -EINVAL;
 +
 +      if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
 +          !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
 +              return -EINVAL;
 +
 +      if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
 +          (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
 +              return -EINVAL;
 +
 +      if (kvm_state->vmx.smm.flags &
 +          ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
 +              return -EINVAL;
 +
 +      if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
 +          !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
 +              return -EINVAL;
 +
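 +      /* Drop any existing nested state before loading the new one. */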
 +      vmx_leave_nested(vcpu);
 +      if (kvm_state->vmx.vmxon_pa == -1ull)
 +              return 0;
 +
 +      vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
 +      ret = enter_vmx_operation(vcpu);
 +      if (ret)
 +              return ret;
 +
 +      set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
 +
 +      if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
 +              vmx->nested.smm.vmxon = true;
 +              vmx->nested.vmxon = false;
 +
 +              if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
 +                      vmx->nested.smm.guest_mode = true;
 +      }
 +
 +      vmcs12 = get_vmcs12(vcpu);
 +      if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
 +              return -EFAULT;
 +
 +      if (vmcs12->hdr.revision_id != VMCS12_REVISION)
 +              return -EINVAL;
 +
 +      if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
 +              return 0;
 +
 +      vmx->nested.nested_run_pending =
 +              !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
 +
 +      if (nested_cpu_has_shadow_vmcs(vmcs12) &&
 +          vmcs12->vmcs_link_pointer != -1ull) {
 +              struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
 +              if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12))
 +                      return -EINVAL;
 +
 +              if (copy_from_user(shadow_vmcs12,
 +                                 user_kvm_nested_state->data + VMCS12_SIZE,
 +                                 sizeof(*vmcs12)))
 +                      return -EFAULT;
 +
 +              if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
 +                  !shadow_vmcs12->hdr.shadow_vmcs)
 +                      return -EINVAL;
 +      }
 +
 +      if (check_vmentry_prereqs(vcpu, vmcs12) ||
 +          check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
 +              return -EINVAL;
 +
 +      if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
 +              vmx->nested.nested_run_pending = 1;
 +
 +      vmx->nested.dirty_vmcs12 = true;
 +      ret = enter_vmx_non_root_mode(vcpu, NULL);
 +      if (ret)
 +              return -EINVAL;
 +
 +      return 0;
 +}
 +
  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
        .vcpu_free = vmx_free_vcpu,
        .vcpu_reset = vmx_vcpu_reset,
  
 -      .prepare_guest_switch = vmx_save_host_state,
 +      .prepare_guest_switch = vmx_prepare_switch_to_guest,
        .vcpu_load = vmx_vcpu_load,
        .vcpu_put = vmx_vcpu_put,
  
        .set_rflags = vmx_set_rflags,
  
        .tlb_flush = vmx_flush_tlb,
 +      .tlb_flush_gva = vmx_flush_tlb_gva,
  
        .run = vmx_vcpu_run,
        .handle_exit = vmx_handle_exit,
  
        .setup_mce = vmx_setup_mce,
  
 +      .get_nested_state = vmx_get_nested_state,
 +      .set_nested_state = vmx_set_nested_state,
 +      .get_vmcs12_pages = nested_get_vmcs12_pages,
 +
        .smi_allowed = vmx_smi_allowed,
        .pre_enter_smm = vmx_pre_enter_smm,
        .pre_leave_smm = vmx_pre_leave_smm,
diff --combined arch/x86/mm/init.c
@@@ -99,22 -99,15 +99,22 @@@ __ref void *alloc_low_pages(unsigned in
        }
  
        if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
 -              unsigned long ret;
 -              if (min_pfn_mapped >= max_pfn_mapped)
 -                      panic("alloc_low_pages: ran out of memory");
 -              ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
 +              unsigned long ret = 0;
 +
 +              if (min_pfn_mapped < max_pfn_mapped) {
 +                      ret = memblock_find_in_range(
 +                                      min_pfn_mapped << PAGE_SHIFT,
                                        max_pfn_mapped << PAGE_SHIFT,
                                        PAGE_SIZE * num , PAGE_SIZE);
 +              }
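 +              /* Reserve what memblock found, or fall back to the brk area. */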
 +              if (ret)
 +                      memblock_reserve(ret, PAGE_SIZE * num);
 +              else if (can_use_brk_pgt)
 +                      ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
 +
                if (!ret)
                        panic("alloc_low_pages: can not alloc memory");
 -              memblock_reserve(ret, PAGE_SIZE * num);
 +
                pfn = ret >> PAGE_SHIFT;
        } else {
                pfn = pgt_buf_end;
@@@ -930,7 -923,7 +930,7 @@@ unsigned long max_swapfile_size(void
  
        if (boot_cpu_has_bug(X86_BUG_L1TF)) {
                /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
-               unsigned long l1tf_limit = l1tf_pfn_limit() + 1;
+               unsigned long long l1tf_limit = l1tf_pfn_limit();
                /*
                 * We encode swap offsets also with 3 bits below those for pfn
                 * which makes the usable limit higher.
  #if CONFIG_PGTABLE_LEVELS > 2
                l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
  #endif
-               pages = min_t(unsigned long, l1tf_limit, pages);
+               pages = min_t(unsigned long long, l1tf_limit, pages);
        }
        return pages;
  }