Merge tag 'kvmarm-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmar...
author    Paolo Bonzini <pbonzini@redhat.com>
          Tue, 20 Oct 2020 12:14:25 +0000 (08:14 -0400)
committer Paolo Bonzini <pbonzini@redhat.com>
          Tue, 20 Oct 2020 12:14:25 +0000 (08:14 -0400)
KVM/arm64 updates for Linux 5.10

- New page table code for both hypervisor and guest stage-2
- Introduction of a new EL2-private host context
- Allow EL2 to have its own private per-CPU variables
- Support of PMU event filtering
- Complete rework of the Spectre mitigation

73 files changed:
Documentation/virt/kvm/api.rst
arch/arm64/include/asm/kvm_emulate.h
arch/arm64/kvm/hyp/include/hyp/switch.h
arch/arm64/kvm/mmu.c
arch/mips/include/asm/kvm_host.h
arch/mips/kvm/entry.c
arch/mips/kvm/mips.c
arch/mips/kvm/trap_emul.c
arch/mips/kvm/vz.c
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_64_mmu_radix.c
arch/powerpc/kvm/book3s_64_vio.c
arch/powerpc/kvm/book3s_64_vio_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_interrupts.S
arch/powerpc/kvm/book3s_hv_nested.c
arch/powerpc/kvm/book3s_hv_rm_xics.c
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/book3s_xics.c
arch/powerpc/kvm/book3s_xive_native.c
arch/powerpc/kvm/booke.c
arch/x86/boot/compressed/kaslr.c
arch/x86/boot/compressed/misc.h
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/fpu/internal.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/special_insns.h
arch/x86/include/asm/svm.h
arch/x86/include/asm/sync_core.h
arch/x86/include/asm/vmx.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/include/uapi/asm/svm.h
arch/x86/kernel/kvm.c
arch/x86/kernel/umip.c
arch/x86/kvm/Makefile
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/emulate.c
arch/x86/kvm/hyperv.c
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmutrace.h
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/svm/avic.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/trace.h
arch/x86/kvm/vmx/capabilities.h
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/ops.h [deleted file]
arch/x86/kvm/vmx/posted_intr.c [new file with mode: 0644]
arch/x86/kvm/vmx/posted_intr.h [new file with mode: 0644]
arch/x86/kvm/vmx/vmcs.h
arch/x86/kvm/vmx/vmenter.S
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/vmx/vmx_ops.h [new file with mode: 0644]
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
include/trace/events/kvm.h
include/uapi/linux/kvm.h
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/include/x86_64/vmx.h
tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/user_msr_test.c [new file with mode: 0644]
virt/kvm/eventfd.c
virt/kvm/kvm_main.c

index d2b733d..425325f 100644 (file)
@@ -4498,11 +4498,14 @@ Currently, the following list of CPUID leaves are returned:
  - HYPERV_CPUID_ENLIGHTMENT_INFO
  - HYPERV_CPUID_IMPLEMENT_LIMITS
  - HYPERV_CPUID_NESTED_FEATURES
+ - HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS
+ - HYPERV_CPUID_SYNDBG_INTERFACE
+ - HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES
 
 HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was
 enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).
 
-Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure
+Userspace invokes KVM_GET_SUPPORTED_HV_CPUID by passing a kvm_cpuid2 structure
 with the 'nent' field indicating the number of entries in the variable-size
 array 'entries'.  If the number of entries is too low to describe all Hyper-V
 feature leaves, an error (E2BIG) is returned. If the number is more or equal
@@ -4704,6 +4707,99 @@ KVM_PV_VM_VERIFY
   Verify the integrity of the unpacked image. Only if this succeeds,
   KVM is allowed to start protected VCPUs.
 
+4.126 KVM_X86_SET_MSR_FILTER
+----------------------------
+
+:Capability: KVM_X86_SET_MSR_FILTER
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_msr_filter
+:Returns: 0 on success, < 0 on error
+
+::
+
+  struct kvm_msr_filter_range {
+  #define KVM_MSR_FILTER_READ  (1 << 0)
+  #define KVM_MSR_FILTER_WRITE (1 << 1)
+       __u32 flags;
+       __u32 nmsrs; /* number of msrs in bitmap */
+       __u32 base;  /* MSR index the bitmap starts at */
+       __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
+  };
+
+  #define KVM_MSR_FILTER_MAX_RANGES 16
+  struct kvm_msr_filter {
+  #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+  #define KVM_MSR_FILTER_DEFAULT_DENY  (1 << 0)
+       __u32 flags;
+       struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
+  };
+
+flags values for struct kvm_msr_filter_range:
+
+KVM_MSR_FILTER_READ
+
+  Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap
+  indicates that a read should immediately fail, while a 1 indicates that
+  a read for a particular MSR should be handled regardless of the default
+  filter action.
+
+KVM_MSR_FILTER_WRITE
+
+  Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap
+  indicates that a write should immediately fail, while a 1 indicates that
+  a write for a particular MSR should be handled regardless of the default
+  filter action.
+
+KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE
+
+  Filter both read and write accesses to MSRs using the given bitmap. A 0
+  in the bitmap indicates that both reads and writes should immediately fail,
+  while a 1 indicates that reads and writes for a particular MSR are not
+  filtered by this range.
+
+flags values for struct kvm_msr_filter:
+
+KVM_MSR_FILTER_DEFAULT_ALLOW
+
+  If no filter range matches an MSR index that is getting accessed, KVM will
+  fall back to allowing access to the MSR.
+
+KVM_MSR_FILTER_DEFAULT_DENY
+
+  If no filter range matches an MSR index that is getting accessed, KVM will
+  fall back to rejecting access to the MSR. In this mode, all MSRs that should
+  be processed by KVM need to explicitly be marked as allowed in the bitmaps.
+
+This ioctl allows user space to define up to 16 bitmaps of MSR ranges to
+specify whether a certain MSR access should be allowed or denied.
+
+If this ioctl has never been invoked, MSR accesses are not guarded and the
+old KVM in-kernel emulation behavior is fully preserved.
+
+As soon as the filtering is in place, every MSR access is processed through
+the filter. If the accessed MSR index falls within one of the defined ranges,
+read and write accesses are guarded by the bitmap's value for that index. If
+the index is not covered by any range, whether the access is rejected is
+determined by the flags field in the kvm_msr_filter struct:
+KVM_MSR_FILTER_DEFAULT_ALLOW or KVM_MSR_FILTER_DEFAULT_DENY.
+
+Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR
+filtering. In that mode, KVM_MSR_FILTER_DEFAULT_DENY no longer has any effect.
+
+Each bitmap range specifies a range of MSRs to potentially allow access on.
+The range covers the MSR indexes [base .. base+nmsrs). The flags field
+indicates whether reads, writes or both reads and writes are filtered; a 1
+bit in the bitmap allows the flagged operations for the corresponding MSR
+index, a 0 bit denies them.
+
+If an MSR access is not permitted through the filtering, it generates a
+#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, these
+accesses are instead deflected to user space, which can then potentially
+handle them.
+
+If a vCPU is in running state while this ioctl is invoked, the vCPU may
+experience inconsistent filtering behavior on MSR accesses.
+
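As an illustration, a minimal user-space sketch of this ioctl (not part of the
diffs in this merge, and assuming a VM file descriptor from KVM_CREATE_VM plus
a <linux/kvm.h> that carries the definitions shown above) might deny every MSR
except reads of one example index::

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  static int deny_all_but_one_read(int vm_fd)
  {
          /* One byte covers 8 MSR indexes; bit 0 corresponds to "base". */
          __u8 allow = 1 << 0;
          struct kvm_msr_filter filter;

          memset(&filter, 0, sizeof(filter));
          filter.flags = KVM_MSR_FILTER_DEFAULT_DENY;

          filter.ranges[0].flags  = KVM_MSR_FILTER_READ;
          filter.ranges[0].base   = 0x10;  /* example MSR index (IA32_TSC) */
          filter.ranges[0].nmsrs  = 8;     /* MSRs covered by the bitmap */
          filter.ranges[0].bitmap = &allow;

          /* A real VMM would also allow-list every MSR KVM must handle. */
          return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
  }

Ranges left zeroed (nmsrs == 0) carry no MSRs, so only ranges[0] takes effect
in this sketch.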
 
 5. The kvm_run structure
 ========================
@@ -4869,14 +4965,13 @@ to the byte array.
 
 .. note::
 
-      For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and
-      KVM_EXIT_EPR the corresponding
-
-operations are complete (and guest state is consistent) only after userspace
-has re-entered the kernel with KVM_RUN.  The kernel side will first finish
-incomplete operations and then check for pending signals.  Userspace
-can re-enter the guest with an unmasked signal pending to complete
-pending operations.
+      For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR,
+      KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
+      operations are complete (and guest state is consistent) only after userspace
+      has re-entered the kernel with KVM_RUN.  The kernel side will first finish
+      incomplete operations and then check for pending signals.  Userspace
+      can re-enter the guest with an unmasked signal pending to complete
+      pending operations.
 
 ::
 
@@ -5163,6 +5258,44 @@ Note that KVM does not skip the faulting instruction as it does for
 KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state
 if it decides to decode and emulate the instruction.
 
+::
+
+               /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */
+               struct {
+                       __u8 error; /* user -> kernel */
+                       __u8 pad[7];
+                       __u32 reason; /* kernel -> user */
+                       __u32 index; /* kernel -> user */
+                       __u64 data; /* kernel <-> user */
+               } msr;
+
+Used on x86 systems. When the VM capability KVM_CAP_X86_USER_SPACE_MSR is
+enabled, MSR accesses that would otherwise cause KVM's in-kernel code to
+inject a #GP will instead trigger a KVM_EXIT_X86_RDMSR exit for reads and a
+KVM_EXIT_X86_WRMSR exit for writes.
+
+The "reason" field specifies why the MSR trap occurred. User space will only
+receive MSR exit traps for reasons that were requested when enabling the
+capability via KVM_ENABLE_CAP. Currently valid exit reasons are:
+
+       KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM
+       KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits
+       KVM_MSR_EXIT_REASON_FILTER - access blocked by KVM_X86_SET_MSR_FILTER
+
+For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest
+wants to read. To respond to this request with a successful read, user space
+writes the respective data into the "data" field and must continue guest
+execution to ensure the read data is transferred into guest register state.
+
+If the RDMSR request was unsuccessful, user space indicates that with a "1" in
+the "error" field. This will inject a #GP into the guest when the VCPU is
+executed again.
+
+For KVM_EXIT_X86_WRMSR, the "index" field tells user space which MSR the guest
+wants to write. Once finished processing the event, user space must continue
+vCPU execution. If the MSR write was unsuccessful, user space also sets the
+"error" field to "1".
+
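For illustration only (again not part of the diffs), a user-space run loop
might consume these exits as follows, where "run" points at the vCPU's mmap'ed
kvm_run structure and vmm_rdmsr()/vmm_wrmsr() are hypothetical VMM helpers
that return 0 on success::

  #include <linux/kvm.h>

  /* Hypothetical VMM helpers: return 0 on success, non-zero on failure. */
  int vmm_rdmsr(__u32 index, __u64 *data);
  int vmm_wrmsr(__u32 index, __u64 data);

  /* Returns 1 if the exit was consumed; re-enter the guest with KVM_RUN. */
  static int handle_msr_exit(struct kvm_run *run)
  {
          switch (run->exit_reason) {
          case KVM_EXIT_X86_RDMSR:
                  /* Setting error to 1 injects a #GP on the next KVM_RUN. */
                  run->msr.error = vmm_rdmsr(run->msr.index, &run->msr.data) ? 1 : 0;
                  return 1;
          case KVM_EXIT_X86_WRMSR:
                  run->msr.error = vmm_wrmsr(run->msr.index, run->msr.data) ? 1 : 0;
                  return 1;
          default:
                  return 0;
          }
  }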
 ::
 
                /* Fix the size of the union. */
@@ -5852,6 +5985,28 @@ controlled by the kvm module parameter halt_poll_ns. This capability allows
 the maximum halt time to specified on a per-VM basis, effectively overriding
 the module parameter for the target VM.
 
+7.21 KVM_CAP_X86_USER_SPACE_MSR
+-------------------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] contains the mask of KVM_MSR_EXIT_REASON_* events to report
+:Returns: 0 on success; -1 on error
+
+This capability enables trapping of #GP invoking RDMSR and WRMSR instructions
+into user space.
+
+When a guest requests to read or write an MSR, KVM may not implement all MSRs
+that are relevant to a respective system. It also does not differentiate by
+CPU type.
+
+To allow more fine-grained control over MSR handling, user space may enable
+this capability. With it enabled, MSR accesses that match the mask specified
+in args[0] and that would otherwise cause KVM to inject a #GP into the guest
+will instead trigger KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit
+notifications. User space can then handle these exits to implement
+model-specific MSR handling and/or to notify a user that an MSR was not
+handled (see the sketch below).
+
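A minimal sketch of enabling this capability (not part of the diffs; vm_fd is
assumed to be a VM file descriptor) could request all three exit reasons at
once::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Bounce unknown, invalid and filter-blocked MSR accesses to user space. */
  static int enable_user_space_msr(int vm_fd)
  {
          struct kvm_enable_cap cap = {
                  .cap = KVM_CAP_X86_USER_SPACE_MSR,
                  .args[0] = KVM_MSR_EXIT_REASON_UNKNOWN |
                             KVM_MSR_EXIT_REASON_INVAL |
                             KVM_MSR_EXIT_REASON_FILTER,
          };

          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }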
 8. Other capabilities.
 ======================
 
@@ -6173,3 +6328,48 @@ specific interfaces must be consistent, i.e. if one says the feature
 is supported, than the other should as well and vice versa.  For arm64
 see Documentation/virt/kvm/devices/vcpu.rst "KVM_ARM_VCPU_PVTIME_CTRL".
 For x86 see Documentation/virt/kvm/msr.rst "MSR_KVM_STEAL_TIME".
+
+8.25 KVM_CAP_S390_DIAG318
+-------------------------
+
+:Architectures: s390
+
+This capability enables a guest to set information about its control program
+(i.e. guest kernel type and version). The information is helpful during
+system/firmware service events, providing additional data about the guest
+environments running on the machine.
+
+The information is associated with the DIAGNOSE 0x318 instruction, which sets
+an 8-byte value consisting of a one-byte Control Program Name Code (CPNC) and
+a 7-byte Control Program Version Code (CPVC). The CPNC determines what
+environment the control program is running in (e.g. Linux, z/VM...), and the
+CPVC is used for information specific to OS (e.g. Linux version, Linux
+distribution...)
+
+If this capability is available, then the CPNC and CPVC can be synchronized
+between KVM and userspace via the sync regs mechanism (KVM_SYNC_DIAG318).
+
+8.26 KVM_CAP_X86_USER_SPACE_MSR
+-------------------------------
+
+:Architectures: x86
+
+This capability indicates that KVM supports deflection of MSR reads and
+writes to user space. It can be enabled on a per-VM basis. If enabled, MSR
+accesses that would usually cause KVM to inject a #GP into the guest will
+instead be bounced to user space through the KVM_EXIT_X86_RDMSR and
+KVM_EXIT_X86_WRMSR exit notifications.
+
+8.27 KVM_X86_SET_MSR_FILTER
+---------------------------
+
+:Architectures: x86
+
+This capability indicates that KVM supports rejecting accesses to
+user-defined MSRs. With this capability exposed, KVM exports a new VM ioctl,
+KVM_X86_SET_MSR_FILTER, which user space can call to specify bitmaps of MSR
+ranges that KVM should deny access to.
+
+In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to
+trap and emulate MSRs that are outside of the scope of KVM as well as
+limit the attack surface on KVM's MSR emulation code.
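
As a closing illustration (not part of the diffs), a VMM would typically probe
for the user-space MSR capability before wiring any of the above up;
availability of the filter ioctl is advertised by its own capability as
described in the section above::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* kvm_fd is the file descriptor for /dev/kvm. */
  static int user_space_msr_supported(int kvm_fd)
  {
          return ioctl(kvm_fd, KVM_CHECK_EXTENSION,
                       KVM_CAP_X86_USER_SPACE_MSR) > 0;
  }
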
index 96eccb1..5ef2669 100644 (file)
@@ -298,15 +298,15 @@ static __always_inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu)
        return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT;
 }
 
-static __always_inline bool kvm_vcpu_dabt_iss1tw(const struct kvm_vcpu *vcpu)
+static __always_inline bool kvm_vcpu_abt_iss1tw(const struct kvm_vcpu *vcpu)
 {
        return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_S1PTW);
 }
 
+/* Always check for S1PTW *before* using this. */
 static __always_inline bool kvm_vcpu_dabt_iswrite(const struct kvm_vcpu *vcpu)
 {
-       return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR) ||
-               kvm_vcpu_dabt_iss1tw(vcpu); /* AF/DBM update */
+       return kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR;
 }
 
 static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu)
@@ -335,6 +335,11 @@ static inline bool kvm_vcpu_trap_is_iabt(const struct kvm_vcpu *vcpu)
        return kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_IABT_LOW;
 }
 
+static inline bool kvm_vcpu_trap_is_exec_fault(const struct kvm_vcpu *vcpu)
+{
+       return kvm_vcpu_trap_is_iabt(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu);
+}
+
 static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu)
 {
        return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC;
@@ -372,6 +377,9 @@ static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
 
 static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
 {
+       if (kvm_vcpu_abt_iss1tw(vcpu))
+               return true;
+
        if (kvm_vcpu_trap_is_iabt(vcpu))
                return false;
 
index eeac62b..313a8fa 100644 (file)
@@ -446,7 +446,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
                        kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT &&
                        kvm_vcpu_dabt_isvalid(vcpu) &&
                        !kvm_vcpu_abt_issea(vcpu) &&
-                       !kvm_vcpu_dabt_iss1tw(vcpu);
+                       !kvm_vcpu_abt_iss1tw(vcpu);
 
                if (valid) {
                        int ret = __vgic_v2_perform_cpuif_access(vcpu);
index a816cb8..19aacc7 100644 (file)
@@ -759,7 +759,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        struct kvm_pgtable *pgt;
 
        write_fault = kvm_is_write_fault(vcpu);
-       exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
+       exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
        VM_BUG_ON(write_fault && exec_fault);
 
        if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
@@ -984,7 +984,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
                        goto out;
                }
 
-               if (kvm_vcpu_dabt_iss1tw(vcpu)) {
+               if (kvm_vcpu_abt_iss1tw(vcpu)) {
                        kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
                        ret = 1;
                        goto out_unlock;
index 825d337..24f3d0f 100644 (file)
@@ -341,7 +341,7 @@ struct kvm_mips_tlb {
 #define KVM_MIPS_GUEST_TLB_SIZE        64
 struct kvm_vcpu_arch {
        void *guest_ebase;
-       int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+       int (*vcpu_run)(struct kvm_vcpu *vcpu);
 
        /* Host registers preserved across guest mode execution */
        unsigned long host_stack;
@@ -852,7 +852,7 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
 /* Debug: dump vcpu state */
 int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
 
-extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern int kvm_mips_handle_exit(struct kvm_vcpu *vcpu);
 
 /* Building of entry/exception code */
 int kvm_mips_entry_setup(void);
index fd71694..832475b 100644 (file)
@@ -205,7 +205,7 @@ static inline void build_set_exc_base(u32 **p, unsigned int reg)
  * Assemble the start of the vcpu_run function to run a guest VCPU. The function
  * conforms to the following prototype:
  *
- * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ * int vcpu_run(struct kvm_vcpu *vcpu);
  *
  * The exit from the guest and return to the caller is handled by the code
  * generated by kvm_mips_build_ret_to_host().
@@ -218,8 +218,7 @@ void *kvm_mips_build_vcpu_run(void *addr)
        unsigned int i;
 
        /*
-        * A0: run
-        * A1: vcpu
+        * A0: vcpu
         */
 
        /* k0/k1 not being used in host kernel context */
@@ -238,10 +237,10 @@ void *kvm_mips_build_vcpu_run(void *addr)
        kvm_mips_build_save_scratch(&p, V1, K1);
 
        /* VCPU scratch register has pointer to vcpu */
-       UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+       UASM_i_MTC0(&p, A0, scratch_vcpu[0], scratch_vcpu[1]);
 
        /* Offset into vcpu->arch */
-       UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch));
+       UASM_i_ADDIU(&p, K1, A0, offsetof(struct kvm_vcpu, arch));
 
        /*
         * Save the host stack to VCPU, used for exception processing
@@ -645,10 +644,7 @@ void *kvm_mips_build_exit(void *addr)
        /* Now that context has been saved, we can use other registers */
 
        /* Restore vcpu */
-       UASM_i_MFC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
-
-       /* Restore run (vcpu->run) */
-       UASM_i_LW(&p, S0, offsetof(struct kvm_vcpu, run), S1);
+       UASM_i_MFC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]);
 
        /*
         * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process
@@ -810,7 +806,6 @@ void *kvm_mips_build_exit(void *addr)
         * with this in the kernel
         */
        uasm_i_move(&p, A0, S0);
-       uasm_i_move(&p, A1, S1);
        UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit);
        uasm_i_jalr(&p, RA, T9);
         UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ);
@@ -852,7 +847,7 @@ static void *kvm_mips_build_ret_from_exit(void *addr)
         * guest, reload k1
         */
 
-       uasm_i_move(&p, K1, S1);
+       uasm_i_move(&p, K1, S0);
        UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
 
        /*
@@ -886,8 +881,8 @@ static void *kvm_mips_build_ret_to_guest(void *addr)
 {
        u32 *p = addr;
 
-       /* Put the saved pointer to vcpu (s1) back into the scratch register */
-       UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
+       /* Put the saved pointer to vcpu (s0) back into the scratch register */
+       UASM_i_MTC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]);
 
        /* Load up the Guest EBASE to minimize the window where BEV is set */
        UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
index 7de85d2..3d6a7f5 100644 (file)
@@ -137,6 +137,8 @@ extern void kvm_init_loongson_ipi(struct kvm *kvm);
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
        switch (type) {
+       case KVM_VM_MIPS_AUTO:
+               break;
 #ifdef CONFIG_KVM_MIPS_VZ
        case KVM_VM_MIPS_VZ:
 #else
@@ -1197,8 +1199,9 @@ static void kvm_mips_set_c0_status(void)
 /*
  * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV)
  */
-int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
+int kvm_mips_handle_exit(struct kvm_vcpu *vcpu)
 {
+       struct kvm_run *run = vcpu->run;
        u32 cause = vcpu->arch.host_cp0_cause;
        u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
        u32 __user *opc = (u32 __user *) vcpu->arch.pc;
index f8cba51..0788c00 100644 (file)
@@ -1241,7 +1241,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_vcpu *vcpu)
         */
        kvm_mips_suspend_mm(cpu);
 
-       r = vcpu->arch.vcpu_run(vcpu->run, vcpu);
+       r = vcpu->arch.vcpu_run(vcpu);
 
        /* We may have migrated while handling guest exits */
        cpu = smp_processor_id();
index c299e5d..2ffbe92 100644 (file)
@@ -3266,7 +3266,7 @@ static int kvm_vz_vcpu_run(struct kvm_vcpu *vcpu)
        kvm_vz_vcpu_load_tlb(vcpu, cpu);
        kvm_vz_vcpu_load_wired(vcpu);
 
-       r = vcpu->arch.vcpu_run(vcpu->run, vcpu);
+       r = vcpu->arch.vcpu_run(vcpu);
 
        kvm_vz_vcpu_save_wired(vcpu);
 
index 10ded83..d67a470 100644 (file)
@@ -326,6 +326,7 @@ struct kvm_arch {
 #endif
 #ifdef CONFIG_KVM_XICS
        struct kvmppc_xics *xics;
+       struct kvmppc_xics *xics_device;
        struct kvmppc_xive *xive;    /* Current XIVE device in use */
        struct {
                struct kvmppc_xive *native;
index 49db50d..44bf567 100644 (file)
@@ -558,12 +558,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       return -ENOTSUPP;
+       return -EOPNOTSUPP;
 }
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       return -ENOTSUPP;
+       return -EOPNOTSUPP;
 }
 
 int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
@@ -879,13 +879,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 
 #ifdef CONFIG_KVM_XICS
        /*
-        * Free the XIVE devices which are not directly freed by the
+        * Free the XIVE and XICS devices which are not directly freed by the
         * device 'release' method
         */
        kfree(kvm->arch.xive_devices.native);
        kvm->arch.xive_devices.native = NULL;
        kfree(kvm->arch.xive_devices.xics_on_xive);
        kvm->arch.xive_devices.xics_on_xive = NULL;
+       kfree(kvm->arch.xics_device);
+       kvm->arch.xics_device = NULL;
 #endif /* CONFIG_KVM_XICS */
 }
 
index 22a677b..bb35490 100644 (file)
@@ -347,7 +347,7 @@ static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
        return __radix_pte_update(ptep, clr, set);
 }
 
-void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
+static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
                             pte_t *ptep, pte_t pte)
 {
        radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
index 1a529df..8da93fd 100644 (file)
@@ -283,7 +283,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
        struct kvmppc_spapr_tce_table *siter;
        struct mm_struct *mm = kvm->mm;
        unsigned long npages, size = args->size;
-       int ret = -ENOMEM;
+       int ret;
 
        if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
                (args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
@@ -489,7 +489,7 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm,
        return ret;
 }
 
-long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
+static long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
                unsigned long entry, unsigned long ua,
                enum dma_data_direction dir)
 {
index ac6ac19..470e7c5 100644 (file)
@@ -237,7 +237,7 @@ static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm,
        return ret;
 }
 
-extern void iommu_tce_kill_rm(struct iommu_table *tbl,
+static void iommu_tce_kill_rm(struct iommu_table *tbl,
                unsigned long entry, unsigned long pages)
 {
        if (tbl->it_ops->tce_kill)
index 4ba06a2..490a0f6 100644 (file)
@@ -3442,9 +3442,19 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
        unsigned long host_psscr = mfspr(SPRN_PSSCR);
        unsigned long host_pidr = mfspr(SPRN_PID);
 
+       /*
+        * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0,
+        * so set HDICE before writing HDEC.
+        */
+       mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr | LPCR_HDICE);
+       isync();
+
        hdec = time_limit - mftb();
-       if (hdec < 0)
+       if (hdec < 0) {
+               mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
+               isync();
                return BOOK3S_INTERRUPT_HV_DECREMENTER;
+       }
        mtspr(SPRN_HDEC, hdec);
 
        if (vc->tb_offset) {
@@ -3558,7 +3568,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
  * Virtual-mode guest entry for POWER9 and later when the host and
  * guest are both using the radix MMU.  The LPIDR has already been set.
  */
-int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
+static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
                         unsigned long lpcr)
 {
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
@@ -3572,7 +3582,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 
        dec = mfspr(SPRN_DEC);
        tb = mftb();
-       if (dec < 512)
+       if (dec < 0)
                return BOOK3S_INTERRUPT_HV_DECREMENTER;
        local_paca->kvm_hstate.dec_expires = dec + tb;
        if (local_paca->kvm_hstate.dec_expires < time_limit)
@@ -5250,6 +5260,12 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
        case KVM_PPC_ALLOCATE_HTAB: {
                u32 htab_order;
 
+               /* If we're a nested hypervisor, we currently only support radix */
+               if (kvmhv_on_pseries()) {
+                       r = -EOPNOTSUPP;
+                       break;
+               }
+
                r = -EFAULT;
                if (get_user(htab_order, (u32 __user *)argp))
                        break;
index 59822cb..327417d 100644 (file)
@@ -58,13 +58,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        /*
         * Put whatever is in the decrementer into the
         * hypervisor decrementer.
+        * Because of a hardware deviation in P8 and P9,
+        * we need to set LPCR[HDICE] before writing HDEC.
         */
-BEGIN_FTR_SECTION
        ld      r5, HSTATE_KVM_VCORE(r13)
        ld      r6, VCORE_KVM(r5)
        ld      r9, KVM_HOST_LPCR(r6)
-       andis.  r9, r9, LPCR_LD@h
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+       ori     r8, r9, LPCR_HDICE
+       mtspr   SPRN_LPCR, r8
+       isync
+       andis.  r0, r9, LPCR_LD@h
        mfspr   r8,SPRN_DEC
        mftb    r7
 BEGIN_FTR_SECTION
index 6822d23..33b5854 100644 (file)
@@ -569,7 +569,7 @@ static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
        kvmhv_set_nested_ptbl(gp);
 }
 
-struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
+static struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
 {
        struct kvm_nested_guest *gp;
        long shadow_lpid;
index 4d7e561..c2c9c73 100644 (file)
@@ -764,7 +764,7 @@ int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
        return ics_rm_eoi(vcpu, irq);
 }
 
-unsigned long eoi_rc;
+static unsigned long eoi_rc;
 
 static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
 {
index 88fac22..b1fefa6 100644 (file)
@@ -569,7 +569,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
 #endif
 }
 
-void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
+static void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
 {
        u32 host_pvr;
 
index 381bf8d..5fee5a1 100644 (file)
@@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
        return -ENXIO;
 }
 
-static void kvmppc_xics_free(struct kvm_device *dev)
+/*
+ * Called when device fd is closed. kvm->lock is held.
+ */
+static void kvmppc_xics_release(struct kvm_device *dev)
 {
        struct kvmppc_xics *xics = dev->private;
        int i;
        struct kvm *kvm = xics->kvm;
+       struct kvm_vcpu *vcpu;
+
+       pr_devel("Releasing xics device\n");
+
+       /*
+        * Since this is the device release function, we know that
+        * userspace does not have any open fd referring to the
+        * device.  Therefore there can not be any of the device
+        * attribute set/get functions being executed concurrently,
+        * and similarly, the connect_vcpu and set/clr_mapped
+        * functions also cannot be being executed.
+        */
 
        debugfs_remove(xics->dentry);
 
+       /*
+        * We should clean up the vCPU interrupt presenters first.
+        */
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               /*
+                * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+                * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
+                * Holding the vcpu->mutex also means that execution is
+                * excluded for the vcpu until the ICP was freed. When the vcpu
+                * can execute again, vcpu->arch.icp and vcpu->arch.irq_type
+                * have been cleared and the vcpu will not be going into the
+                * XICS code anymore.
+                */
+               mutex_lock(&vcpu->mutex);
+               kvmppc_xics_free_icp(vcpu);
+               mutex_unlock(&vcpu->mutex);
+       }
+
        if (kvm)
                kvm->arch.xics = NULL;
 
-       for (i = 0; i <= xics->max_icsid; i++)
+       for (i = 0; i <= xics->max_icsid; i++) {
                kfree(xics->ics[i]);
-       kfree(xics);
+               xics->ics[i] = NULL;
+       }
+       /*
+        * A reference of the kvmppc_xics pointer is now kept under
+        * the xics_device pointer of the machine for reuse. It is
+        * freed when the VM is destroyed for now until we fix all the
+        * execution paths.
+        */
        kfree(dev);
 }
 
+static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
+{
+       struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
+       struct kvmppc_xics *xics = *kvm_xics_device;
+
+       if (!xics) {
+               xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+               *kvm_xics_device = xics;
+       } else {
+               memset(xics, 0, sizeof(*xics));
+       }
+
+       return xics;
+}
+
 static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 {
        struct kvmppc_xics *xics;
        struct kvm *kvm = dev->kvm;
-       int ret = 0;
 
-       xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+       pr_devel("Creating xics for partition\n");
+
+       /* Already there ? */
+       if (kvm->arch.xics)
+               return -EEXIST;
+
+       xics = kvmppc_xics_get_device(kvm);
        if (!xics)
                return -ENOMEM;
 
        dev->private = xics;
        xics->dev = dev;
        xics->kvm = kvm;
-
-       /* Already there ? */
-       if (kvm->arch.xics)
-               ret = -EEXIST;
-       else
-               kvm->arch.xics = xics;
-
-       if (ret) {
-               kfree(xics);
-               return ret;
-       }
+       kvm->arch.xics = xics;
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        if (cpu_has_feature(CPU_FTR_ARCH_206) &&
@@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = {
        .name = "kvm-xics",
        .create = kvmppc_xics_create,
        .init = kvmppc_xics_init,
-       .destroy = kvmppc_xics_free,
+       .release = kvmppc_xics_release,
        .set_attr = xics_set_attr,
        .get_attr = xics_get_attr,
        .has_attr = xics_has_attr,
@@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
                return -EPERM;
        if (xics->kvm != vcpu->kvm)
                return -EPERM;
-       if (vcpu->arch.irq_type)
+       if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
                return -EBUSY;
 
        r = kvmppc_xics_create_icp(vcpu, xcpu);
index bdea91d..d0c2db0 100644 (file)
@@ -1227,17 +1227,7 @@ static int xive_native_debug_show(struct seq_file *m, void *private)
        return 0;
 }
 
-static int xive_native_debug_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, xive_native_debug_show, inode->i_private);
-}
-
-static const struct file_operations xive_native_debug_fops = {
-       .open = xive_native_debug_open,
-       .read = seq_read,
-       .llseek = seq_lseek,
-       .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(xive_native_debug);
 
 static void xive_native_debugfs_init(struct kvmppc_xive *xive)
 {
index 3e1c9f0..b1abcb8 100644 (file)
@@ -1747,12 +1747,12 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       return -ENOTSUPP;
+       return -EOPNOTSUPP;
 }
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       return -ENOTSUPP;
+       return -EOPNOTSUPP;
 }
 
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -1773,7 +1773,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
-       return -ENOTSUPP;
+       return -EOPNOTSUPP;
 }
 
 void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
index dde7cb3..877970d 100644 (file)
 #define STATIC
 #include <linux/decompress/mm.h>
 
+#define _SETUP
+#include <asm/setup.h> /* For COMMAND_LINE_SIZE */
+#undef _SETUP
+
 #ifdef CONFIG_X86_5LEVEL
 unsigned int __pgtable_l5_enabled;
 unsigned int pgdir_shift __ro_after_init = 39;
@@ -87,8 +91,11 @@ static unsigned long get_boot_seed(void)
 static bool memmap_too_large;
 
 
-/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
-static unsigned long long mem_limit = ULLONG_MAX;
+/*
+ * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit.
+ * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options.
+ */
+static u64 mem_limit;
 
 /* Number of immovable memory regions */
 static int num_immovable_mem;
@@ -131,8 +138,7 @@ enum parse_mode {
 };
 
 static int
-parse_memmap(char *p, unsigned long long *start, unsigned long long *size,
-               enum parse_mode mode)
+parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode)
 {
        char *oldp;
 
@@ -162,7 +168,7 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size,
                         */
                        *size = 0;
                } else {
-                       unsigned long long flags;
+                       u64 flags;
 
                        /*
                         * efi_fake_mem=nn@ss:attr the attr specifies
@@ -201,7 +207,7 @@ static void mem_avoid_memmap(enum parse_mode mode, char *str)
 
        while (str && (i < MAX_MEMMAP_REGIONS)) {
                int rc;
-               unsigned long long start, size;
+               u64 start, size;
                char *k = strchr(str, ',');
 
                if (k)
@@ -214,7 +220,7 @@ static void mem_avoid_memmap(enum parse_mode mode, char *str)
 
                if (start == 0) {
                        /* Store the specified memory limit if size > 0 */
-                       if (size > 0)
+                       if (size > 0 && size < mem_limit)
                                mem_limit = size;
 
                        continue;
@@ -261,15 +267,15 @@ static void parse_gb_huge_pages(char *param, char *val)
 static void handle_mem_options(void)
 {
        char *args = (char *)get_cmd_line_ptr();
-       size_t len = strlen((char *)args);
+       size_t len;
        char *tmp_cmdline;
        char *param, *val;
        u64 mem_size;
 
-       if (!strstr(args, "memmap=") && !strstr(args, "mem=") &&
-               !strstr(args, "hugepages"))
+       if (!args)
                return;
 
+       len = strnlen(args, COMMAND_LINE_SIZE-1);
        tmp_cmdline = malloc(len + 1);
        if (!tmp_cmdline)
                error("Failed to allocate space for tmp_cmdline");
@@ -284,14 +290,12 @@ static void handle_mem_options(void)
        while (*args) {
                args = next_arg(args, &param, &val);
                /* Stop at -- */
-               if (!val && strcmp(param, "--") == 0) {
-                       warn("Only '--' specified in cmdline");
-                       goto out;
-               }
+               if (!val && strcmp(param, "--") == 0)
+                       break;
 
                if (!strcmp(param, "memmap")) {
                        mem_avoid_memmap(PARSE_MEMMAP, val);
-               } else if (strstr(param, "hugepages")) {
+               } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) {
                        parse_gb_huge_pages(param, val);
                } else if (!strcmp(param, "mem")) {
                        char *p = val;
@@ -300,21 +304,23 @@ static void handle_mem_options(void)
                                continue;
                        mem_size = memparse(p, &p);
                        if (mem_size == 0)
-                               goto out;
+                               break;
 
-                       mem_limit = mem_size;
+                       if (mem_size < mem_limit)
+                               mem_limit = mem_size;
                } else if (!strcmp(param, "efi_fake_mem")) {
                        mem_avoid_memmap(PARSE_EFI, val);
                }
        }
 
-out:
        free(tmp_cmdline);
        return;
 }
 
 /*
- * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
+ * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM)
+ * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit.
+ *
  * The mem_avoid array is used to store the ranges that need to be avoided
  * when KASLR searches for an appropriate random address. We must avoid any
  * regions that are unsafe to overlap with during decompression, and other
@@ -392,8 +398,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
 {
        unsigned long init_size = boot_params->hdr.init_size;
        u64 initrd_start, initrd_size;
-       u64 cmd_line, cmd_line_size;
-       char *ptr;
+       unsigned long cmd_line, cmd_line_size;
 
        /*
         * Avoid the region that is unsafe to overlap during
@@ -414,16 +419,15 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
        /* No need to set mapping for initrd, it will be handled in VO. */
 
        /* Avoid kernel command line. */
-       cmd_line  = (u64)boot_params->ext_cmd_line_ptr << 32;
-       cmd_line |= boot_params->hdr.cmd_line_ptr;
+       cmd_line = get_cmd_line_ptr();
        /* Calculate size of cmd_line. */
-       ptr = (char *)(unsigned long)cmd_line;
-       for (cmd_line_size = 0; ptr[cmd_line_size++];)
-               ;
-       mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
-       mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
-       add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
-                        mem_avoid[MEM_AVOID_CMDLINE].size);
+       if (cmd_line) {
+               cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1;
+               mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
+               mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
+               add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
+                                mem_avoid[MEM_AVOID_CMDLINE].size);
+       }
 
        /* Avoid boot parameters. */
        mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
@@ -454,7 +458,7 @@ static bool mem_avoid_overlap(struct mem_vector *img,
 {
        int i;
        struct setup_data *ptr;
-       unsigned long earliest = img->start + img->size;
+       u64 earliest = img->start + img->size;
        bool is_overlapping = false;
 
        for (i = 0; i < MEM_AVOID_MAX; i++) {
@@ -499,18 +503,16 @@ static bool mem_avoid_overlap(struct mem_vector *img,
 }
 
 struct slot_area {
-       unsigned long addr;
-       int num;
+       u64 addr;
+       unsigned long num;
 };
 
 #define MAX_SLOT_AREA 100
 
 static struct slot_area slot_areas[MAX_SLOT_AREA];
-
+static unsigned int slot_area_index;
 static unsigned long slot_max;
 
-static unsigned long slot_area_index;
-
 static void store_slot_info(struct mem_vector *region, unsigned long image_size)
 {
        struct slot_area slot_area;
@@ -519,13 +521,10 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size)
                return;
 
        slot_area.addr = region->start;
-       slot_area.num = (region->size - image_size) /
-                       CONFIG_PHYSICAL_ALIGN + 1;
+       slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN;
 
-       if (slot_area.num > 0) {
-               slot_areas[slot_area_index++] = slot_area;
-               slot_max += slot_area.num;
-       }
+       slot_areas[slot_area_index++] = slot_area;
+       slot_max += slot_area.num;
 }
 
 /*
@@ -535,57 +534,53 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size)
 static void
 process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
 {
-       unsigned long addr, size = 0;
+       u64 pud_start, pud_end;
+       unsigned long gb_huge_pages;
        struct mem_vector tmp;
-       int i = 0;
 
-       if (!max_gb_huge_pages) {
+       if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) {
                store_slot_info(region, image_size);
                return;
        }
 
-       addr = ALIGN(region->start, PUD_SIZE);
-       /* Did we raise the address above the passed in memory entry? */
-       if (addr < region->start + region->size)
-               size = region->size - (addr - region->start);
-
-       /* Check how many 1GB huge pages can be filtered out: */
-       while (size > PUD_SIZE && max_gb_huge_pages) {
-               size -= PUD_SIZE;
-               max_gb_huge_pages--;
-               i++;
-       }
+       /* Are there any 1GB pages in the region? */
+       pud_start = ALIGN(region->start, PUD_SIZE);
+       pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE);
 
        /* No good 1GB huge pages found: */
-       if (!i) {
+       if (pud_start >= pud_end) {
                store_slot_info(region, image_size);
                return;
        }
 
-       /*
-        * Skip those 'i'*1GB good huge pages, and continue checking and
-        * processing the remaining head or tail part of the passed region
-        * if available.
-        */
-
-       if (addr >= region->start + image_size) {
+       /* Check if the head part of the region is usable. */
+       if (pud_start >= region->start + image_size) {
                tmp.start = region->start;
-               tmp.size = addr - region->start;
+               tmp.size = pud_start - region->start;
                store_slot_info(&tmp, image_size);
        }
 
-       size  = region->size - (addr - region->start) - i * PUD_SIZE;
-       if (size >= image_size) {
-               tmp.start = addr + i * PUD_SIZE;
-               tmp.size = size;
+       /* Skip the good 1GB pages. */
+       gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT;
+       if (gb_huge_pages > max_gb_huge_pages) {
+               pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT);
+               max_gb_huge_pages = 0;
+       } else {
+               max_gb_huge_pages -= gb_huge_pages;
+       }
+
+       /* Check if the tail part of the region is usable. */
+       if (region->start + region->size >= pud_end + image_size) {
+               tmp.start = pud_end;
+               tmp.size = region->start + region->size - pud_end;
                store_slot_info(&tmp, image_size);
        }
 }
 
-static unsigned long slots_fetch_random(void)
+static u64 slots_fetch_random(void)
 {
        unsigned long slot;
-       int i;
+       unsigned int i;
 
        /* Handle case of no slots stored. */
        if (slot_max == 0)
@@ -598,7 +593,7 @@ static unsigned long slots_fetch_random(void)
                        slot -= slot_areas[i].num;
                        continue;
                }
-               return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN;
+               return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN);
        }
 
        if (i == slot_area_index)
@@ -611,49 +606,23 @@ static void __process_mem_region(struct mem_vector *entry,
                                 unsigned long image_size)
 {
        struct mem_vector region, overlap;
-       unsigned long start_orig, end;
-       struct mem_vector cur_entry;
-
-       /* On 32-bit, ignore entries entirely above our maximum. */
-       if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
-               return;
+       u64 region_end;
 
-       /* Ignore entries entirely below our minimum. */
-       if (entry->start + entry->size < minimum)
-               return;
-
-       /* Ignore entries above memory limit */
-       end = min(entry->size + entry->start, mem_limit);
-       if (entry->start >= end)
-               return;
-       cur_entry.start = entry->start;
-       cur_entry.size = end - entry->start;
-
-       region.start = cur_entry.start;
-       region.size = cur_entry.size;
+       /* Enforce minimum and memory limit. */
+       region.start = max_t(u64, entry->start, minimum);
+       region_end = min(entry->start + entry->size, mem_limit);
 
        /* Give up if slot area array is full. */
        while (slot_area_index < MAX_SLOT_AREA) {
-               start_orig = region.start;
-
-               /* Potentially raise address to minimum location. */
-               if (region.start < minimum)
-                       region.start = minimum;
-
                /* Potentially raise address to meet alignment needs. */
                region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
 
                /* Did we raise the address above the passed in memory entry? */
-               if (region.start > cur_entry.start + cur_entry.size)
+               if (region.start > region_end)
                        return;
 
                /* Reduce size by any delta from the original address. */
-               region.size -= region.start - start_orig;
-
-               /* On 32-bit, reduce region size to fit within max size. */
-               if (IS_ENABLED(CONFIG_X86_32) &&
-                   region.start + region.size > KERNEL_IMAGE_SIZE)
-                       region.size = KERNEL_IMAGE_SIZE - region.start;
+               region.size = region_end - region.start;
 
                /* Return if region can't contain decompressed kernel */
                if (region.size < image_size)
@@ -666,27 +635,19 @@ static void __process_mem_region(struct mem_vector *entry,
                }
 
                /* Store beginning of region if holds at least image_size. */
-               if (overlap.start > region.start + image_size) {
-                       struct mem_vector beginning;
-
-                       beginning.start = region.start;
-                       beginning.size = overlap.start - region.start;
-                       process_gb_huge_pages(&beginning, image_size);
+               if (overlap.start >= region.start + image_size) {
+                       region.size = overlap.start - region.start;
+                       process_gb_huge_pages(&region, image_size);
                }
 
-               /* Return if overlap extends to or past end of region. */
-               if (overlap.start + overlap.size >= region.start + region.size)
-                       return;
-
                /* Clip off the overlapping region and start over. */
-               region.size -= overlap.start - region.start + overlap.size;
                region.start = overlap.start + overlap.size;
        }
 }
 
 static bool process_mem_region(struct mem_vector *region,
-                              unsigned long long minimum,
-                              unsigned long long image_size)
+                              unsigned long minimum,
+                              unsigned long image_size)
 {
        int i;
        /*
@@ -709,7 +670,7 @@ static bool process_mem_region(struct mem_vector *region,
         * immovable memory and @region.
         */
        for (i = 0; i < num_immovable_mem; i++) {
-               unsigned long long start, end, entry_end, region_end;
+               u64 start, end, entry_end, region_end;
                struct mem_vector entry;
 
                if (!mem_overlaps(region, &immovable_mem[i]))
@@ -736,8 +697,8 @@ static bool process_mem_region(struct mem_vector *region,
 
 #ifdef CONFIG_EFI
 /*
- * Returns true if mirror region found (and must have been processed
- * for slots adding)
+ * Returns true if we processed the EFI memmap, which we prefer over the E820
+ * table if it is available.
  */
 static bool
 process_efi_entries(unsigned long minimum, unsigned long image_size)
@@ -839,20 +800,30 @@ static void process_e820_entries(unsigned long minimum,
 static unsigned long find_random_phys_addr(unsigned long minimum,
                                           unsigned long image_size)
 {
+       u64 phys_addr;
+
+       /* Bail out early if it's impossible to succeed. */
+       if (minimum + image_size > mem_limit)
+               return 0;
+
        /* Check if we had too many memmaps. */
        if (memmap_too_large) {
                debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
                return 0;
        }
 
-       /* Make sure minimum is aligned. */
-       minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+       if (!process_efi_entries(minimum, image_size))
+               process_e820_entries(minimum, image_size);
 
-       if (process_efi_entries(minimum, image_size))
-               return slots_fetch_random();
+       phys_addr = slots_fetch_random();
 
-       process_e820_entries(minimum, image_size);
-       return slots_fetch_random();
+       /* Perform a final check to make sure the address is in range. */
+       if (phys_addr < minimum || phys_addr + image_size > mem_limit) {
+               warn("Invalid physical address chosen!\n");
+               return 0;
+       }
+
+       return (unsigned long)phys_addr;
 }
 
 static unsigned long find_random_virt_addr(unsigned long minimum,
@@ -860,18 +831,12 @@ static unsigned long find_random_virt_addr(unsigned long minimum,
 {
        unsigned long slots, random_addr;
 
-       /* Make sure minimum is aligned. */
-       minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
-       /* Align image_size for easy slot calculations. */
-       image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN);
-
        /*
         * There are how many CONFIG_PHYSICAL_ALIGN-sized slots
         * that can hold image_size within the range of minimum to
         * KERNEL_IMAGE_SIZE?
         */
-       slots = (KERNEL_IMAGE_SIZE - minimum - image_size) /
-                CONFIG_PHYSICAL_ALIGN + 1;
+       slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN;
 
        random_addr = kaslr_get_random_long("Virtual") % slots;
 
@@ -908,6 +873,11 @@ void choose_random_location(unsigned long input,
        /* Prepare to add new identity pagetables on demand. */
        initialize_identity_maps();
 
+       if (IS_ENABLED(CONFIG_X86_32))
+               mem_limit = KERNEL_IMAGE_SIZE;
+       else
+               mem_limit = MAXMEM;
+
        /* Record the various known unsafe memory ranges. */
        mem_avoid_init(input, input_size, *output);
 
@@ -917,6 +887,8 @@ void choose_random_location(unsigned long input,
         * location:
         */
        min_addr = min(*output, 512UL << 20);
+       /* Make sure minimum is aligned. */
+       min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN);
 
        /* Walk available memory entries to find a random address. */
        random_addr = find_random_phys_addr(min_addr, output_size);
index 726e264..3efce27 100644 (file)
@@ -70,8 +70,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize);
 int cmdline_find_option_bool(const char *option);
 
 struct mem_vector {
-       unsigned long long start;
-       unsigned long long size;
+       u64 start;
+       u64 size;
 };
 
 #if CONFIG_RANDOMIZE_BASE
index 2901d5d..83fc9d3 100644 (file)
 #define X86_FEATURE_MD_CLEAR           (18*32+10) /* VERW clears CPU buffers */
 #define X86_FEATURE_TSX_FORCE_ABORT    (18*32+13) /* "" TSX_FORCE_ABORT */
 #define X86_FEATURE_SERIALIZE          (18*32+14) /* SERIALIZE instruction */
+#define X86_FEATURE_TSXLDTRK           (18*32+16) /* TSX Suspend Load Address Tracking */
 #define X86_FEATURE_PCONFIG            (18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_ARCH_LBR           (18*32+19) /* Intel ARCH LBR */
 #define X86_FEATURE_SPEC_CTRL          (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
index 0a460f2..21a8b52 100644 (file)
@@ -602,9 +602,7 @@ static inline u64 xgetbv(u32 index)
 {
        u32 eax, edx;
 
-       asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
-                    : "=a" (eax), "=d" (edx)
-                    : "c" (index));
+       asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index));
        return eax + ((u64)edx << 32);
 }
 
@@ -613,8 +611,7 @@ static inline void xsetbv(u32 index, u64 value)
        u32 eax = value;
        u32 edx = value >> 32;
 
-       asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
-                    : : "a" (eax), "d" (edx), "c" (index));
+       asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index));
 }
 
 #endif /* _ASM_X86_FPU_INTERNAL_H */
index 5303dbc..d0f7723 100644 (file)
 #define KVM_REQ_HV_EXIT                        KVM_ARCH_REQ(21)
 #define KVM_REQ_HV_STIMER              KVM_ARCH_REQ(22)
 #define KVM_REQ_LOAD_EOI_EXITMAP       KVM_ARCH_REQ(23)
-#define KVM_REQ_GET_VMCS12_PAGES       KVM_ARCH_REQ(24)
+#define KVM_REQ_GET_NESTED_STATE_PAGES KVM_ARCH_REQ(24)
 #define KVM_REQ_APICV_UPDATE \
        KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_TLB_FLUSH_CURRENT      KVM_ARCH_REQ(26)
 #define KVM_REQ_HV_TLB_FLUSH \
        KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_APF_READY              KVM_ARCH_REQ(28)
+#define KVM_REQ_MSR_FILTER_CHANGED     KVM_ARCH_REQ(29)
 
 #define CR0_RESERVED_BITS                                               \
        (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -860,6 +861,13 @@ struct kvm_hv {
        struct kvm_hv_syndbg hv_syndbg;
 };
 
+struct msr_bitmap_range {
+       u32 flags;
+       u32 nmsrs;
+       u32 base;
+       unsigned long *bitmap;
+};
+
 enum kvm_irqchip_mode {
        KVM_IRQCHIP_NONE,
        KVM_IRQCHIP_KERNEL,       /* created with KVM_CREATE_IRQCHIP */
@@ -961,6 +969,15 @@ struct kvm_arch {
        bool guest_can_read_msr_platform_info;
        bool exception_payload_enabled;
 
+       /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
+       u32 user_space_msr_mask;
+
+       struct {
+               u8 count;
+               bool default_allow:1;
+               struct msr_bitmap_range ranges[16];
+       } msr_filter;
+
        struct kvm_pmu_event_filter *pmu_event_filter;
        struct task_struct *nx_lpage_recovery_thread;
 };
@@ -1143,7 +1160,12 @@ struct kvm_x86_ops {
        /* Returns actual tsc_offset set in active VMCS */
        u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
-       void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
+       /*
+        * Retrieve somewhat arbitrary exit information.  Intended to be used
+        * only from within tracepoints to avoid VMREADs when tracing is off.
+        */
+       void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+                             u32 *exit_int_info, u32 *exit_int_info_err_code);
 
        int (*check_intercept)(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
@@ -1221,12 +1243,13 @@ struct kvm_x86_ops {
 
        int (*get_msr_feature)(struct kvm_msr_entry *entry);
 
-       bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
+       bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len);
 
        bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
        int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
 
        void (*migrate_timers)(struct kvm_vcpu *vcpu);
+       void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_x86_nested_ops {
@@ -1238,7 +1261,7 @@ struct kvm_x86_nested_ops {
        int (*set_state)(struct kvm_vcpu *vcpu,
                         struct kvm_nested_state __user *user_kvm_nested_state,
                         struct kvm_nested_state *kvm_state);
-       bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+       bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu);
        int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
 
        int (*enable_evmcs)(struct kvm_vcpu *vcpu,
@@ -1612,8 +1635,8 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
                    unsigned long ipi_bitmap_high, u32 min,
                    unsigned long icr, int op_64_bit);
 
-void kvm_define_shared_msr(unsigned index, u32 msr);
-int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+void kvm_define_user_return_msr(unsigned index, u32 msr);
+int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
 
 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
index 59a3e13..5999b0b 100644 (file)
@@ -234,6 +234,12 @@ static inline void clwb(volatile void *__p)
 
 #define nop() asm volatile ("nop")
 
+static inline void serialize(void)
+{
+       /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */
+       asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory");
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_SPECIAL_INSNS_H */
index 8a1f538..71d630b 100644 (file)
@@ -3,10 +3,54 @@
 #define __SVM_H
 
 #include <uapi/asm/svm.h>
-
+#include <uapi/asm/kvm.h>
+
+/*
+ * 32-bit intercept words in the VMCB Control Area, starting
+ * at Byte offset 000h.
+ */
+
+enum intercept_words {
+       INTERCEPT_CR = 0,
+       INTERCEPT_DR,
+       INTERCEPT_EXCEPTION,
+       INTERCEPT_WORD3,
+       INTERCEPT_WORD4,
+       INTERCEPT_WORD5,
+       MAX_INTERCEPT,
+};
 
 enum {
-       INTERCEPT_INTR,
+       /* Byte offset 000h (word 0) */
+       INTERCEPT_CR0_READ = 0,
+       INTERCEPT_CR3_READ = 3,
+       INTERCEPT_CR4_READ = 4,
+       INTERCEPT_CR8_READ = 8,
+       INTERCEPT_CR0_WRITE = 16,
+       INTERCEPT_CR3_WRITE = 16 + 3,
+       INTERCEPT_CR4_WRITE = 16 + 4,
+       INTERCEPT_CR8_WRITE = 16 + 8,
+       /* Byte offset 004h (word 1) */
+       INTERCEPT_DR0_READ = 32,
+       INTERCEPT_DR1_READ,
+       INTERCEPT_DR2_READ,
+       INTERCEPT_DR3_READ,
+       INTERCEPT_DR4_READ,
+       INTERCEPT_DR5_READ,
+       INTERCEPT_DR6_READ,
+       INTERCEPT_DR7_READ,
+       INTERCEPT_DR0_WRITE = 48,
+       INTERCEPT_DR1_WRITE,
+       INTERCEPT_DR2_WRITE,
+       INTERCEPT_DR3_WRITE,
+       INTERCEPT_DR4_WRITE,
+       INTERCEPT_DR5_WRITE,
+       INTERCEPT_DR6_WRITE,
+       INTERCEPT_DR7_WRITE,
+       /* Byte offset 008h (word 2) */
+       INTERCEPT_EXCEPTION_OFFSET = 64,
+       /* Byte offset 00Ch (word 3) */
+       INTERCEPT_INTR = 96,
        INTERCEPT_NMI,
        INTERCEPT_SMI,
        INTERCEPT_INIT,
@@ -38,7 +82,8 @@ enum {
        INTERCEPT_TASK_SWITCH,
        INTERCEPT_FERR_FREEZE,
        INTERCEPT_SHUTDOWN,
-       INTERCEPT_VMRUN,
+       /* Byte offset 010h (word 4) */
+       INTERCEPT_VMRUN = 128,
        INTERCEPT_VMMCALL,
        INTERCEPT_VMLOAD,
        INTERCEPT_VMSAVE,
@@ -53,15 +98,18 @@ enum {
        INTERCEPT_MWAIT_COND,
        INTERCEPT_XSETBV,
        INTERCEPT_RDPRU,
+       /* Byte offset 014h (word 5) */
+       INTERCEPT_INVLPGB = 160,
+       INTERCEPT_INVLPGB_ILLEGAL,
+       INTERCEPT_INVPCID,
+       INTERCEPT_MCOMMIT,
+       INTERCEPT_TLBSYNC,
 };
 
 
 struct __attribute__ ((__packed__)) vmcb_control_area {
-       u32 intercept_cr;
-       u32 intercept_dr;
-       u32 intercept_exceptions;
-       u64 intercept;
-       u8 reserved_1[40];
+       u32 intercepts[MAX_INTERCEPT];
+       u32 reserved_1[15 - MAX_INTERCEPT];
        u16 pause_filter_thresh;
        u16 pause_filter_count;
        u64 iopm_base_pa;
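The separate intercept_cr/intercept_dr/intercept_exceptions/intercept fields collapse into a single intercepts[] array, and the enum values above now double as absolute bit positions across that array. That is what makes generic helpers possible; the sketch below shows the kind of set/test pair this layout enables (the names are illustrative and may differ from what arch/x86/kvm/svm/svm.h actually defines; <linux/bitops.h> and <linux/bug.h> are assumed).

	/* Set an intercept bit by its absolute position, e.g. INTERCEPT_INTR
	 * (bit 96) lands in intercepts[INTERCEPT_WORD3], bit 0. */
	static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
	{
	        WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
	        __set_bit(bit, (unsigned long *)&control->intercepts);
	}

	static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
	{
	        WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
	        return test_bit(bit, (unsigned long *)&control->intercepts);
	}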
@@ -150,14 +198,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define SVM_NESTED_CTL_NP_ENABLE       BIT(0)
 #define SVM_NESTED_CTL_SEV_ENABLE      BIT(1)
 
-struct __attribute__ ((__packed__)) vmcb_seg {
+struct vmcb_seg {
        u16 selector;
        u16 attrib;
        u32 limit;
        u64 base;
-};
+} __packed;
 
-struct __attribute__ ((__packed__)) vmcb_save_area {
+struct vmcb_save_area {
        struct vmcb_seg es;
        struct vmcb_seg cs;
        struct vmcb_seg ss;
@@ -200,20 +248,67 @@ struct __attribute__ ((__packed__)) vmcb_save_area {
        u64 br_to;
        u64 last_excp_from;
        u64 last_excp_to;
-};
 
+       /*
+        * The following part of the save area is valid only for
+        * SEV-ES guests when referenced through the GHCB.
+        */
+       u8 reserved_7[104];
+       u64 reserved_8;         /* rax already available at 0x01f8 */
+       u64 rcx;
+       u64 rdx;
+       u64 rbx;
+       u64 reserved_9;         /* rsp already available at 0x01d8 */
+       u64 rbp;
+       u64 rsi;
+       u64 rdi;
+       u64 r8;
+       u64 r9;
+       u64 r10;
+       u64 r11;
+       u64 r12;
+       u64 r13;
+       u64 r14;
+       u64 r15;
+       u8 reserved_10[16];
+       u64 sw_exit_code;
+       u64 sw_exit_info_1;
+       u64 sw_exit_info_2;
+       u64 sw_scratch;
+       u8 reserved_11[56];
+       u64 xcr0;
+       u8 valid_bitmap[16];
+       u64 x87_state_gpa;
+} __packed;
+
+struct ghcb {
+       struct vmcb_save_area save;
+       u8 reserved_save[2048 - sizeof(struct vmcb_save_area)];
+
+       u8 shared_buffer[2032];
+
+       u8 reserved_1[10];
+       u16 protocol_version;   /* negotiated SEV-ES/GHCB protocol version */
+       u32 ghcb_usage;
+} __packed;
+
+
+#define EXPECTED_VMCB_SAVE_AREA_SIZE           1032
+#define EXPECTED_VMCB_CONTROL_AREA_SIZE                256
+#define EXPECTED_GHCB_SIZE                     PAGE_SIZE
 
 static inline void __unused_size_checks(void)
 {
-       BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298);
-       BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256);
+       BUILD_BUG_ON(sizeof(struct vmcb_save_area)      != EXPECTED_VMCB_SAVE_AREA_SIZE);
+       BUILD_BUG_ON(sizeof(struct vmcb_control_area)   != EXPECTED_VMCB_CONTROL_AREA_SIZE);
+       BUILD_BUG_ON(sizeof(struct ghcb)                != EXPECTED_GHCB_SIZE);
 }
 
-struct __attribute__ ((__packed__)) vmcb {
+struct vmcb {
        struct vmcb_control_area control;
        u8 reserved_control[1024 - sizeof(struct vmcb_control_area)];
        struct vmcb_save_area save;
-};
+} __packed;
 
 #define SVM_CPUID_FUNC 0x8000000a
 
@@ -240,32 +335,6 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
 #define SVM_SELECTOR_CODE_MASK (1 << 3)
 
-#define INTERCEPT_CR0_READ     0
-#define INTERCEPT_CR3_READ     3
-#define INTERCEPT_CR4_READ     4
-#define INTERCEPT_CR8_READ     8
-#define INTERCEPT_CR0_WRITE    (16 + 0)
-#define INTERCEPT_CR3_WRITE    (16 + 3)
-#define INTERCEPT_CR4_WRITE    (16 + 4)
-#define INTERCEPT_CR8_WRITE    (16 + 8)
-
-#define INTERCEPT_DR0_READ     0
-#define INTERCEPT_DR1_READ     1
-#define INTERCEPT_DR2_READ     2
-#define INTERCEPT_DR3_READ     3
-#define INTERCEPT_DR4_READ     4
-#define INTERCEPT_DR5_READ     5
-#define INTERCEPT_DR6_READ     6
-#define INTERCEPT_DR7_READ     7
-#define INTERCEPT_DR0_WRITE    (16 + 0)
-#define INTERCEPT_DR1_WRITE    (16 + 1)
-#define INTERCEPT_DR2_WRITE    (16 + 2)
-#define INTERCEPT_DR3_WRITE    (16 + 3)
-#define INTERCEPT_DR4_WRITE    (16 + 4)
-#define INTERCEPT_DR5_WRITE    (16 + 5)
-#define INTERCEPT_DR6_WRITE    (16 + 6)
-#define INTERCEPT_DR7_WRITE    (16 + 7)
-
 #define SVM_EVTINJ_VEC_MASK 0xff
 
 #define SVM_EVTINJ_TYPE_SHIFT 8
@@ -298,4 +367,47 @@ struct __attribute__ ((__packed__)) vmcb {
 
 #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
 
+/* GHCB Accessor functions */
+
+#define GHCB_BITMAP_IDX(field)                                                 \
+       (offsetof(struct vmcb_save_area, field) / sizeof(u64))
+
+#define DEFINE_GHCB_ACCESSORS(field)                                           \
+       static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb)     \
+       {                                                                       \
+               return test_bit(GHCB_BITMAP_IDX(field),                         \
+                               (unsigned long *)&ghcb->save.valid_bitmap);     \
+       }                                                                       \
+                                                                               \
+       static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value)       \
+       {                                                                       \
+               __set_bit(GHCB_BITMAP_IDX(field),                               \
+                         (unsigned long *)&ghcb->save.valid_bitmap);           \
+               ghcb->save.field = value;                                       \
+       }
+
+DEFINE_GHCB_ACCESSORS(cpl)
+DEFINE_GHCB_ACCESSORS(rip)
+DEFINE_GHCB_ACCESSORS(rsp)
+DEFINE_GHCB_ACCESSORS(rax)
+DEFINE_GHCB_ACCESSORS(rcx)
+DEFINE_GHCB_ACCESSORS(rdx)
+DEFINE_GHCB_ACCESSORS(rbx)
+DEFINE_GHCB_ACCESSORS(rbp)
+DEFINE_GHCB_ACCESSORS(rsi)
+DEFINE_GHCB_ACCESSORS(rdi)
+DEFINE_GHCB_ACCESSORS(r8)
+DEFINE_GHCB_ACCESSORS(r9)
+DEFINE_GHCB_ACCESSORS(r10)
+DEFINE_GHCB_ACCESSORS(r11)
+DEFINE_GHCB_ACCESSORS(r12)
+DEFINE_GHCB_ACCESSORS(r13)
+DEFINE_GHCB_ACCESSORS(r14)
+DEFINE_GHCB_ACCESSORS(r15)
+DEFINE_GHCB_ACCESSORS(sw_exit_code)
+DEFINE_GHCB_ACCESSORS(sw_exit_info_1)
+DEFINE_GHCB_ACCESSORS(sw_exit_info_2)
+DEFINE_GHCB_ACCESSORS(sw_scratch)
+DEFINE_GHCB_ACCESSORS(xcr0)
+
 #endif
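DEFINE_GHCB_ACCESSORS() generates a ghcb_<field>_is_valid()/ghcb_set_<field>() pair per field, so writing a register and flagging it in valid_bitmap is a single call. Purely as an illustration (this caller is hypothetical and not taken from the SEV-ES patches), filling a GHCB for an MSR-style exit could look like:

	/* Hypothetical caller: populate the GHCB fields the other side may
	 * read, marking each one valid via the generated accessors. */
	static void ghcb_prepare_msr_exit(struct ghcb *ghcb, u64 exit_code,
	                                  u64 rcx, u64 rax, u64 rdx)
	{
	        ghcb_set_sw_exit_code(ghcb, exit_code);
	        ghcb_set_rcx(ghcb, rcx);        /* MSR index */
	        ghcb_set_rax(ghcb, rax);        /* low 32 bits of the value  */
	        ghcb_set_rdx(ghcb, rdx);        /* high 32 bits of the value */

	        /* The consumer can now check, e.g., ghcb_rcx_is_valid(ghcb). */
	}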
index fdb5b35..0fd4a9d 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/preempt.h>
 #include <asm/processor.h>
 #include <asm/cpufeature.h>
+#include <asm/special_insns.h>
 
 #ifdef CONFIG_X86_32
 static inline void iret_to_self(void)
@@ -46,22 +47,34 @@ static inline void iret_to_self(void)
  *
  *  b) Text was modified on a different CPU, may subsequently be
  *     executed on this CPU, and you want to make sure the new version
- *     gets executed.  This generally means you're calling this in a IPI.
+ *     gets executed.  This generally means you're calling this in an IPI.
  *
  * If you're calling this for a different reason, you're probably doing
  * it wrong.
+ *
+ * Like all of Linux's memory ordering operations, this is a
+ * compiler barrier as well.
  */
 static inline void sync_core(void)
 {
        /*
-        * There are quite a few ways to do this.  IRET-to-self is nice
-        * because it works on every CPU, at any CPL (so it's compatible
-        * with paravirtualization), and it never exits to a hypervisor.
-        * The only down sides are that it's a bit slow (it seems to be
-        * a bit more than 2x slower than the fastest options) and that
-        * it unmasks NMIs.  The "push %cs" is needed because, in
-        * paravirtual environments, __KERNEL_CS may not be a valid CS
-        * value when we do IRET directly.
+        * The SERIALIZE instruction is the most straightforward way to
+        * do this, but it is not universally available.
+        */
+       if (static_cpu_has(X86_FEATURE_SERIALIZE)) {
+               serialize();
+               return;
+       }
+
+       /*
+        * For all other processors, there are quite a few ways to do this.
+        * IRET-to-self is nice because it works on every CPU, at any CPL
+        * (so it's compatible with paravirtualization), and it never exits
+        * to a hypervisor.  The only downsides are that it's a bit slow
+        * (it seems to be a bit more than 2x slower than the fastest
+        * options) and that it unmasks NMIs.  The "push %cs" is needed,
+        * because in paravirtual environments __KERNEL_CS may not be a
+        * valid CS value when we do IRET directly.
         *
         * In case NMI unmasking or performance ever becomes a problem,
         * the next best option appears to be MOV-to-CR2 and an
@@ -71,9 +84,6 @@ static inline void sync_core(void)
         * CPUID is the conventional way, but it's nasty: it doesn't
         * exist on some 486-like CPUs, and it usually exits to a
         * hypervisor.
-        *
-        * Like all of Linux's memory ordering operations, this is a
-        * compiler barrier as well.
         */
        iret_to_self();
 }
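sync_core() now uses SERIALIZE when the CPU advertises it and falls back to IRET-to-self otherwise. The usual consumer is the cross-modifying-code case described in the comment above; a minimal sketch of that pattern follows (helper names other than sync_core() and on_each_cpu() are illustrative).

	static void do_sync_core(void *unused)
	{
	        /* Runs on each CPU: SERIALIZE if available, IRET-to-self otherwise. */
	        sync_core();
	}

	static void text_patch_finish(void)
	{
	        /*
	         * After modifying kernel text, make every CPU serialize before
	         * it can execute the new instructions; wait for all callbacks.
	         */
	        on_each_cpu(do_sync_core, NULL, 1);
	}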
index cd7de4b..f8ba528 100644 (file)
@@ -52,7 +52,7 @@
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES VMCS_CONTROL_BIT(VIRT_APIC_ACCESSES)
 #define SECONDARY_EXEC_ENABLE_EPT               VMCS_CONTROL_BIT(EPT)
 #define SECONDARY_EXEC_DESC                    VMCS_CONTROL_BIT(DESC_EXITING)
-#define SECONDARY_EXEC_RDTSCP                  VMCS_CONTROL_BIT(RDTSCP)
+#define SECONDARY_EXEC_ENABLE_RDTSCP           VMCS_CONTROL_BIT(RDTSCP)
 #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   VMCS_CONTROL_BIT(VIRTUAL_X2APIC)
 #define SECONDARY_EXEC_ENABLE_VPID              VMCS_CONTROL_BIT(VPID)
 #define SECONDARY_EXEC_WBINVD_EXITING          VMCS_CONTROL_BIT(WBINVD_EXITING)
index 0780f97..89e5f3d 100644 (file)
@@ -192,6 +192,26 @@ struct kvm_msr_list {
        __u32 indices[0];
 };
 
+/* Maximum size of any access bitmap in bytes */
+#define KVM_MSR_FILTER_MAX_BITMAP_SIZE 0x600
+
+/* for KVM_X86_SET_MSR_FILTER */
+struct kvm_msr_filter_range {
+#define KVM_MSR_FILTER_READ  (1 << 0)
+#define KVM_MSR_FILTER_WRITE (1 << 1)
+       __u32 flags;
+       __u32 nmsrs; /* number of msrs in bitmap */
+       __u32 base;  /* MSR index the bitmap starts at */
+       __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
+};
+
+#define KVM_MSR_FILTER_MAX_RANGES 16
+struct kvm_msr_filter {
+#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+#define KVM_MSR_FILTER_DEFAULT_DENY  (1 << 0)
+       __u32 flags;
+       struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
+};
 
 struct kvm_cpuid_entry {
        __u32 function;
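The new KVM_X86_SET_MSR_FILTER interface takes up to KVM_MSR_FILTER_MAX_RANGES bitmap ranges; with KVM_MSR_FILTER_DEFAULT_DENY, only MSRs whose bit is set in a matching range are handled normally by KVM, and everything else is rejected (or deflected to user space, depending on other capabilities). A user-space sketch, assuming a recent <linux/kvm.h> and an existing VM file descriptor; the MSR base chosen here is purely illustrative.

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int install_msr_filter(int vm_fd)
	{
	        /* One bit per MSR in the range; a set bit allows the access. */
	        static __u8 allow_bitmap[8];
	        struct kvm_msr_filter filter;

	        memset(allow_bitmap, 0xff, sizeof(allow_bitmap));
	        memset(&filter, 0, sizeof(filter));

	        filter.flags = KVM_MSR_FILTER_DEFAULT_DENY;
	        filter.ranges[0].flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE;
	        filter.ranges[0].base = 0xc0000100;     /* illustrative MSR index */
	        filter.ranges[0].nmsrs = 64;            /* 64 MSRs, all allowed */
	        filter.ranges[0].bitmap = allow_bitmap;

	        return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
	}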
index 2e8a30f..522d42d 100644 (file)
@@ -76,6 +76,7 @@
 #define SVM_EXIT_MWAIT_COND    0x08c
 #define SVM_EXIT_XSETBV        0x08d
 #define SVM_EXIT_RDPRU         0x08e
+#define SVM_EXIT_INVPCID       0x0a2
 #define SVM_EXIT_NPF           0x400
 #define SVM_EXIT_AVIC_INCOMPLETE_IPI           0x401
 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS     0x402
        { SVM_EXIT_MONITOR,     "monitor" }, \
        { SVM_EXIT_MWAIT,       "mwait" }, \
        { SVM_EXIT_XSETBV,      "xsetbv" }, \
+       { SVM_EXIT_INVPCID,     "invpcid" }, \
        { SVM_EXIT_NPF,         "npf" }, \
        { SVM_EXIT_AVIC_INCOMPLETE_IPI,         "avic_incomplete_ipi" }, \
        { SVM_EXIT_AVIC_UNACCELERATED_ACCESS,   "avic_unaccelerated_access" }, \
index 08320b0..42c6e0d 100644 (file)
@@ -270,9 +270,8 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
 {
        struct pt_regs *old_regs = set_irq_regs(regs);
        u32 token;
-       irqentry_state_t state;
 
-       state = irqentry_enter(regs);
+       ack_APIC_irq();
 
        inc_irq_stat(irq_hv_callback_count);
 
@@ -283,7 +282,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
                wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
        }
 
-       irqentry_exit(regs, state);
        set_irq_regs(old_regs);
 }
 
@@ -954,7 +952,7 @@ void arch_haltpoll_disable(unsigned int cpu)
        if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
                return;
 
-       /* Enable guest halt poll disables host halt poll */
+       /* Disabling guest halt polling re-enables host halt polling */
        smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
 }
 EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
index 8d5cbe1..2c304fd 100644 (file)
  * value that lies close to the top of the kernel memory. The limits for the GDT
  * and the IDT are set to zero.
  *
- * Given that SLDT and STR are not commonly used in programs that run on WineHQ
- * or DOSEMU2, they are not emulated.
- *
- * The instruction smsw is emulated to return the value that the register CR0
+ * The instruction SMSW is emulated to return the value that the register CR0
  * has at boot time, as set in head_32.
+ * SLDT and STR are emulated to return the values that the kernel programmatically
+ * assigns:
+ * - SLDT returns (GDT_ENTRY_LDT * 8) if an LDT has been set, 0 if not.
+ * - STR returns (GDT_ENTRY_TSS * 8).
  *
  * Emulation is provided for both 32-bit and 64-bit processes.
  *
@@ -244,16 +245,34 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst,
                *data_size += UMIP_GDT_IDT_LIMIT_SIZE;
                memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE);
 
-       } else if (umip_inst == UMIP_INST_SMSW) {
-               unsigned long dummy_value = CR0_STATE;
+       } else if (umip_inst == UMIP_INST_SMSW || umip_inst == UMIP_INST_SLDT ||
+                  umip_inst == UMIP_INST_STR) {
+               unsigned long dummy_value;
+
+               if (umip_inst == UMIP_INST_SMSW) {
+                       dummy_value = CR0_STATE;
+               } else if (umip_inst == UMIP_INST_STR) {
+                       dummy_value = GDT_ENTRY_TSS * 8;
+               } else if (umip_inst == UMIP_INST_SLDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+                       down_read(&current->mm->context.ldt_usr_sem);
+                       if (current->mm->context.ldt)
+                               dummy_value = GDT_ENTRY_LDT * 8;
+                       else
+                               dummy_value = 0;
+                       up_read(&current->mm->context.ldt_usr_sem);
+#else
+                       dummy_value = 0;
+#endif
+               }
 
                /*
-                * Even though the CR0 register has 4 bytes, the number
+                * For these 3 instructions, the number
                 * of bytes to be copied in the result buffer is determined
                 * by whether the operand is a register or a memory location.
                 * If operand is a register, return as many bytes as the operand
                 * size. If operand is memory, return only the two least
-                * siginificant bytes of CR0.
+                * significant bytes.
                 */
                if (X86_MODRM_MOD(insn->modrm.value) == 3)
                        *data_size = insn->opnd_bytes;
@@ -261,7 +280,6 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst,
                        *data_size = 2;
 
                memcpy(data, &dummy_value, *data_size);
-       /* STR and SLDT  are not emulated */
        } else {
                return -EINVAL;
        }
@@ -383,10 +401,6 @@ bool fixup_umip_exception(struct pt_regs *regs)
        umip_pr_warn(regs, "%s instruction cannot be used by applications.\n",
                        umip_insns[umip_inst]);
 
-       /* Do not emulate (spoof) SLDT or STR. */
-       if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT)
-               return false;
-
        umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n");
 
        if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size,
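With SLDT and STR now emulated, a process running under UMIP no longer takes SIGSEGV on them; it simply sees the kernel-chosen dummy selectors described in the comment above. A small user-space check, purely illustrative (the printed values depend on the kernel's GDT layout and on whether the process has an LDT installed):

	#include <stdio.h>

	int main(void)
	{
	        unsigned short tr, ldtr;

	        asm volatile("str %0" : "=r" (tr));     /* task register selector */
	        asm volatile("sldt %0" : "=r" (ldtr));  /* LDT selector, 0 if unused */

	        printf("STR  -> %#hx\n", tr);
	        printf("SLDT -> %#hx\n", ldtr);
	        return 0;
	}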
index 4a3081e..7f86a14 100644 (file)
@@ -17,7 +17,8 @@ kvm-y                 += x86.o emulate.o i8259.o irq.o lapic.o \
                           i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
                           hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
 
-kvm-intel-y            += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
+kvm-intel-y            += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
+                          vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
 kvm-amd-y              += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
 
 obj-$(CONFIG_KVM)      += kvm.o
index 3fd6eec..37c3668 100644 (file)
@@ -186,7 +186,6 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
 not_found:
        return 36;
 }
-EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
 
 /* when an old userspace process fills a new kernel module */
 int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
@@ -371,7 +370,7 @@ void kvm_set_cpu_caps(void)
                F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
                F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
                F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
-               F(SERIALIZE)
+               F(SERIALIZE) | F(TSXLDTRK)
        );
 
        /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
index 3a923ae..1d2c4f2 100644 (file)
@@ -34,6 +34,11 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
        return vcpu->arch.maxphyaddr;
 }
 
+static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+       return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
+}
+
 struct cpuid_reg {
        u32 function;
        u32 index;
index 5299ef5..0cc0db5 100644 (file)
@@ -2505,9 +2505,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
                *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
 
        val = GET_SMSTATE(u32, smstate, 0x7fcc);
-       ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+
+       if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+               return X86EMUL_UNHANDLEABLE;
+
        val = GET_SMSTATE(u32, smstate, 0x7fc8);
-       ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+       if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+               return X86EMUL_UNHANDLEABLE;
 
        selector =                 GET_SMSTATE(u32, smstate, 0x7fc4);
        set_desc_base(&desc,       GET_SMSTATE(u32, smstate, 0x7f64));
@@ -2560,16 +2565,23 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
        ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
 
        val = GET_SMSTATE(u32, smstate, 0x7f68);
-       ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+
+       if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+               return X86EMUL_UNHANDLEABLE;
+
        val = GET_SMSTATE(u32, smstate, 0x7f60);
-       ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+       if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+               return X86EMUL_UNHANDLEABLE;
 
        cr0 =                       GET_SMSTATE(u64, smstate, 0x7f58);
        cr3 =                       GET_SMSTATE(u64, smstate, 0x7f50);
        cr4 =                       GET_SMSTATE(u64, smstate, 0x7f48);
        ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00));
        val =                       GET_SMSTATE(u64, smstate, 0x7ed0);
-       ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
+
+       if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA))
+               return X86EMUL_UNHANDLEABLE;
 
        selector =                  GET_SMSTATE(u32, smstate, 0x7e90);
        rsm_set_desc_flags(&desc,   GET_SMSTATE(u32, smstate, 0x7e92) << 8);
@@ -3594,7 +3606,7 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt)
        u64 tsc_aux = 0;
 
        if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
-               return emulate_gp(ctxt, 0);
+               return emulate_ud(ctxt);
        ctxt->dst.val = tsc_aux;
        return X86EMUL_CONTINUE;
 }
@@ -3689,11 +3701,18 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt)
 
 static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
 {
+       u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
        u64 msr_data;
+       int r;
 
        msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
                | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
-       if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
+       r = ctxt->ops->set_msr(ctxt, msr_index, msr_data);
+
+       if (r == X86EMUL_IO_NEEDED)
+               return r;
+
+       if (r)
                return emulate_gp(ctxt, 0);
 
        return X86EMUL_CONTINUE;
@@ -3701,9 +3720,16 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
 
 static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
 {
+       u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
        u64 msr_data;
+       int r;
+
+       r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data);
+
+       if (r == X86EMUL_IO_NEEDED)
+               return r;
 
-       if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data))
+       if (r)
                return emulate_gp(ctxt, 0);
 
        *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
index 1d33056..67a4f60 100644 (file)
@@ -633,6 +633,11 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
 {
        union hv_stimer_config new_config = {.as_uint64 = config},
                old_config = {.as_uint64 = stimer->config.as_uint64};
+       struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+       struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+       if (!synic->active && !host)
+               return 1;
 
        trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
                                       stimer->index, config, host);
@@ -652,6 +657,12 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
 static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
                            bool host)
 {
+       struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+       struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+       if (!synic->active && !host)
+               return 1;
+
        trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
                                      stimer->index, count, host);
 
index 35cca2e..105e785 100644 (file)
@@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
        atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
 }
 
+static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
+{
+       kvm_lapic_set_reg(apic, APIC_DFR, val);
+       atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
 static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
 {
        return ((id >> 4) << 16) | (1 << (id & 0xf));
@@ -488,6 +494,12 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
        }
 }
 
+void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
+{
+       apic_clear_irr(vec, vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
+
 static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
 {
        struct kvm_vcpu *vcpu;
@@ -1576,9 +1588,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
        struct kvm_lapic *apic = vcpu->arch.apic;
        u64 guest_tsc, tsc_deadline;
 
-       if (apic->lapic_timer.expired_tscdeadline == 0)
-               return;
-
        tsc_deadline = apic->lapic_timer.expired_tscdeadline;
        apic->lapic_timer.expired_tscdeadline = 0;
        guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
@@ -1593,7 +1602,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 
 void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 {
-       if (lapic_timer_int_injected(vcpu))
+       if (lapic_in_kernel(vcpu) &&
+           vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
+           vcpu->arch.apic->lapic_timer.timer_advance_ns &&
+           lapic_timer_int_injected(vcpu))
                __kvm_wait_lapic_expire(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
@@ -1629,14 +1641,15 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
        }
 
        if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
-               if (apic->lapic_timer.timer_advance_ns)
-                       __kvm_wait_lapic_expire(vcpu);
+               kvm_wait_lapic_expire(vcpu);
                kvm_apic_inject_pending_timer_irqs(apic);
                return;
        }
 
        atomic_inc(&apic->lapic_timer.pending);
-       kvm_set_pending_timer(vcpu);
+       kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+       if (from_timer_fn)
+               kvm_vcpu_kick(vcpu);
 }
 
 static void start_sw_tscdeadline(struct kvm_lapic *apic)
@@ -1984,10 +1997,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
                break;
 
        case APIC_DFR:
-               if (!apic_x2apic_mode(apic)) {
-                       kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
-                       atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
-               } else
+               if (!apic_x2apic_mode(apic))
+                       kvm_apic_set_dfr(apic, val | 0x0FFFFFFF);
+               else
                        ret = 1;
                break;
 
@@ -2183,8 +2195,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (!lapic_in_kernel(vcpu) ||
-               !apic_lvtt_tscdeadline(apic))
+       if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
                return 0;
 
        return apic->lapic_timer.tscdeadline;
@@ -2194,8 +2205,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) ||
-                       apic_lvtt_period(apic))
+       if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
                return;
 
        hrtimer_cancel(&apic->lapic_timer.timer);
@@ -2303,7 +2313,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
                             SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
        apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
 
-       kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU);
+       kvm_apic_set_dfr(apic, 0xffffffffU);
        apic_set_spiv(apic, 0xff);
        kvm_lapic_set_reg(apic, APIC_TASKPRI, 0);
        if (!apic_x2apic_mode(apic))
@@ -2461,6 +2471,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
        __apic_update_ppr(apic, &ppr);
        return apic_has_interrupt_for_ppr(apic, ppr);
 }
+EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt);
 
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 {
index 754f29b..4fb86e3 100644 (file)
@@ -89,6 +89,7 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                           int shorthand, unsigned int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
+void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
 bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
 bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
 void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
index 5efc608..9c4a9c8 100644 (file)
@@ -155,11 +155,6 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
        return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
 
-static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
-        return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
-}
-
 /*
  * Check if a given access (described through the I/D, W/R and U/S bits of a
  * page fault error code pfec) causes a permission fault with the given PTE
index 43fdb0c..32e0e5c 100644 (file)
@@ -198,17 +198,20 @@ module_param(dbg, bool, 0644);
 #define PTE_LIST_EXT 3
 
 /*
- * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * Return values of handle_mmio_page_fault(), mmu.page_fault(), and fast_page_fault().
+ *
  * RET_PF_RETRY: let CPU fault again on the address.
  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
- *
- * For handle_mmio_page_fault only:
  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ * RET_PF_FIXED: The faulting entry has been fixed.
+ * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
  */
 enum {
        RET_PF_RETRY = 0,
-       RET_PF_EMULATE = 1,
-       RET_PF_INVALID = 2,
+       RET_PF_EMULATE,
+       RET_PF_INVALID,
+       RET_PF_FIXED,
+       RET_PF_SPURIOUS,
 };
 
 struct pte_list_desc {
@@ -521,7 +524,7 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
                                   struct x86_exception *exception)
 {
        /* Check if guest physical address doesn't exceed guest maximum */
-       if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) {
+       if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) {
                exception->error_code |= PFERR_RSVD_MASK;
                return UNMAPPED_GVA;
        }
@@ -2469,7 +2472,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                }
 
                if (sp->unsync_children)
-                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+                       kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
 
                __clear_sp_write_flooding_count(sp);
 
@@ -2615,8 +2618,9 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        }
 }
 
-static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
-                            u64 *spte)
+/* Returns the number of zapped non-leaf child shadow pages. */
+static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+                           u64 *spte, struct list_head *invalid_list)
 {
        u64 pte;
        struct kvm_mmu_page *child;
@@ -2630,23 +2634,34 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
                } else {
                        child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
                        drop_parent_pte(child, spte);
-               }
-               return true;
-       }
 
-       if (is_mmio_spte(pte))
+                       /*
+                        * Recursively zap nested TDP SPs; parentless SPs are
+                        * unlikely to be used again in the near future.  This
+                        * avoids retaining a large number of stale nested SPs.
+                        */
+                       if (tdp_enabled && invalid_list &&
+                           child->role.guest_mode && !child->parent_ptes.val)
+                               return kvm_mmu_prepare_zap_page(kvm, child,
+                                                               invalid_list);
+               }
+       } else if (is_mmio_spte(pte)) {
                mmu_spte_clear_no_track(spte);
-
-       return false;
+       }
+       return 0;
 }
 
-static void kvm_mmu_page_unlink_children(struct kvm *kvm,
-                                        struct kvm_mmu_page *sp)
+static int kvm_mmu_page_unlink_children(struct kvm *kvm,
+                                       struct kvm_mmu_page *sp,
+                                       struct list_head *invalid_list)
 {
+       int zapped = 0;
        unsigned i;
 
        for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-               mmu_page_zap_pte(kvm, sp, sp->spt + i);
+               zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
+
+       return zapped;
 }
 
 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -2692,7 +2707,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
        trace_kvm_mmu_prepare_zap_page(sp);
        ++kvm->stat.mmu_shadow_zapped;
        *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
-       kvm_mmu_page_unlink_children(kvm, sp);
+       *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
        kvm_mmu_unlink_parents(kvm, sp);
 
        /* Zapping children means active_mmu_pages has become unstable. */
@@ -2970,6 +2985,7 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
 /* Bits which may be returned by set_spte() */
 #define SET_SPTE_WRITE_PROTECTED_PT    BIT(0)
 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
+#define SET_SPTE_SPURIOUS              BIT(2)
 
 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                    unsigned int pte_access, int level,
@@ -3058,20 +3074,22 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                spte = mark_spte_for_access_track(spte);
 
 set_pte:
-       if (mmu_spte_update(sptep, spte))
+       if (*sptep == spte)
+               ret |= SET_SPTE_SPURIOUS;
+       else if (mmu_spte_update(sptep, spte))
                ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
        return ret;
 }
 
 static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
-                       unsigned int pte_access, int write_fault, int level,
+                       unsigned int pte_access, bool write_fault, int level,
                        gfn_t gfn, kvm_pfn_t pfn, bool speculative,
                        bool host_writable)
 {
        int was_rmapped = 0;
        int rmap_count;
        int set_spte_ret;
-       int ret = RET_PF_RETRY;
+       int ret = RET_PF_FIXED;
        bool flush = false;
 
        pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
@@ -3113,6 +3131,15 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        if (unlikely(is_mmio_spte(*sptep)))
                ret = RET_PF_EMULATE;
 
+       /*
+        * The fault is fully spurious if and only if the new SPTE and old SPTE
+        * are identical, and emulation is not required.
+        */
+       if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) {
+               WARN_ON_ONCE(!was_rmapped);
+               return RET_PF_SPURIOUS;
+       }
+
        pgprintk("%s: setting spte %llx\n", __func__, *sptep);
        trace_kvm_mmu_set_spte(level, gfn, sptep);
        if (!was_rmapped && is_large_pte(*sptep))
@@ -3161,7 +3188,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
                return -1;
 
        for (i = 0; i < ret; i++, gfn++, start++) {
-               mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
+               mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn,
                             page_to_pfn(pages[i]), true, true);
                put_page(pages[i]);
        }
@@ -3240,7 +3267,8 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
 }
 
 static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
-                                  int max_level, kvm_pfn_t *pfnp)
+                                  int max_level, kvm_pfn_t *pfnp,
+                                  bool huge_page_disallowed, int *req_level)
 {
        struct kvm_memory_slot *slot;
        struct kvm_lpage_info *linfo;
@@ -3248,6 +3276,8 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
        kvm_pfn_t mask;
        int level;
 
+       *req_level = PG_LEVEL_4K;
+
        if (unlikely(max_level == PG_LEVEL_4K))
                return PG_LEVEL_4K;
 
@@ -3272,7 +3302,14 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
        if (level == PG_LEVEL_4K)
                return level;
 
-       level = min(level, max_level);
+       *req_level = level = min(level, max_level);
+
+       /*
+        * Enforce the iTLB multihit workaround after capturing the requested
+        * level, which will be used to do precise, accurate accounting.
+        */
+       if (huge_page_disallowed)
+               return PG_LEVEL_4K;
 
        /*
         * mmu_notifier_retry() was successful and mmu_lock is held, so
@@ -3292,7 +3329,6 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
        u64 spte = *it.sptep;
 
        if (it.level == level && level > PG_LEVEL_4K &&
-           is_nx_huge_page_enabled() &&
            is_shadow_present_pte(spte) &&
            !is_large_pte(spte)) {
                /*
@@ -3308,20 +3344,25 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
        }
 }
 
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                        int map_writable, int max_level, kvm_pfn_t pfn,
-                       bool prefault, bool account_disallowed_nx_lpage)
+                       bool prefault, bool is_tdp)
 {
+       bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
+       bool write = error_code & PFERR_WRITE_MASK;
+       bool exec = error_code & PFERR_FETCH_MASK;
+       bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
        struct kvm_shadow_walk_iterator it;
        struct kvm_mmu_page *sp;
-       int level, ret;
+       int level, req_level, ret;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        gfn_t base_gfn = gfn;
 
        if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;
 
-       level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn);
+       level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
+                                       huge_page_disallowed, &req_level);
 
        trace_kvm_mmu_spte_requested(gpa, level, pfn);
        for_each_shadow_entry(vcpu, gpa, it) {
@@ -3329,7 +3370,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
                 * We cannot overwrite existing page tables with an NX
                 * large page, as the leaf could be executable.
                 */
-               disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+               if (nx_huge_page_workaround_enabled)
+                       disallowed_hugepage_adjust(it, gfn, &pfn, &level);
 
                base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
                if (it.level == level)
@@ -3341,7 +3383,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
                                              it.level - 1, true, ACC_ALL);
 
                        link_shadow_page(vcpu, it.sptep, sp);
-                       if (account_disallowed_nx_lpage)
+                       if (is_tdp && huge_page_disallowed &&
+                           req_level >= it.level)
                                account_huge_nx_page(vcpu->kvm, sp);
                }
        }
@@ -3349,6 +3392,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
        ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
                           write, level, base_gfn, pfn, prefault,
                           map_writable);
+       if (ret == RET_PF_SPURIOUS)
+               return ret;
+
        direct_pte_prefetch(vcpu, it.sptep);
        ++vcpu->stat.pf_fixed;
        return ret;
@@ -3479,21 +3525,19 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
 }
 
 /*
- * Return value:
- * - true: let the vcpu to access on the same address again.
- * - false: let the real page fault path to fix it.
+ * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
  */
-static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-                           u32 error_code)
+static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                          u32 error_code)
 {
        struct kvm_shadow_walk_iterator iterator;
        struct kvm_mmu_page *sp;
-       bool fault_handled = false;
+       int ret = RET_PF_INVALID;
        u64 spte = 0ull;
        uint retry_count = 0;
 
        if (!page_fault_can_be_fast(error_code))
-               return false;
+               return ret;
 
        walk_shadow_page_lockless_begin(vcpu);
 
@@ -3519,7 +3563,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                 * they are always ACC_ALL.
                 */
                if (is_access_allowed(error_code, spte)) {
-                       fault_handled = true;
+                       ret = RET_PF_SPURIOUS;
                        break;
                }
 
@@ -3562,11 +3606,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                 * since the gfn is not stable for indirect shadow page. See
                 * Documentation/virt/kvm/locking.rst to get more detail.
                 */
-               fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
-                                                       iterator.sptep, spte,
-                                                       new_spte);
-               if (fault_handled)
+               if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte,
+                                           new_spte)) {
+                       ret = RET_PF_FIXED;
                        break;
+               }
 
                if (++retry_count > 4) {
                        printk_once(KERN_WARNING
@@ -3577,10 +3621,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        } while (true);
 
        trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
-                             spte, fault_handled);
+                             spte, ret);
        walk_shadow_page_lockless_end(vcpu);
 
-       return fault_handled;
+       return ret;
 }
 
 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
@@ -3603,6 +3647,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                        ulong roots_to_free)
 {
+       struct kvm *kvm = vcpu->kvm;
        int i;
        LIST_HEAD(invalid_list);
        bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
@@ -3620,22 +3665,21 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                        return;
        }
 
-       spin_lock(&vcpu->kvm->mmu_lock);
+       spin_lock(&kvm->mmu_lock);
 
        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
                if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
-                       mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
+                       mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
                                           &invalid_list);
 
        if (free_active_root) {
                if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
                    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
-                       mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
-                                          &invalid_list);
+                       mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
                } else {
                        for (i = 0; i < 4; ++i)
                                if (mmu->pae_root[i] != 0)
-                                       mmu_free_root_page(vcpu->kvm,
+                                       mmu_free_root_page(kvm,
                                                           &mmu->pae_root[i],
                                                           &invalid_list);
                        mmu->root_hpa = INVALID_PAGE;
@@ -3643,8 +3687,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                mmu->root_pgd = 0;
        }
 
-       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-       spin_unlock(&vcpu->kvm->mmu_lock);
+       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+       spin_unlock(&kvm->mmu_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
 
@@ -4080,8 +4124,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                             bool prefault, int max_level, bool is_tdp)
 {
        bool write = error_code & PFERR_WRITE_MASK;
-       bool exec = error_code & PFERR_FETCH_MASK;
-       bool lpage_disallowed = exec && is_nx_huge_page_enabled();
        bool map_writable;
 
        gfn_t gfn = gpa >> PAGE_SHIFT;
@@ -4092,16 +4134,14 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        if (page_fault_handle_page_track(vcpu, error_code, gfn))
                return RET_PF_EMULATE;
 
-       if (fast_page_fault(vcpu, gpa, error_code))
-               return RET_PF_RETRY;
+       r = fast_page_fault(vcpu, gpa, error_code);
+       if (r != RET_PF_INVALID)
+               return r;
 
        r = mmu_topup_memory_caches(vcpu, false);
        if (r)
                return r;
 
-       if (lpage_disallowed)
-               max_level = PG_LEVEL_4K;
-
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
 
@@ -4118,8 +4158,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        r = make_mmu_pages_available(vcpu);
        if (r)
                goto out_unlock;
-       r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn,
-                        prefault, is_tdp && lpage_disallowed);
+       r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
+                        prefault, is_tdp);
 
 out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
@@ -5400,7 +5440,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                        u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
 
                        entry = *spte;
-                       mmu_page_zap_pte(vcpu->kvm, sp, spte);
+                       mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
                        if (gentry &&
                            !((sp->role.word ^ base_role) & ~role_ign.word) &&
                            rmap_can_add(vcpu))
@@ -5450,13 +5490,14 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
        if (r == RET_PF_INVALID) {
                r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
                                          lower_32_bits(error_code), false);
-               WARN_ON(r == RET_PF_INVALID);
+               if (WARN_ON_ONCE(r == RET_PF_INVALID))
+                       return -EIO;
        }
 
-       if (r == RET_PF_RETRY)
-               return 1;
        if (r < 0)
                return r;
+       if (r != RET_PF_EMULATE)
+               return 1;
 
        /*
         * Before emulating the instruction, check if the error code
@@ -5485,18 +5526,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
        if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
                emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
 emulate:
-       /*
-        * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
-        * This can happen if a guest gets a page-fault on data access but the HW
-        * table walker is not able to read the instruction page (e.g instruction
-        * page is not present in memory). In those cases we simply restart the
-        * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
-        */
-       if (unlikely(insn && !insn_len)) {
-               if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu))
-                       return 1;
-       }
-
        return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
                                       insn_len);
 }
@@ -5682,11 +5711,17 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
        free_page((unsigned long)mmu->lm_root);
 }
 
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
 {
        struct page *page;
        int i;
 
+       mmu->root_hpa = INVALID_PAGE;
+       mmu->root_pgd = 0;
+       mmu->translate_gpa = translate_gpa;
+       for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+               mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
        /*
         * When using PAE paging, the four PDPTEs are treated as 'root' pages,
         * while the PDP table is a per-vCPU construct that's allocated at MMU
@@ -5712,7 +5747,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
 
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
-       uint i;
        int ret;
 
        vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
@@ -5726,25 +5760,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
        vcpu->arch.mmu = &vcpu->arch.root_mmu;
        vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 
-       vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
-       vcpu->arch.root_mmu.root_pgd = 0;
-       vcpu->arch.root_mmu.translate_gpa = translate_gpa;
-       for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-               vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-
-       vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
-       vcpu->arch.guest_mmu.root_pgd = 0;
-       vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
-       for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-               vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-
        vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 
-       ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu);
+       ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
        if (ret)
                return ret;
 
-       ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu);
+       ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
        if (ret)
                goto fail_allocate_root;
 
@@ -6357,7 +6379,10 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
 
        ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
        to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
-       while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+       for ( ; to_zap; --to_zap) {
+               if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
+                       break;
+
                /*
                 * We use a separate list instead of just using active_mmu_pages
                 * because the number of lpage_disallowed pages is expected to
@@ -6370,12 +6395,12 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
                kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
                WARN_ON_ONCE(sp->lpage_disallowed);
 
-               if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+               if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
                        kvm_mmu_commit_zap_page(kvm, &invalid_list);
-                       if (to_zap)
-                               cond_resched_lock(&kvm->mmu_lock);
+                       cond_resched_lock(&kvm->mmu_lock);
                }
        }
+       kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
        spin_unlock(&kvm->mmu_lock);
        srcu_read_unlock(&kvm->srcu, rcu_idx);
index 9d15bc0..2080f9c 100644 (file)
@@ -244,14 +244,11 @@ TRACE_EVENT(
                  __entry->access)
 );
 
-#define __spte_satisfied(__spte)                               \
-       (__entry->retry && is_writable_pte(__entry->__spte))
-
 TRACE_EVENT(
        fast_page_fault,
        TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
-                u64 *sptep, u64 old_spte, bool retry),
-       TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry),
+                u64 *sptep, u64 old_spte, int ret),
+       TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret),
 
        TP_STRUCT__entry(
                __field(int, vcpu_id)
@@ -260,7 +257,7 @@ TRACE_EVENT(
                __field(u64 *, sptep)
                __field(u64, old_spte)
                __field(u64, new_spte)
-               __field(bool, retry)
+               __field(int, ret)
        ),
 
        TP_fast_assign(
@@ -270,7 +267,7 @@ TRACE_EVENT(
                __entry->sptep = sptep;
                __entry->old_spte = old_spte;
                __entry->new_spte = *sptep;
-               __entry->retry = retry;
+               __entry->ret = ret;
        ),
 
        TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
@@ -278,7 +275,7 @@ TRACE_EVENT(
                  __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
                  kvm_mmu_trace_pferr_flags), __entry->sptep,
                  __entry->old_spte, __entry->new_spte,
-                 __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+                 __entry->ret == RET_PF_SPURIOUS, __entry->ret == RET_PF_FIXED
        )
 );
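With the fast_page_fault tracepoint now carrying the RET_PF_* return value instead of a retry flag, the "spurious" and "fixed" columns of the trace output become simple equality tests on that value. A small self-contained sketch of how both booleans fall out of one enum result; the enum values here are illustrative, not the kernel's exact definitions:

#include <stdio.h>
#include <stdbool.h>

/* Illustrative stand-ins for the RET_PF_* values used by the MMU. */
enum fault_result {
	RET_PF_RETRY,		/* let the guest retry the access */
	RET_PF_EMULATE,		/* fall back to instruction emulation */
	RET_PF_INVALID,		/* the SPTE needs the full fault path */
	RET_PF_FIXED,		/* the fault was fixed by this handler */
	RET_PF_SPURIOUS,	/* the SPTE was already up to date */
};

static void trace_fast_page_fault(enum fault_result ret)
{
	/* Mirrors the TP_printk: derive both flags from the one return code. */
	bool spurious = (ret == RET_PF_SPURIOUS);
	bool fixed = (ret == RET_PF_FIXED);

	printf("fast_page_fault: spurious=%d fixed=%d\n", spurious, fixed);
}

int main(void)
{
	trace_fast_page_fault(RET_PF_FIXED);
	trace_fast_page_fault(RET_PF_SPURIOUS);
	trace_fast_page_fault(RET_PF_RETRY);
	return 0;
}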
 
index 4dd6b1e..9a1a15f 100644 (file)
@@ -550,7 +550,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
         * we call mmu_set_spte() with host_writable = true because
         * pte_prefetch_gfn_to_pfn always gets a writable pfn.
         */
-       mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn,
+       mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn,
                     true, true);
 
        kvm_release_pfn_clean(pfn);
@@ -625,15 +625,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
  * emulate this operation, return 1 to indicate this case.
  */
 static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
-                        struct guest_walker *gw,
-                        int write_fault, int max_level,
-                        kvm_pfn_t pfn, bool map_writable, bool prefault,
-                        bool lpage_disallowed)
+                        struct guest_walker *gw, u32 error_code,
+                        int max_level, kvm_pfn_t pfn, bool map_writable,
+                        bool prefault)
 {
+       bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
+       bool write_fault = error_code & PFERR_WRITE_MASK;
+       bool exec = error_code & PFERR_FETCH_MASK;
+       bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
        struct kvm_mmu_page *sp = NULL;
        struct kvm_shadow_walk_iterator it;
        unsigned direct_access, access = gw->pt_access;
-       int top_level, hlevel, ret;
+       int top_level, level, req_level, ret;
        gfn_t base_gfn = gw->gfn;
 
        direct_access = gw->pte_access;
@@ -679,7 +682,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
                        link_shadow_page(vcpu, it.sptep, sp);
        }
 
-       hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn);
+       level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn,
+                                       huge_page_disallowed, &req_level);
 
        trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
 
@@ -690,10 +694,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
                 * We cannot overwrite existing page tables with an NX
                 * large page, as the leaf could be executable.
                 */
-               disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel);
+               if (nx_huge_page_workaround_enabled)
+                       disallowed_hugepage_adjust(it, gw->gfn, &pfn, &level);
 
                base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
-               if (it.level == hlevel)
+               if (it.level == level)
                        break;
 
                validate_direct_spte(vcpu, it.sptep, direct_access);
@@ -704,13 +709,16 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
                        sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
                                              it.level - 1, true, direct_access);
                        link_shadow_page(vcpu, it.sptep, sp);
-                       if (lpage_disallowed)
+                       if (huge_page_disallowed && req_level >= it.level)
                                account_huge_nx_page(vcpu->kvm, sp);
                }
        }
 
        ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
                           it.level, base_gfn, pfn, prefault, map_writable);
+       if (ret == RET_PF_SPURIOUS)
+               return ret;
+
        FNAME(pte_prefetch)(vcpu, gw, it.sptep);
        ++vcpu->stat.pf_fixed;
        return ret;
@@ -738,7 +746,7 @@ out_gpte_changed:
  */
 static bool
 FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
-                             struct guest_walker *walker, int user_fault,
+                             struct guest_walker *walker, bool user_fault,
                              bool *write_fault_to_shadow_pgtable)
 {
        int level;
@@ -776,15 +784,13 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
 static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
                             bool prefault)
 {
-       int write_fault = error_code & PFERR_WRITE_MASK;
-       int user_fault = error_code & PFERR_USER_MASK;
+       bool write_fault = error_code & PFERR_WRITE_MASK;
+       bool user_fault = error_code & PFERR_USER_MASK;
        struct guest_walker walker;
        int r;
        kvm_pfn_t pfn;
        unsigned long mmu_seq;
        bool map_writable, is_self_change_mapping;
-       bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
-                               is_nx_huge_page_enabled();
        int max_level;
 
        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -825,7 +831,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
        is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
              &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
 
-       if (lpage_disallowed || is_self_change_mapping)
+       if (is_self_change_mapping)
                max_level = PG_LEVEL_4K;
        else
                max_level = walker.level;
@@ -869,8 +875,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
        r = make_mmu_pages_available(vcpu);
        if (r)
                goto out_unlock;
-       r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn,
-                        map_writable, prefault, lpage_disallowed);
+       r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn,
+                        map_writable, prefault);
        kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 
 out_unlock:
@@ -895,6 +901,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
 {
        struct kvm_shadow_walk_iterator iterator;
        struct kvm_mmu_page *sp;
+       u64 old_spte;
        int level;
        u64 *sptep;
 
@@ -917,7 +924,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
                sptep = iterator.sptep;
 
                sp = sptep_to_sp(sptep);
-               if (is_last_spte(*sptep, level)) {
+               old_spte = *sptep;
+               if (is_last_spte(old_spte, level)) {
                        pt_element_t gpte;
                        gpa_t pte_gpa;
 
@@ -927,7 +935,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
                        pte_gpa = FNAME(get_level1_sp_gpa)(sp);
                        pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
-                       if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
+                       mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
+                       if (is_shadow_present_pte(old_spte))
                                kvm_flush_remote_tlbs_with_address(vcpu->kvm,
                                        sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
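FNAME(fetch) now receives the raw page-fault error code and derives write_fault, exec and huge_page_disallowed itself, instead of having the caller pre-compute lpage_disallowed. A stand-alone sketch of that decoding; the bit positions follow the architectural x86 page-fault error code (W/R in bit 1, U/S in bit 2, I/D in bit 4), and the workaround flag is a made-up global standing in for is_nx_huge_page_enabled():

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Architectural x86 page-fault error-code bits. */
#define PFERR_WRITE_MASK	(1u << 1)	/* W/R: fault was a write   */
#define PFERR_USER_MASK		(1u << 2)	/* U/S: fault from CPL 3    */
#define PFERR_FETCH_MASK	(1u << 4)	/* I/D: instruction fetch   */

static bool nx_huge_page_workaround = true;	/* illustrative */

static void decode_fault(uint32_t error_code)
{
	bool write_fault = error_code & PFERR_WRITE_MASK;
	bool user_fault  = error_code & PFERR_USER_MASK;
	bool exec        = error_code & PFERR_FETCH_MASK;

	/*
	 * Huge pages are refused only for executable mappings and only while
	 * the NX-huge-page workaround is active, as in the hunk above.
	 */
	bool huge_page_disallowed = exec && nx_huge_page_workaround;

	printf("err=%#x write=%d user=%d exec=%d huge_disallowed=%d\n",
	       (unsigned)error_code, write_fault, user_fault, exec,
	       huge_page_disallowed);
}

int main(void)
{
	decode_fault(PFERR_WRITE_MASK | PFERR_USER_MASK);
	decode_fault(PFERR_FETCH_MASK);
	return 0;
}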
 
index ac830cd..f73f84d 100644 (file)
@@ -153,20 +153,18 @@ int avic_vm_init(struct kvm *kvm)
                return 0;
 
        /* Allocating physical APIC ID table (4KB) */
-       p_page = alloc_page(GFP_KERNEL_ACCOUNT);
+       p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!p_page)
                goto free_avic;
 
        kvm_svm->avic_physical_id_table_page = p_page;
-       clear_page(page_address(p_page));
 
        /* Allocating logical APIC ID table (4KB) */
-       l_page = alloc_page(GFP_KERNEL_ACCOUNT);
+       l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!l_page)
                goto free_avic;
 
        kvm_svm->avic_logical_id_table_page = l_page;
-       clear_page(page_address(l_page));
 
        spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
  again:
index fb68467..ba50ff6 100644 (file)
@@ -98,6 +98,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
 void recalc_intercepts(struct vcpu_svm *svm)
 {
        struct vmcb_control_area *c, *h, *g;
+       unsigned int i;
 
        vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 
@@ -108,42 +109,37 @@ void recalc_intercepts(struct vcpu_svm *svm)
        h = &svm->nested.hsave->control;
        g = &svm->nested.ctl;
 
-       svm->nested.host_intercept_exceptions = h->intercept_exceptions;
-
-       c->intercept_cr = h->intercept_cr;
-       c->intercept_dr = h->intercept_dr;
-       c->intercept_exceptions = h->intercept_exceptions;
-       c->intercept = h->intercept;
+       for (i = 0; i < MAX_INTERCEPT; i++)
+               c->intercepts[i] = h->intercepts[i];
 
        if (g->int_ctl & V_INTR_MASKING_MASK) {
                /* We only want the cr8 intercept bits of L1 */
-               c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
-               c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
+               vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
+               vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
 
                /*
                 * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
                 * affect any interrupt we may want to inject; therefore,
                 * interrupt window vmexits are irrelevant to L0.
                 */
-               c->intercept &= ~(1ULL << INTERCEPT_VINTR);
+               vmcb_clr_intercept(c, INTERCEPT_VINTR);
        }
 
        /* We don't want to see VMMCALLs from a nested guest */
-       c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);
+       vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
 
-       c->intercept_cr |= g->intercept_cr;
-       c->intercept_dr |= g->intercept_dr;
-       c->intercept_exceptions |= g->intercept_exceptions;
-       c->intercept |= g->intercept;
+       for (i = 0; i < MAX_INTERCEPT; i++)
+               c->intercepts[i] |= g->intercepts[i];
 }
 
 static void copy_vmcb_control_area(struct vmcb_control_area *dst,
                                   struct vmcb_control_area *from)
 {
-       dst->intercept_cr         = from->intercept_cr;
-       dst->intercept_dr         = from->intercept_dr;
-       dst->intercept_exceptions = from->intercept_exceptions;
-       dst->intercept            = from->intercept;
+       unsigned int i;
+
+       for (i = 0; i < MAX_INTERCEPT; i++)
+               dst->intercepts[i] = from->intercepts[i];
+
        dst->iopm_base_pa         = from->iopm_base_pa;
        dst->msrpm_base_pa        = from->msrpm_base_pa;
        dst->tsc_offset           = from->tsc_offset;
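The rework above folds the separate intercept_cr/intercept_dr/intercept_exceptions/intercept fields into a single intercepts[] array of 32-bit words, so merging L1 and L2 intercepts becomes a plain OR loop and individual bits are toggled through small helpers. A compact user-space model of that bit-vector API; the word count and helper names are illustrative, not the actual VMCB layout:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define MAX_INTERCEPT	5	/* illustrative number of 32-bit intercept words */

struct ctl_area {
	uint32_t intercepts[MAX_INTERCEPT];
};

/* A bit index encodes both the word and the position inside it. */
static void ctl_set_intercept(struct ctl_area *c, unsigned int bit)
{
	c->intercepts[bit / 32] |= 1u << (bit % 32);
}

static void ctl_clr_intercept(struct ctl_area *c, unsigned int bit)
{
	c->intercepts[bit / 32] &= ~(1u << (bit % 32));
}

static bool ctl_is_intercept(const struct ctl_area *c, unsigned int bit)
{
	return c->intercepts[bit / 32] & (1u << (bit % 32));
}

/* Merging host (L1) and guest (L2) controls is now a single loop. */
static void merge_intercepts(struct ctl_area *c, const struct ctl_area *h,
			     const struct ctl_area *g)
{
	unsigned int i;

	for (i = 0; i < MAX_INTERCEPT; i++)
		c->intercepts[i] = h->intercepts[i] | g->intercepts[i];
}

int main(void)
{
	struct ctl_area h = { { 0 } }, g = { { 0 } }, c;

	ctl_set_intercept(&h, 3);	/* e.g. a CR-read intercept */
	ctl_set_intercept(&g, 40);	/* e.g. an instruction intercept */
	merge_intercepts(&c, &h, &g);

	printf("bit 3: %d, bit 40: %d, bit 41: %d\n",
	       ctl_is_intercept(&c, 3), ctl_is_intercept(&c, 40),
	       ctl_is_intercept(&c, 41));
	return 0;
}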
@@ -176,7 +172,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
         */
        int i;
 
-       if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+       if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
                return true;
 
        for (i = 0; i < MSRPM_OFFSETS; i++) {
@@ -200,9 +196,23 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
        return true;
 }
 
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       if (!nested_svm_vmrun_msrpm(svm)) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror =
+                       KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
+               return false;
+       }
+
+       return true;
+}
+
 static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
 {
-       if ((control->intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
+       if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
                return false;
 
        if (control->asid == 0)
@@ -215,41 +225,39 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
        return true;
 }
 
-static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb)
+static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
 {
-       bool nested_vmcb_lma;
-       if ((vmcb->save.efer & EFER_SVME) == 0)
+       bool vmcb12_lma;
+
+       if ((vmcb12->save.efer & EFER_SVME) == 0)
                return false;
 
-       if (((vmcb->save.cr0 & X86_CR0_CD) == 0) &&
-           (vmcb->save.cr0 & X86_CR0_NW))
+       if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
                return false;
 
-       if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7))
+       if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
                return false;
 
-       nested_vmcb_lma =
-               (vmcb->save.efer & EFER_LME) &&
-               (vmcb->save.cr0 & X86_CR0_PG);
+       vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
 
-       if (!nested_vmcb_lma) {
-               if (vmcb->save.cr4 & X86_CR4_PAE) {
-                       if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
+       if (!vmcb12_lma) {
+               if (vmcb12->save.cr4 & X86_CR4_PAE) {
+                       if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
                                return false;
                } else {
-                       if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
+                       if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
                                return false;
                }
        } else {
-               if (!(vmcb->save.cr4 & X86_CR4_PAE) ||
-                   !(vmcb->save.cr0 & X86_CR0_PE) ||
-                   (vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK))
+               if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
+                   !(vmcb12->save.cr0 & X86_CR0_PE) ||
+                   (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK))
                        return false;
        }
-       if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4))
+       if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
                return false;
 
-       return nested_vmcb_check_controls(&vmcb->control);
+       return nested_vmcb_check_controls(&vmcb12->control);
 }
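nested_vmcb_checks() validates the vmcb12 save area against the architectural consistency rules: CR0 may not combine CD=0 with NW=1, and if EFER.LME and CR0.PG are both set the guest must also have CR4.PAE and CR0.PE set. A minimal sketch of those two checks using the architectural bit positions; the SVM-specific reserved-bit masks on CR3 are omitted here:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Architectural control-register / EFER bit positions. */
#define X86_CR0_PE	(1ull << 0)
#define X86_CR0_NW	(1ull << 29)
#define X86_CR0_CD	(1ull << 30)
#define X86_CR0_PG	(1ull << 31)
#define X86_CR4_PAE	(1ull << 5)
#define EFER_LME	(1ull << 8)

struct save_area {
	uint64_t efer, cr0, cr4;
};

static bool save_area_is_consistent(const struct save_area *s)
{
	bool lma = (s->efer & EFER_LME) && (s->cr0 & X86_CR0_PG);

	/* CD=0 with NW=1 is an illegal CR0 combination. */
	if (!(s->cr0 & X86_CR0_CD) && (s->cr0 & X86_CR0_NW))
		return false;

	/* Long mode additionally requires PAE paging and protected mode. */
	if (lma && (!(s->cr4 & X86_CR4_PAE) || !(s->cr0 & X86_CR0_PE)))
		return false;

	return true;
}

int main(void)
{
	struct save_area ok  = { EFER_LME, X86_CR0_PE | X86_CR0_PG, X86_CR4_PAE };
	struct save_area bad = { EFER_LME, X86_CR0_PE | X86_CR0_PG, 0 };

	printf("ok: %d, bad: %d\n", save_area_is_consistent(&ok),
	       save_area_is_consistent(&bad));
	return 0;
}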
 
 static void load_nested_vmcb_control(struct vcpu_svm *svm,
@@ -296,7 +304,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
  * EXIT_INT_INFO.
  */
 static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
-                                          struct vmcb *nested_vmcb)
+                                          struct vmcb *vmcb12)
 {
        struct kvm_vcpu *vcpu = &svm->vcpu;
        u32 exit_int_info = 0;
@@ -308,7 +316,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
 
                if (vcpu->arch.exception.has_error_code) {
                        exit_int_info |= SVM_EVTINJ_VALID_ERR;
-                       nested_vmcb->control.exit_int_info_err =
+                       vmcb12->control.exit_int_info_err =
                                vcpu->arch.exception.error_code;
                }
 
@@ -325,7 +333,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
                        exit_int_info |= SVM_EVTINJ_TYPE_INTR;
        }
 
-       nested_vmcb->control.exit_int_info = exit_int_info;
+       vmcb12->control.exit_int_info = exit_int_info;
 }
 
 static inline bool nested_npt_enabled(struct vcpu_svm *svm)
@@ -364,31 +372,31 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
        return 0;
 }
 
-static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
+static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
 {
        /* Load the nested guest state */
-       svm->vmcb->save.es = nested_vmcb->save.es;
-       svm->vmcb->save.cs = nested_vmcb->save.cs;
-       svm->vmcb->save.ss = nested_vmcb->save.ss;
-       svm->vmcb->save.ds = nested_vmcb->save.ds;
-       svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
-       svm->vmcb->save.idtr = nested_vmcb->save.idtr;
-       kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
-       svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
-       svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
-       svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
-       svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
-       kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
-       kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
-       kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip);
+       svm->vmcb->save.es = vmcb12->save.es;
+       svm->vmcb->save.cs = vmcb12->save.cs;
+       svm->vmcb->save.ss = vmcb12->save.ss;
+       svm->vmcb->save.ds = vmcb12->save.ds;
+       svm->vmcb->save.gdtr = vmcb12->save.gdtr;
+       svm->vmcb->save.idtr = vmcb12->save.idtr;
+       kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags);
+       svm_set_efer(&svm->vcpu, vmcb12->save.efer);
+       svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
+       svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
+       svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+       kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
+       kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
+       kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
 
        /* In case we don't even reach vcpu_run, the fields are not updated */
-       svm->vmcb->save.rax = nested_vmcb->save.rax;
-       svm->vmcb->save.rsp = nested_vmcb->save.rsp;
-       svm->vmcb->save.rip = nested_vmcb->save.rip;
-       svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
-       svm->vcpu.arch.dr6  = nested_vmcb->save.dr6;
-       svm->vmcb->save.cpl = nested_vmcb->save.cpl;
+       svm->vmcb->save.rax = vmcb12->save.rax;
+       svm->vmcb->save.rsp = vmcb12->save.rsp;
+       svm->vmcb->save.rip = vmcb12->save.rip;
+       svm->vmcb->save.dr7 = vmcb12->save.dr7;
+       svm->vcpu.arch.dr6  = vmcb12->save.dr6;
+       svm->vmcb->save.cpl = vmcb12->save.cpl;
 }
 
 static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
@@ -426,17 +434,17 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
        vmcb_mark_all_dirty(svm->vmcb);
 }
 
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-                         struct vmcb *nested_vmcb)
+int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
+                        struct vmcb *vmcb12)
 {
        int ret;
 
-       svm->nested.vmcb = vmcb_gpa;
-       load_nested_vmcb_control(svm, &nested_vmcb->control);
-       nested_prepare_vmcb_save(svm, nested_vmcb);
+       svm->nested.vmcb12_gpa = vmcb12_gpa;
+       load_nested_vmcb_control(svm, &vmcb12->control);
+       nested_prepare_vmcb_save(svm, vmcb12);
        nested_prepare_vmcb_control(svm);
 
-       ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3,
+       ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
                                  nested_npt_enabled(svm));
        if (ret)
                return ret;
@@ -449,19 +457,19 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 int nested_svm_vmrun(struct vcpu_svm *svm)
 {
        int ret;
-       struct vmcb *nested_vmcb;
+       struct vmcb *vmcb12;
        struct vmcb *hsave = svm->nested.hsave;
        struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
-       u64 vmcb_gpa;
+       u64 vmcb12_gpa;
 
        if (is_smm(&svm->vcpu)) {
                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
                return 1;
        }
 
-       vmcb_gpa = svm->vmcb->save.rax;
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
+       vmcb12_gpa = svm->vmcb->save.rax;
+       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
        if (ret == -EINVAL) {
                kvm_inject_gp(&svm->vcpu, 0);
                return 1;
@@ -471,26 +479,28 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 
        ret = kvm_skip_emulated_instruction(&svm->vcpu);
 
-       nested_vmcb = map.hva;
+       vmcb12 = map.hva;
 
-       if (!nested_vmcb_checks(svm, nested_vmcb)) {
-               nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
-               nested_vmcb->control.exit_code_hi = 0;
-               nested_vmcb->control.exit_info_1  = 0;
-               nested_vmcb->control.exit_info_2  = 0;
+       if (!nested_vmcb_checks(svm, vmcb12)) {
+               vmcb12->control.exit_code    = SVM_EXIT_ERR;
+               vmcb12->control.exit_code_hi = 0;
+               vmcb12->control.exit_info_1  = 0;
+               vmcb12->control.exit_info_2  = 0;
                goto out;
        }
 
-       trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
-                              nested_vmcb->save.rip,
-                              nested_vmcb->control.int_ctl,
-                              nested_vmcb->control.event_inj,
-                              nested_vmcb->control.nested_ctl);
+       trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
+                              vmcb12->save.rip,
+                              vmcb12->control.int_ctl,
+                              vmcb12->control.event_inj,
+                              vmcb12->control.nested_ctl);
 
-       trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
-                                   nested_vmcb->control.intercept_cr >> 16,
-                                   nested_vmcb->control.intercept_exceptions,
-                                   nested_vmcb->control.intercept);
+       trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
+                                   vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
+                                   vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
+                                   vmcb12->control.intercepts[INTERCEPT_WORD3],
+                                   vmcb12->control.intercepts[INTERCEPT_WORD4],
+                                   vmcb12->control.intercepts[INTERCEPT_WORD5]);
 
        /* Clear internal status */
        kvm_clear_exception_queue(&svm->vcpu);
@@ -522,7 +532,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 
        svm->nested.nested_run_pending = 1;
 
-       if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb))
+       if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
                goto out_exit_err;
 
        if (nested_svm_vmrun_msrpm(svm))
@@ -563,75 +573,77 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
 int nested_svm_vmexit(struct vcpu_svm *svm)
 {
        int rc;
-       struct vmcb *nested_vmcb;
+       struct vmcb *vmcb12;
        struct vmcb *hsave = svm->nested.hsave;
        struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
 
-       rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
+       rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
        if (rc) {
                if (rc == -EINVAL)
                        kvm_inject_gp(&svm->vcpu, 0);
                return 1;
        }
 
-       nested_vmcb = map.hva;
+       vmcb12 = map.hva;
 
        /* Exit Guest-Mode */
        leave_guest_mode(&svm->vcpu);
-       svm->nested.vmcb = 0;
+       svm->nested.vmcb12_gpa = 0;
        WARN_ON_ONCE(svm->nested.nested_run_pending);
 
        /* in case we halted in L2 */
        svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
        /* Give the current vmcb to the guest */
-       svm_set_gif(svm, false);
 
-       nested_vmcb->save.es     = vmcb->save.es;
-       nested_vmcb->save.cs     = vmcb->save.cs;
-       nested_vmcb->save.ss     = vmcb->save.ss;
-       nested_vmcb->save.ds     = vmcb->save.ds;
-       nested_vmcb->save.gdtr   = vmcb->save.gdtr;
-       nested_vmcb->save.idtr   = vmcb->save.idtr;
-       nested_vmcb->save.efer   = svm->vcpu.arch.efer;
-       nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
-       nested_vmcb->save.cr2    = vmcb->save.cr2;
-       nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
-       nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
-       nested_vmcb->save.rip    = kvm_rip_read(&svm->vcpu);
-       nested_vmcb->save.rsp    = kvm_rsp_read(&svm->vcpu);
-       nested_vmcb->save.rax    = kvm_rax_read(&svm->vcpu);
-       nested_vmcb->save.dr7    = vmcb->save.dr7;
-       nested_vmcb->save.dr6    = svm->vcpu.arch.dr6;
-       nested_vmcb->save.cpl    = vmcb->save.cpl;
-
-       nested_vmcb->control.int_state         = vmcb->control.int_state;
-       nested_vmcb->control.exit_code         = vmcb->control.exit_code;
-       nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
-       nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
-       nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
-
-       if (nested_vmcb->control.exit_code != SVM_EXIT_ERR)
-               nested_vmcb_save_pending_event(svm, nested_vmcb);
+       vmcb12->save.es     = vmcb->save.es;
+       vmcb12->save.cs     = vmcb->save.cs;
+       vmcb12->save.ss     = vmcb->save.ss;
+       vmcb12->save.ds     = vmcb->save.ds;
+       vmcb12->save.gdtr   = vmcb->save.gdtr;
+       vmcb12->save.idtr   = vmcb->save.idtr;
+       vmcb12->save.efer   = svm->vcpu.arch.efer;
+       vmcb12->save.cr0    = kvm_read_cr0(&svm->vcpu);
+       vmcb12->save.cr3    = kvm_read_cr3(&svm->vcpu);
+       vmcb12->save.cr2    = vmcb->save.cr2;
+       vmcb12->save.cr4    = svm->vcpu.arch.cr4;
+       vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
+       vmcb12->save.rip    = kvm_rip_read(&svm->vcpu);
+       vmcb12->save.rsp    = kvm_rsp_read(&svm->vcpu);
+       vmcb12->save.rax    = kvm_rax_read(&svm->vcpu);
+       vmcb12->save.dr7    = vmcb->save.dr7;
+       vmcb12->save.dr6    = svm->vcpu.arch.dr6;
+       vmcb12->save.cpl    = vmcb->save.cpl;
+
+       vmcb12->control.int_state         = vmcb->control.int_state;
+       vmcb12->control.exit_code         = vmcb->control.exit_code;
+       vmcb12->control.exit_code_hi      = vmcb->control.exit_code_hi;
+       vmcb12->control.exit_info_1       = vmcb->control.exit_info_1;
+       vmcb12->control.exit_info_2       = vmcb->control.exit_info_2;
+
+       if (vmcb12->control.exit_code != SVM_EXIT_ERR)
+               nested_vmcb_save_pending_event(svm, vmcb12);
 
        if (svm->nrips_enabled)
-               nested_vmcb->control.next_rip  = vmcb->control.next_rip;
+               vmcb12->control.next_rip  = vmcb->control.next_rip;
 
-       nested_vmcb->control.int_ctl           = svm->nested.ctl.int_ctl;
-       nested_vmcb->control.tlb_ctl           = svm->nested.ctl.tlb_ctl;
-       nested_vmcb->control.event_inj         = svm->nested.ctl.event_inj;
-       nested_vmcb->control.event_inj_err     = svm->nested.ctl.event_inj_err;
+       vmcb12->control.int_ctl           = svm->nested.ctl.int_ctl;
+       vmcb12->control.tlb_ctl           = svm->nested.ctl.tlb_ctl;
+       vmcb12->control.event_inj         = svm->nested.ctl.event_inj;
+       vmcb12->control.event_inj_err     = svm->nested.ctl.event_inj_err;
 
-       nested_vmcb->control.pause_filter_count =
+       vmcb12->control.pause_filter_count =
                svm->vmcb->control.pause_filter_count;
-       nested_vmcb->control.pause_filter_thresh =
+       vmcb12->control.pause_filter_thresh =
                svm->vmcb->control.pause_filter_thresh;
 
        /* Restore the original control entries */
        copy_vmcb_control_area(&vmcb->control, &hsave->control);
 
+       /* On vmexit the GIF is set to false */

+       svm_set_gif(svm, false);
+
        svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
                svm->vcpu.arch.l1_tsc_offset;
 
@@ -657,11 +669,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 
        vmcb_mark_all_dirty(svm->vmcb);
 
-       trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code,
-                                      nested_vmcb->control.exit_info_1,
-                                      nested_vmcb->control.exit_info_2,
-                                      nested_vmcb->control.exit_int_info,
-                                      nested_vmcb->control.exit_int_info_err,
+       trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
+                                      vmcb12->control.exit_info_1,
+                                      vmcb12->control.exit_info_2,
+                                      vmcb12->control.exit_int_info,
+                                      vmcb12->control.exit_int_info_err,
                                       KVM_ISA_SVM);
 
        kvm_vcpu_unmap(&svm->vcpu, &map, true);
@@ -700,6 +712,8 @@ void svm_leave_nested(struct vcpu_svm *svm)
                copy_vmcb_control_area(&vmcb->control, &hsave->control);
                nested_svm_uninit_mmu_context(&svm->vcpu);
        }
+
+       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
 }
 
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -707,7 +721,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
        u32 offset, msr, value;
        int write, mask;
 
-       if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+       if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
                return NESTED_EXIT_HOST;
 
        msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
@@ -734,7 +748,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
        u8 start_bit;
        u64 gpa;
 
-       if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+       if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
                return NESTED_EXIT_HOST;
 
        port = svm->vmcb->control.exit_info_1 >> 16;
@@ -765,14 +779,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
                vmexit = nested_svm_intercept_ioio(svm);
                break;
        case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
-               u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
-               if (svm->nested.ctl.intercept_cr & bit)
+               if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
                        vmexit = NESTED_EXIT_DONE;
                break;
        }
        case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
-               u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
-               if (svm->nested.ctl.intercept_dr & bit)
+               if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
                        vmexit = NESTED_EXIT_DONE;
                break;
        }
@@ -790,8 +802,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
                break;
        }
        default: {
-               u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
-               if (svm->nested.ctl.intercept & exit_bits)
+               if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
                        vmexit = NESTED_EXIT_DONE;
        }
        }
@@ -831,7 +842,7 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm)
 {
        unsigned int nr = svm->vcpu.arch.exception.nr;
 
-       return (svm->nested.ctl.intercept_exceptions & (1 << nr));
+       return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr));
 }
 
 static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
@@ -899,7 +910,7 @@ static void nested_svm_intr(struct vcpu_svm *svm)
 
 static inline bool nested_exit_on_init(struct vcpu_svm *svm)
 {
-       return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INIT));
+       return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
 }
 
 static void nested_svm_init(struct vcpu_svm *svm)
@@ -980,7 +991,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
        case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
 
-               if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits)
+               if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
+                               excp_bits)
                        return NESTED_EXIT_HOST;
                else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
                         svm->vcpu.arch.apf.host_apf_flags)
@@ -1018,7 +1030,7 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
 
        /* First fill in the header and copy it out.  */
        if (is_guest_mode(vcpu)) {
-               kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb;
+               kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
                kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
                kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
 
@@ -1060,10 +1072,14 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
        struct vmcb *hsave = svm->nested.hsave;
        struct vmcb __user *user_vmcb = (struct vmcb __user *)
                &user_kvm_nested_state->data.svm[0];
-       struct vmcb_control_area ctl;
-       struct vmcb_save_area save;
+       struct vmcb_control_area *ctl;
+       struct vmcb_save_area *save;
+       int ret;
        u32 cr0;
 
+       BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
+                    KVM_STATE_NESTED_SVM_VMCB_SIZE);
+
        if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
                return -EINVAL;
 
@@ -1088,20 +1104,30 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 
        if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
                svm_leave_nested(svm);
-               goto out_set_gif;
+               svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+               return 0;
        }
 
        if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
                return -EINVAL;
        if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
                return -EINVAL;
-       if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl)))
-               return -EFAULT;
-       if (copy_from_user(&save, &user_vmcb->save, sizeof(save)))
-               return -EFAULT;
 
-       if (!nested_vmcb_check_controls(&ctl))
-               return -EINVAL;
+       ret  = -ENOMEM;
+       ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL);
+       save = kzalloc(sizeof(*save), GFP_KERNEL);
+       if (!ctl || !save)
+               goto out_free;
+
+       ret = -EFAULT;
+       if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl)))
+               goto out_free;
+       if (copy_from_user(save, &user_vmcb->save, sizeof(*save)))
+               goto out_free;
+
+       ret = -EINVAL;
+       if (!nested_vmcb_check_controls(ctl))
+               goto out_free;
 
        /*
         * Processor state contains L2 state.  Check that it is
@@ -1109,15 +1135,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
         */
        cr0 = kvm_read_cr0(vcpu);
         if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
-                return -EINVAL;
+               goto out_free;
 
        /*
         * Validate host state saved from before VMRUN (see
         * nested_svm_check_permissions).
         * TODO: validate reserved bits for all saved state.
         */
-       if (!(save.cr0 & X86_CR0_PG))
-               return -EINVAL;
+       if (!(save->cr0 & X86_CR0_PG))
+               goto out_free;
 
        /*
         * All checks done, we can enter guest mode.  L1 control fields
@@ -1126,19 +1152,24 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
         * contains saved L1 state.
         */
        copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
-       hsave->save = save;
+       hsave->save = *save;
 
-       svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
-       load_nested_vmcb_control(svm, &ctl);
+       svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
+       load_nested_vmcb_control(svm, ctl);
        nested_prepare_vmcb_control(svm);
 
-out_set_gif:
-       svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
-       return 0;
+       kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+       ret = 0;
+out_free:
+       kfree(save);
+       kfree(ctl);
+
+       return ret;
 }
 
 struct kvm_x86_nested_ops svm_nested_ops = {
        .check_events = svm_check_nested_events,
+       .get_nested_state_pages = svm_get_nested_state_pages,
        .get_state = svm_get_nested_state,
        .set_state = svm_set_nested_state,
 };
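svm_set_nested_state() no longer touches guest memory to merge the MSR permission map; it raises KVM_REQ_GET_NESTED_STATE_PAGES and the new svm_get_nested_state_pages() callback does the work on the next vcpu entry, failing the run with an internal error if the merge cannot be done. A small sketch of that "set a request bit now, service it at entry" pattern; the request flag, struct and function names here are invented for illustration:

#include <stdio.h>
#include <stdbool.h>

#define REQ_GET_NESTED_STATE_PAGES	(1u << 0)	/* illustrative request bit */

struct vcpu {
	unsigned int requests;
	bool run_failed;
};

static bool merge_msr_permission_map(struct vcpu *v)
{
	/* Placeholder for nested_svm_vmrun_msrpm(); pretend it succeeds. */
	(void)v;
	return true;
}

/* Called from an ioctl context: only record that work is needed. */
static void set_nested_state(struct vcpu *v)
{
	v->requests |= REQ_GET_NESTED_STATE_PAGES;
}

/* Called on the next vcpu entry, when guest memory may be accessed. */
static void service_requests(struct vcpu *v)
{
	if (v->requests & REQ_GET_NESTED_STATE_PAGES) {
		v->requests &= ~REQ_GET_NESTED_STATE_PAGES;
		if (!merge_msr_permission_map(v))
			v->run_failed = true;	/* KVM_EXIT_INTERNAL_ERROR */
	}
}

int main(void)
{
	struct vcpu v = { 0 };

	set_nested_state(&v);
	service_requests(&v);
	printf("run_failed=%d pending=%#x\n", v.run_failed, v.requests);
	return 0;
}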
index 402dc42..65e15c2 100644 (file)
@@ -446,10 +446,8 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
        }
 
        /*
-        * The LAUNCH_UPDATE command will perform in-place encryption of the
-        * memory content (i.e it will write the same memory region with C=1).
-        * It's possible that the cache may contain the data with C=0, i.e.,
-        * unencrypted so invalidate it first.
+        * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
+        * place; the cache may contain the data that was written unencrypted.
         */
        sev_clflush_pages(inpages, npages);
 
@@ -805,10 +803,9 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
                }
 
                /*
-                * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
-                * memory content (i.e it will write the same memory region with C=1).
-                * It's possible that the cache may contain the data with C=0, i.e.,
-                * unencrypted so invalidate it first.
+                * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
+                * the pages; flush the destination too so that future accesses do not
+                * see stale data.
                 */
                sev_clflush_pages(src_p, 1);
                sev_clflush_pages(dst_p, 1);
@@ -856,7 +853,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
        struct kvm_sev_launch_secret params;
        struct page **pages;
        void *blob, *hdr;
-       unsigned long n;
+       unsigned long n, i;
        int ret, offset;
 
        if (!sev_guest(kvm))
@@ -869,6 +866,12 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (IS_ERR(pages))
                return PTR_ERR(pages);
 
+       /*
+        * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
+        * place; the cache may contain the data that was written unencrypted.
+        */
+       sev_clflush_pages(pages, n);
+
        /*
         * The secret must be copied into a contiguous memory region; let's
         * verify that the userspace memory pages are contiguous before we
         * issue the command.
@@ -914,6 +917,11 @@ e_free_blob:
 e_free:
        kfree(data);
 e_unpin_memory:
+       /* The memory content was updated: mark the pages dirty and accessed */
+       for (i = 0; i < n; i++) {
+               set_page_dirty_lock(pages[i]);
+               mark_page_accessed(pages[i]);
+       }
        sev_unpin_memory(kvm, pages, n);
        return ret;
 }
@@ -1106,6 +1114,7 @@ void sev_vm_destroy(struct kvm *kvm)
                list_for_each_safe(pos, q, head) {
                        __unregister_enc_region_locked(kvm,
                                list_entry(pos, struct enc_region, list));
+                       cond_resched();
                }
        }
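The sev.c changes tighten two orderings: the cache is flushed before LAUNCH_SECRET rewrites the pinned pages in place, and the pages are marked dirty and accessed before they are unpinned. A schematic user-space sketch of that pin, flush, command, mark-dirty, unpin sequence; every helper here (pin_pages, flush_pages, run_sev_command, mark_dirty, unpin_pages) is a hypothetical stub standing in for the kernel primitives:

#include <stdio.h>

#define NPAGES 4

static void pin_pages(int n)        { printf("pinned %d pages\n", n); }
static void flush_pages(int n)      { printf("flushed %d pages (clflush)\n", n); }
static int  run_sev_command(void)   { printf("LAUNCH_SECRET issued\n"); return 0; }
static void mark_dirty(int n)       { printf("marked %d pages dirty+accessed\n", n); }
static void unpin_pages(int n)      { printf("unpinned %d pages\n", n); }

int main(void)
{
	int ret;

	pin_pages(NPAGES);
	/*
	 * Flush before the firmware encrypts the pages in place, so no stale
	 * unencrypted lines remain in the cache.
	 */
	flush_pages(NPAGES);

	ret = run_sev_command();

	/* The content changed under the pin: mark dirty before unpinning. */
	mark_dirty(NPAGES);
	unpin_pages(NPAGES);

	return ret;
}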
 
index 0194336..4f401fc 100644 (file)
@@ -91,7 +91,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio);
 static const struct svm_direct_access_msrs {
        u32 index;   /* Index of the MSR */
        bool always; /* True if intercept is always on */
-} direct_access_msrs[] = {
+} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
        { .index = MSR_STAR,                            .always = true  },
        { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
 #ifdef CONFIG_X86_64
@@ -553,18 +553,44 @@ free_cpu_data:
 
 }
 
-static bool valid_msr_intercept(u32 index)
+static int direct_access_msr_slot(u32 msr)
 {
-       int i;
+       u32 i;
 
        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
-               if (direct_access_msrs[i].index == index)
-                       return true;
+               if (direct_access_msrs[i].index == msr)
+                       return i;
 
-       return false;
+       return -ENOENT;
 }
 
-static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
+static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
+                                    int write)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       int slot = direct_access_msr_slot(msr);
+
+       if (slot == -ENOENT)
+               return;
+
+       /* Set the shadow bitmaps to the desired intercept states */
+       if (read)
+               set_bit(slot, svm->shadow_msr_intercept.read);
+       else
+               clear_bit(slot, svm->shadow_msr_intercept.read);
+
+       if (write)
+               set_bit(slot, svm->shadow_msr_intercept.write);
+       else
+               clear_bit(slot, svm->shadow_msr_intercept.write);
+}
+
+static bool valid_msr_intercept(u32 index)
+{
+       return direct_access_msr_slot(index) != -ENOENT;
+}
+
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 {
        u8 bit_write;
        unsigned long tmp;
@@ -583,8 +609,8 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
        return !!test_bit(bit_write,  &tmp);
 }
 
-static void set_msr_interception(u32 *msrpm, unsigned msr,
-                                int read, int write)
+static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
+                                       u32 msr, int read, int write)
 {
        u8 bit_read, bit_write;
        unsigned long tmp;
@@ -596,6 +622,13 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
         */
        WARN_ON(!valid_msr_intercept(msr));
 
+       /* Enforce interception of MSRs that the filter does not allow */
+       if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+               read = 0;
+
+       if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+               write = 0;
+
        offset    = svm_msrpm_offset(msr);
        bit_read  = 2 * (msr & 0x0f);
        bit_write = 2 * (msr & 0x0f) + 1;
@@ -609,17 +642,59 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
        msrpm[offset] = tmp;
 }
 
-static void svm_vcpu_init_msrpm(u32 *msrpm)
+static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
+                                int read, int write)
+{
+       set_shadow_msr_intercept(vcpu, msr, read, write);
+       set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
+}
+
+static u32 *svm_vcpu_alloc_msrpm(void)
 {
-       int i;
+       struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
+       u32 *msrpm;
+
+       if (!pages)
+               return NULL;
 
+       msrpm = page_address(pages);
        memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
 
+       return msrpm;
+}
+
+static void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
+{
+       int i;
+
        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
                if (!direct_access_msrs[i].always)
                        continue;
+               set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
+       }
+}
+
+static void svm_vcpu_free_msrpm(u32 *msrpm)
+{
+       __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
+}
+
+static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u32 i;
+
+       /*
+        * Set intercept permissions for all direct access MSRs again. They
+        * will automatically get filtered through the MSR filter, so we are
+        * back in sync after this.
+        */
+       for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+               u32 msr = direct_access_msrs[i].index;
+               u32 read = test_bit(i, svm->shadow_msr_intercept.read);
+               u32 write = test_bit(i, svm->shadow_msr_intercept.write);
 
-               set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
+               set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
        }
 }
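svm_msr_filter_changed() relies on the new shadow read/write bitmaps: every set_msr_interception() call records the intercept state KVM itself wants, and when userspace installs a new MSR filter the shadow is replayed through set_msr_interception_bitmap(), which clamps the result against the filter. A compact model of that "remember intent, re-apply through the filter" idea; the names (want_read/want_write, filter_allows, pass_read/pass_write) are invented for illustration:

#include <stdio.h>
#include <stdbool.h>

#define NR_MSRS 4

/* KVM's own pass-through intent, independent of the userspace filter. */
static bool want_read[NR_MSRS]  = { true,  true, false, true };
static bool want_write[NR_MSRS] = { true, false, false, true };

/* Stand-in for kvm_msr_allowed(): the userspace filter verdict per MSR. */
static bool filter_allows(int msr)
{
	return msr != 2 && msr != 3;	/* pretend MSRs 2 and 3 are filtered */
}

/* Effective pass-through state actually programmed into the bitmap. */
static bool pass_read[NR_MSRS], pass_write[NR_MSRS];

static void apply_one(int msr)
{
	/* Clamp the recorded intent against the current filter. */
	pass_read[msr]  = want_read[msr]  && filter_allows(msr);
	pass_write[msr] = want_write[msr] && filter_allows(msr);
}

/* Called when userspace changes the filter: replay the whole shadow. */
static void msr_filter_changed(void)
{
	for (int msr = 0; msr < NR_MSRS; msr++)
		apply_one(msr);
}

int main(void)
{
	msr_filter_changed();
	for (int msr = 0; msr < NR_MSRS; msr++)
		printf("msr %d: read=%d write=%d\n",
		       msr, pass_read[msr], pass_write[msr]);
	return 0;
}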
 
@@ -666,26 +741,26 @@ static void init_msrpm_offsets(void)
        }
 }
 
-static void svm_enable_lbrv(struct vcpu_svm *svm)
+static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
 {
-       u32 *msrpm = svm->msrpm;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
        svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
-       set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
-       set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
-       set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
-       set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 }
 
-static void svm_disable_lbrv(struct vcpu_svm *svm)
+static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
 {
-       u32 *msrpm = svm->msrpm;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
        svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
-       set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
-       set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
-       set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
-       set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+       set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 }
 
 void disable_nmi_singlestep(struct vcpu_svm *svm)
@@ -813,6 +888,9 @@ static __init void svm_set_cpu_caps(void)
        if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
            boot_cpu_has(X86_FEATURE_AMD_SSBD))
                kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+       /* Enable INVPCID feature */
+       kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
 }
 
 static __init int svm_hardware_setup(void)
@@ -985,6 +1063,21 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        return svm->vmcb->control.tsc_offset;
 }
 
+static void svm_check_invpcid(struct vcpu_svm *svm)
+{
+       /*
+        * Intercept INVPCID instruction only if shadow page table is
+        * enabled. Interception is not required with nested page table
+        * enabled.
+        */
+       if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
+               if (!npt_enabled)
+                       svm_set_intercept(svm, INTERCEPT_INVPCID);
+               else
+                       svm_clr_intercept(svm, INTERCEPT_INVPCID);
+       }
+}
+
 static void init_vmcb(struct vcpu_svm *svm)
 {
        struct vmcb_control_area *control = &svm->vmcb->control;
@@ -992,14 +1085,14 @@ static void init_vmcb(struct vcpu_svm *svm)
 
        svm->vcpu.arch.hflags = 0;
 
-       set_cr_intercept(svm, INTERCEPT_CR0_READ);
-       set_cr_intercept(svm, INTERCEPT_CR3_READ);
-       set_cr_intercept(svm, INTERCEPT_CR4_READ);
-       set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
-       set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
-       set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
+       svm_set_intercept(svm, INTERCEPT_CR0_READ);
+       svm_set_intercept(svm, INTERCEPT_CR3_READ);
+       svm_set_intercept(svm, INTERCEPT_CR4_READ);
+       svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+       svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
+       svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
        if (!kvm_vcpu_apicv_active(&svm->vcpu))
-               set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+               svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
 
        set_dr_intercepts(svm);
 
@@ -1094,15 +1187,15 @@ static void init_vmcb(struct vcpu_svm *svm)
                control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
                svm_clr_intercept(svm, INTERCEPT_INVLPG);
                clr_exception_intercept(svm, PF_VECTOR);
-               clr_cr_intercept(svm, INTERCEPT_CR3_READ);
-               clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+               svm_clr_intercept(svm, INTERCEPT_CR3_READ);
+               svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
                save->g_pat = svm->vcpu.arch.pat;
                save->cr3 = 0;
                save->cr4 = 0;
        }
        svm->asid_generation = 0;
 
-       svm->nested.vmcb = 0;
+       svm->nested.vmcb12_gpa = 0;
        svm->vcpu.arch.hflags = 0;
 
        if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
@@ -1114,6 +1207,8 @@ static void init_vmcb(struct vcpu_svm *svm)
                svm_clr_intercept(svm, INTERCEPT_PAUSE);
        }
 
+       svm_check_invpcid(svm);
+
        if (kvm_vcpu_apicv_active(&svm->vcpu))
                avic_init_vmcb(svm);
 
@@ -1171,35 +1266,25 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm;
-       struct page *page;
-       struct page *msrpm_pages;
+       struct page *vmcb_page;
        struct page *hsave_page;
-       struct page *nested_msrpm_pages;
        int err;
 
        BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
        svm = to_svm(vcpu);
 
        err = -ENOMEM;
-       page = alloc_page(GFP_KERNEL_ACCOUNT);
-       if (!page)
+       vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+       if (!vmcb_page)
                goto out;
 
-       msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
-       if (!msrpm_pages)
-               goto free_page1;
-
-       nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
-       if (!nested_msrpm_pages)
-               goto free_page2;
-
-       hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
+       hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!hsave_page)
-               goto free_page3;
+               goto error_free_vmcb_page;
 
        err = avic_init_vcpu(svm);
        if (err)
-               goto free_page4;
+               goto error_free_hsave_page;
 
        /* We initialize this flag to true to make sure that the is_running
         * bit would be set the first time the vcpu is loaded.
@@ -1208,17 +1293,22 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
                svm->avic_is_running = true;
 
        svm->nested.hsave = page_address(hsave_page);
-       clear_page(svm->nested.hsave);
 
-       svm->msrpm = page_address(msrpm_pages);
-       svm_vcpu_init_msrpm(svm->msrpm);
+       svm->msrpm = svm_vcpu_alloc_msrpm();
+       if (!svm->msrpm)
+               goto error_free_hsave_page;
 
-       svm->nested.msrpm = page_address(nested_msrpm_pages);
-       svm_vcpu_init_msrpm(svm->nested.msrpm);
+       svm_vcpu_init_msrpm(vcpu, svm->msrpm);
 
-       svm->vmcb = page_address(page);
-       clear_page(svm->vmcb);
-       svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
+       svm->nested.msrpm = svm_vcpu_alloc_msrpm();
+       if (!svm->nested.msrpm)
+               goto error_free_msrpm;
+
+       /* The nested msrpm only needs the L1 pass-through MSR state */
+       svm_vcpu_init_msrpm(vcpu, svm->nested.msrpm);
+
+       svm->vmcb = page_address(vmcb_page);
+       svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
        svm->asid_generation = 0;
        init_vmcb(svm);
 
@@ -1227,14 +1317,12 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 
        return 0;
 
-free_page4:
+error_free_msrpm:
+       svm_vcpu_free_msrpm(svm->msrpm);
+error_free_hsave_page:
        __free_page(hsave_page);
-free_page3:
-       __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
-free_page2:
-       __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
-free_page1:
-       __free_page(page);
+error_free_vmcb_page:
+       __free_page(vmcb_page);
 out:
        return err;
 }
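svm_create_vcpu() now allocates the vmcb page, the hsave page and the two MSR permission maps with descriptively named error labels, each label freeing exactly what was allocated before it. A generic sketch of that goto-cleanup ladder with placeholder allocations, simplified to three resources rather than the function's exact allocation order:

#include <stdio.h>
#include <stdlib.h>

static int create_vcpu_resources(void **vmcb, void **hsave, void **msrpm)
{
	int err = -1;

	*vmcb = calloc(1, 4096);
	if (!*vmcb)
		goto out;

	*hsave = calloc(1, 4096);
	if (!*hsave)
		goto error_free_vmcb_page;

	*msrpm = calloc(1, 8192);
	if (!*msrpm)
		goto error_free_hsave_page;

	return 0;			/* success: the caller owns everything */

error_free_hsave_page:
	free(*hsave);
error_free_vmcb_page:
	free(*vmcb);
out:
	return err;
}

int main(void)
{
	void *vmcb, *hsave, *msrpm;
	int err = create_vcpu_resources(&vmcb, &hsave, &msrpm);

	printf("create: %d\n", err);
	if (!err) {
		free(msrpm);
		free(hsave);
		free(vmcb);
	}
	return 0;
}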
@@ -1549,11 +1637,11 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
        vmcb_mark_dirty(svm->vmcb, VMCB_CR);
 
        if (gcr0 == *hcr0) {
-               clr_cr_intercept(svm, INTERCEPT_CR0_READ);
-               clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+               svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+               svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
        } else {
-               set_cr_intercept(svm, INTERCEPT_CR0_READ);
-               set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+               svm_set_intercept(svm, INTERCEPT_CR0_READ);
+               svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
        }
 }
 
@@ -2218,12 +2306,9 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
 {
        unsigned long cr0 = svm->vcpu.arch.cr0;
        bool ret = false;
-       u64 intercept;
-
-       intercept = svm->nested.ctl.intercept;
 
        if (!is_guest_mode(&svm->vcpu) ||
-           (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
+           (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
                return false;
 
        cr0 &= ~SVM_CR0_SELECTIVE_MASK;
@@ -2261,6 +2346,7 @@ static int cr_interception(struct vcpu_svm *svm)
        if (cr >= 16) { /* mov to cr */
                cr -= 16;
                val = kvm_register_read(&svm->vcpu, reg);
+               trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
                        if (!check_selective_cr0_intercepted(svm, val))
@@ -2306,6 +2392,7 @@ static int cr_interception(struct vcpu_svm *svm)
                        return 1;
                }
                kvm_register_write(&svm->vcpu, reg, val);
+               trace_kvm_cr_read(cr, val);
        }
        return kvm_complete_insn_gp(&svm->vcpu, err);
 }
@@ -2556,7 +2643,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                 * We update the L1 MSR bit as well since it will end up
                 * touching the MSR anyway now.
                 */
-               set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
                break;
        case MSR_IA32_PRED_CMD:
                if (!msr->host_initiated &&
@@ -2571,7 +2658,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                        break;
 
                wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
-               set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
                break;
        case MSR_AMD64_VIRT_SPEC_CTRL:
                if (!msr->host_initiated &&
@@ -2635,9 +2722,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                svm->vmcb->save.dbgctl = data;
                vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
                if (data & (1ULL<<0))
-                       svm_enable_lbrv(svm);
+                       svm_enable_lbrv(vcpu);
                else
-                       svm_disable_lbrv(svm);
+                       svm_disable_lbrv(vcpu);
                break;
        case MSR_VM_HSAVE_PA:
                svm->nested.hsave_msr = data;
@@ -2733,6 +2820,33 @@ static int mwait_interception(struct vcpu_svm *svm)
        return nop_interception(svm);
 }
 
+static int invpcid_interception(struct vcpu_svm *svm)
+{
+       struct kvm_vcpu *vcpu = &svm->vcpu;
+       unsigned long type;
+       gva_t gva;
+
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       /*
+        * For an INVPCID intercept:
+        * EXITINFO1 provides the linear address of the memory operand.
+        * EXITINFO2 provides the contents of the register operand.
+        */
+       type = svm->vmcb->control.exit_info_2;
+       gva = svm->vmcb->control.exit_info_1;
+
+       if (type > 3) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       return kvm_handle_invpcid(vcpu, type, gva);
+}
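invpcid_interception() pulls the INVPCID type from EXITINFO2 and the descriptor address from EXITINFO1, injects #UD if the guest lacks the CPUID bit, injects #GP for types above 3, and otherwise hands off to the common kvm_handle_invpcid() helper. A minimal sketch of the type check; the four architectural INVPCID types are individual-address (0), single-context (1), all-context including globals (2) and all-context (3), and handle_invpcid here is a stub for the shared helper:

#include <stdio.h>
#include <stdint.h>

enum invpcid_type {
	INVPCID_ADDR = 0,		/* individual address            */
	INVPCID_SINGLE_CTX = 1,		/* single PCID                   */
	INVPCID_ALL_INCL_GLOBAL = 2,	/* all PCIDs, including globals  */
	INVPCID_ALL = 3,		/* all PCIDs, excluding globals  */
};

/* Stand-in for the common handler shared with VMX. */
static int handle_invpcid(unsigned long type, uint64_t gva)
{
	printf("INVPCID type %lu, descriptor at %#llx\n",
	       type, (unsigned long long)gva);
	return 1;
}

static int invpcid_intercept(uint64_t exit_info_1, uint64_t exit_info_2)
{
	unsigned long type = exit_info_2;	/* register operand   */
	uint64_t gva = exit_info_1;		/* memory operand GVA */

	if (type > 3) {
		printf("inject #GP(0)\n");	/* invalid INVPCID type */
		return 1;
	}

	return handle_invpcid(type, gva);
}

int main(void)
{
	invpcid_intercept(0x1000, INVPCID_SINGLE_CTX);
	invpcid_intercept(0x1000, 7);
	return 0;
}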
+
 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_READ_CR0]                     = cr_interception,
        [SVM_EXIT_READ_CR3]                     = cr_interception,
@@ -2795,6 +2909,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_MWAIT]                        = mwait_interception,
        [SVM_EXIT_XSETBV]                       = xsetbv_interception,
        [SVM_EXIT_RDPRU]                        = rdpru_interception,
+       [SVM_EXIT_INVPCID]                      = invpcid_interception,
        [SVM_EXIT_NPF]                          = npf_interception,
        [SVM_EXIT_RSM]                          = rsm_interception,
        [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
@@ -2813,12 +2928,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
        }
 
        pr_err("VMCB Control Area:\n");
-       pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
-       pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
-       pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
-       pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
-       pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
-       pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
+       pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
+       pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
+       pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
+       pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
+       pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
+       pr_err("%-20s%08x %08x\n", "intercepts:",
+              control->intercepts[INTERCEPT_WORD3],
+              control->intercepts[INTERCEPT_WORD4]);
        pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
        pr_err("%-20s%d\n", "pause filter threshold:",
               control->pause_filter_thresh);
@@ -2917,12 +3034,19 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
               "excp_to:", save->last_excp_to);
 }
 
-static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+                             u32 *intr_info, u32 *error_code)
 {
        struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
 
        *info1 = control->exit_info_1;
        *info2 = control->exit_info_2;
+       *intr_info = control->exit_int_info;
+       if ((*intr_info & SVM_EXITINTINFO_VALID) &&
+           (*intr_info & SVM_EXITINTINFO_VALID_ERR))
+               *error_code = control->exit_int_info_err;
+       else
+               *error_code = 0;
 }
 
 static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
@@ -2933,22 +3057,15 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 
        trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
 
-       if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
+       if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
                vcpu->arch.cr0 = svm->vmcb->save.cr0;
        if (npt_enabled)
                vcpu->arch.cr3 = svm->vmcb->save.cr3;
 
-       svm_complete_interrupts(svm);
-
        if (is_guest_mode(vcpu)) {
                int vmexit;
 
-               trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
-                                       svm->vmcb->control.exit_info_1,
-                                       svm->vmcb->control.exit_info_2,
-                                       svm->vmcb->control.exit_int_info,
-                                       svm->vmcb->control.exit_int_info_err,
-                                       KVM_ISA_SVM);
+               trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM);
 
                vmexit = nested_svm_exit_special(svm);
 
@@ -3058,13 +3175,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
        if (nested_svm_virtualize_tpr(vcpu))
                return;
 
-       clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+       svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
 
        if (irr == -1)
                return;
 
        if (tpr >= irr)
-               set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+               svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
 bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
@@ -3252,7 +3369,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
        if (nested_svm_virtualize_tpr(vcpu))
                return;
 
-       if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
+       if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
                int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
                kvm_set_cr8(vcpu, cr8);
        }
@@ -3349,8 +3466,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
 
 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 {
-       if (!is_guest_mode(vcpu) &&
-           to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+       if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
            to_svm(vcpu)->vmcb->control.exit_info_1)
                return handle_fastpath_set_msr_irqoff(vcpu);
 
@@ -3415,7 +3531,6 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 
 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
-       fastpath_t exit_fastpath;
        struct vcpu_svm *svm = to_svm(vcpu);
 
        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
@@ -3456,9 +3571,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        clgi();
        kvm_load_guest_xsave_state(vcpu);
 
-       if (lapic_in_kernel(vcpu) &&
-               vcpu->arch.apic->lapic_timer.timer_advance_ns)
-               kvm_wait_lapic_expire(vcpu);
+       kvm_wait_lapic_expire(vcpu);
 
        /*
         * If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -3504,7 +3617,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        stgi();
 
        /* Any pending NMI will happen here */
-       exit_fastpath = svm_exit_handlers_fastpath(vcpu);
 
        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                kvm_after_interrupt(&svm->vcpu);
@@ -3518,6 +3630,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        }
 
        svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+       vmcb_mark_all_clean(svm->vmcb);
 
        /* if exit due to PF, check for async PF */
        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
@@ -3537,8 +3650,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
                svm_handle_mce(svm);
 
-       vmcb_mark_all_clean(svm->vmcb);
-       return exit_fastpath;
+       svm_complete_interrupts(svm);
+
+       if (is_guest_mode(vcpu))
+               return EXIT_FASTPATH_NONE;
+
+       return svm_exit_handlers_fastpath(vcpu);
 }
 
 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
@@ -3624,6 +3741,9 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
        svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
                             guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
 
+       /* Check again if INVPCID interception is required */
+       svm_check_invpcid(svm);
+
        if (!kvm_vcpu_apicv_active(vcpu))
                return;
 
@@ -3738,7 +3858,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
                break;
        case SVM_EXIT_WRITE_CR0: {
                unsigned long cr0, val;
-               u64 intercept;
 
                if (info->intercept == x86_intercept_cr_write)
                        icpt_info.exit_code += info->modrm_reg;
@@ -3747,9 +3866,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
                    info->intercept == x86_intercept_clts)
                        break;
 
-               intercept = svm->nested.ctl.intercept;
-
-               if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
+               if (!(vmcb_is_intercept(&svm->nested.ctl,
+                                       INTERCEPT_SELECTIVE_CR0)))
                        break;
 
                cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
@@ -3884,7 +4002,7 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
                /* FED8h - SVM Guest */
                put_smstate(u64, smstate, 0x7ed8, 1);
                /* FEE0h - SVM Guest VMCB Physical Address */
-               put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
+               put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
 
                svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
                svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3900,21 +4018,28 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
 static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *nested_vmcb;
        struct kvm_host_map map;
-       u64 guest;
-       u64 vmcb;
        int ret = 0;
 
-       guest = GET_SMSTATE(u64, smstate, 0x7ed8);
-       vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
+       if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+               u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
+               u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
+               u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
 
-       if (guest) {
-               if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
-                       return 1;
-               nested_vmcb = map.hva;
-               ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb);
-               kvm_vcpu_unmap(&svm->vcpu, &map, true);
+               if (guest) {
+                       if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+                               return 1;
+
+                       if (!(saved_efer & EFER_SVME))
+                               return 1;
+
+                       if (kvm_vcpu_map(&svm->vcpu,
+                                        gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
+                               return 1;
+
+                       ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
+                       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+               }
        }
 
        return ret;
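
For readability, the 64-bit SMRAM state-save offsets consulted above, collected in one place. The macro names are hypothetical; the offsets and their meaning are taken from the code and comments in svm_pre_enter_smm()/svm_pre_leave_smm().

	/* Hypothetical names; offsets as used by the SMM enter/leave paths above. */
	#define SMRAM64_SVM_SAVED_EFER	0x7ed0	/* checked for EFER_SVME on RSM         */
	#define SMRAM64_SVM_GUEST_FLAG	0x7ed8	/* non-zero: RSM re-enters the L2 guest */
	#define SMRAM64_SVM_VMCB12_GPA	0x7ee0	/* guest-physical address of vmcb12     */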
@@ -3933,19 +4058,10 @@ static void enable_smi_window(struct kvm_vcpu *vcpu)
        }
 }
 
-static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
 {
-       unsigned long cr4 = kvm_read_cr4(vcpu);
-       bool smep = cr4 & X86_CR4_SMEP;
-       bool smap = cr4 & X86_CR4_SMAP;
-       bool is_user = svm_get_cpl(vcpu) == 3;
-
-       /*
-        * If RIP is invalid, go ahead with emulation which will cause an
-        * internal error exit.
-        */
-       if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
-               return true;
+       bool smep, smap, is_user;
+       unsigned long cr4;
 
        /*
         * Detect and work around Errata 1096 Fam_17h_00_0Fh.
@@ -3987,6 +4103,20 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
         * instruction pointer so we will not be able to work around it. Let's
         * print the error and request to kill the guest.
         */
+       if (likely(!insn || insn_len))
+               return true;
+
+       /*
+        * If RIP is invalid, go ahead with emulation which will cause an
+        * internal error exit.
+        */
+       if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
+               return true;
+
+       cr4 = kvm_read_cr4(vcpu);
+       smep = cr4 & X86_CR4_SMEP;
+       smap = cr4 & X86_CR4_SMAP;
+       is_user = svm_get_cpl(vcpu) == 3;
        if (smap && (!smep || is_user)) {
                if (!sev_guest(vcpu->kvm))
                        return true;
@@ -4010,7 +4140,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
         * if an INIT signal is pending.
         */
        return !gif_set(svm) ||
-                  (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
+                  (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
 }
 
 static void svm_vm_destroy(struct kvm *kvm)
@@ -4148,9 +4278,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .mem_enc_reg_region = svm_register_enc_region,
        .mem_enc_unreg_region = svm_unregister_enc_region,
 
-       .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
+       .can_emulate_instruction = svm_can_emulate_instruction,
 
        .apic_init_signal_blocked = svm_apic_init_signal_blocked,
+
+       .msr_filter_changed = svm_msr_filter_changed,
 };
 
 static struct kvm_x86_init_ops svm_init_ops __initdata = {
@@ -4164,6 +4296,8 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
 
 static int __init svm_init(void)
 {
+       __unused_size_checks();
+
        return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
                        __alignof__(struct vcpu_svm), THIS_MODULE);
 }
index a798e17..a7f9974 100644 (file)
@@ -31,6 +31,7 @@ static const u32 host_save_user_msrs[] = {
 
 #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
 
+#define MAX_DIRECT_ACCESS_MSRS 15
 #define MSRPM_OFFSETS  16
 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 extern bool npt_enabled;
@@ -85,8 +86,7 @@ struct svm_nested_state {
        struct vmcb *hsave;
        u64 hsave_msr;
        u64 vm_cr_msr;
-       u64 vmcb;
-       u32 host_intercept_exceptions;
+       u64 vmcb12_gpa;
 
        /* These are the merged vectors */
        u32 *msrpm;
@@ -158,6 +158,12 @@ struct vcpu_svm {
         */
        struct list_head ir_list;
        spinlock_t ir_list_lock;
+
+       /* Save desired MSR intercept (read: pass-through) state */
+       struct {
+               DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS);
+               DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
+       } shadow_msr_intercept;
 };
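
A minimal sketch (hypothetical helper, not from the patch) of how the new shadow_msr_intercept bitmaps can record the desired pass-through state for one direct-access MSR slot, assuming a set bit means the access is allowed (not intercepted):

	static inline void sketch_record_msr_passthrough(struct vcpu_svm *svm, int slot,
							 bool allow_read, bool allow_write)
	{
		/* slot is assumed to index the direct-access MSR list (< MAX_DIRECT_ACCESS_MSRS). */
		if (allow_read)
			set_bit(slot, svm->shadow_msr_intercept.read);
		else
			clear_bit(slot, svm->shadow_msr_intercept.read);

		if (allow_write)
			set_bit(slot, svm->shadow_msr_intercept.write);
		else
			clear_bit(slot, svm->shadow_msr_intercept.write);
	}

Keeping this shadow copy appears intended to give svm_msr_filter_changed(), wired into svm_x86_ops above, a record from which to reapply the desired interception after a userspace MSR filter update.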
 
 struct svm_cpu_data {
@@ -214,51 +220,44 @@ static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
                return svm->vmcb;
 }
 
-static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
-
-       vmcb->control.intercept_cr |= (1U << bit);
-
-       recalc_intercepts(svm);
+       WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+       __set_bit(bit, (unsigned long *)&control->intercepts);
 }
 
-static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
-
-       vmcb->control.intercept_cr &= ~(1U << bit);
-
-       recalc_intercepts(svm);
+       WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+       __clear_bit(bit, (unsigned long *)&control->intercepts);
 }
 
-static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
-
-       return vmcb->control.intercept_cr & (1U << bit);
+       WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+       return test_bit(bit, (unsigned long *)&control->intercepts);
 }
 
 static inline void set_dr_intercepts(struct vcpu_svm *svm)
 {
        struct vmcb *vmcb = get_host_vmcb(svm);
 
-       vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
-               | (1 << INTERCEPT_DR1_READ)
-               | (1 << INTERCEPT_DR2_READ)
-               | (1 << INTERCEPT_DR3_READ)
-               | (1 << INTERCEPT_DR4_READ)
-               | (1 << INTERCEPT_DR5_READ)
-               | (1 << INTERCEPT_DR6_READ)
-               | (1 << INTERCEPT_DR7_READ)
-               | (1 << INTERCEPT_DR0_WRITE)
-               | (1 << INTERCEPT_DR1_WRITE)
-               | (1 << INTERCEPT_DR2_WRITE)
-               | (1 << INTERCEPT_DR3_WRITE)
-               | (1 << INTERCEPT_DR4_WRITE)
-               | (1 << INTERCEPT_DR5_WRITE)
-               | (1 << INTERCEPT_DR6_WRITE)
-               | (1 << INTERCEPT_DR7_WRITE);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
 
        recalc_intercepts(svm);
 }
@@ -267,25 +266,27 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 {
        struct vmcb *vmcb = get_host_vmcb(svm);
 
-       vmcb->control.intercept_dr = 0;
+       vmcb->control.intercepts[INTERCEPT_DR] = 0;
 
        recalc_intercepts(svm);
 }
 
-static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
 {
        struct vmcb *vmcb = get_host_vmcb(svm);
 
-       vmcb->control.intercept_exceptions |= (1U << bit);
+       WARN_ON_ONCE(bit >= 32);
+       vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
 
        recalc_intercepts(svm);
 }
 
-static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
+static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
 {
        struct vmcb *vmcb = get_host_vmcb(svm);
 
-       vmcb->control.intercept_exceptions &= ~(1U << bit);
+       WARN_ON_ONCE(bit >= 32);
+       vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
 
        recalc_intercepts(svm);
 }
@@ -294,7 +295,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
 {
        struct vmcb *vmcb = get_host_vmcb(svm);
 
-       vmcb->control.intercept |= (1ULL << bit);
+       vmcb_set_intercept(&vmcb->control, bit);
 
        recalc_intercepts(svm);
 }
@@ -303,14 +304,14 @@ static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
 {
        struct vmcb *vmcb = get_host_vmcb(svm);
 
-       vmcb->control.intercept &= ~(1ULL << bit);
+       vmcb_clr_intercept(&vmcb->control, bit);
 
        recalc_intercepts(svm);
 }
 
 static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
 {
-       return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
+       return vmcb_is_intercept(&svm->vmcb->control, bit);
 }
 
 static inline bool vgif_enabled(struct vcpu_svm *svm)
@@ -345,7 +346,7 @@ static inline bool gif_set(struct vcpu_svm *svm)
 /* svm.c */
 #define MSR_CR3_LEGACY_RESERVED_MASK           0xfe7U
 #define MSR_CR3_LEGACY_PAE_RESERVED_MASK       0x7U
-#define MSR_CR3_LONG_RESERVED_MASK             0xfff0000000000fe7U
+#define MSR_CR3_LONG_MBZ_MASK                  0xfff0000000000000U
 #define MSR_INVALID                            0xffffffffU
 
 u32 svm_msrpm_offset(u32 msr);
@@ -374,17 +375,17 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu)
 
 static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
 {
-       return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_SMI));
+       return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
 }
 
 static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
 {
-       return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INTR));
+       return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
 }
 
 static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
 {
-       return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI));
+       return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
 }
 
 int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
index b66432b..aef960f 100644 (file)
  * Tracepoint for guest mode entry.
  */
 TRACE_EVENT(kvm_entry,
-       TP_PROTO(unsigned int vcpu_id),
-       TP_ARGS(vcpu_id),
+       TP_PROTO(struct kvm_vcpu *vcpu),
+       TP_ARGS(vcpu),
 
        TP_STRUCT__entry(
                __field(        unsigned int,   vcpu_id         )
+               __field(        unsigned long,  rip             )
        ),
 
        TP_fast_assign(
-               __entry->vcpu_id        = vcpu_id;
+               __entry->vcpu_id        = vcpu->vcpu_id;
+               __entry->rip            = kvm_rip_read(vcpu);
        ),
 
-       TP_printk("vcpu %u", __entry->vcpu_id)
+       TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
 );
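
Call-site sketch (illustrative): with the tracepoint taking the vcpu itself, callers pass the vcpu pointer and the tracepoint pulls vcpu_id and RIP out of it, e.g.:

	/* Hypothetical call site; the old form was trace_kvm_entry(vcpu->vcpu_id). */
	trace_kvm_entry(vcpu);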
 
 /*
@@ -233,36 +235,45 @@ TRACE_EVENT(kvm_apic,
        (isa == KVM_ISA_VMX) ?                                          \
        __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : ""
 
+#define TRACE_EVENT_KVM_EXIT(name)                                          \
+TRACE_EVENT(name,                                                           \
+       TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),  \
+       TP_ARGS(exit_reason, vcpu, isa),                                     \
+                                                                            \
+       TP_STRUCT__entry(                                                    \
+               __field(        unsigned int,   exit_reason     )            \
+               __field(        unsigned long,  guest_rip       )            \
+               __field(        u32,            isa             )            \
+               __field(        u64,            info1           )            \
+               __field(        u64,            info2           )            \
+               __field(        u32,            intr_info       )            \
+               __field(        u32,            error_code      )            \
+               __field(        unsigned int,   vcpu_id         )            \
+       ),                                                                   \
+                                                                            \
+       TP_fast_assign(                                                      \
+               __entry->exit_reason    = exit_reason;                       \
+               __entry->guest_rip      = kvm_rip_read(vcpu);                \
+               __entry->isa            = isa;                               \
+               __entry->vcpu_id        = vcpu->vcpu_id;                     \
+               kvm_x86_ops.get_exit_info(vcpu, &__entry->info1,             \
+                                         &__entry->info2,                   \
+                                         &__entry->intr_info,               \
+                                         &__entry->error_code);             \
+       ),                                                                   \
+                                                                            \
+       TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx "         \
+                 "info2 0x%016llx intr_info 0x%08x error_code 0x%08x",      \
+                 __entry->vcpu_id,                                          \
+                 kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \
+                 __entry->guest_rip, __entry->info1, __entry->info2,        \
+                 __entry->intr_info, __entry->error_code)                   \
+)
+
 /*
  * Tracepoint for kvm guest exit:
  */
-TRACE_EVENT(kvm_exit,
-       TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
-       TP_ARGS(exit_reason, vcpu, isa),
-
-       TP_STRUCT__entry(
-               __field(        unsigned int,   exit_reason     )
-               __field(        unsigned long,  guest_rip       )
-               __field(        u32,            isa             )
-               __field(        u64,            info1           )
-               __field(        u64,            info2           )
-               __field(        unsigned int,   vcpu_id         )
-       ),
-
-       TP_fast_assign(
-               __entry->exit_reason    = exit_reason;
-               __entry->guest_rip      = kvm_rip_read(vcpu);
-               __entry->isa            = isa;
-               __entry->vcpu_id        = vcpu->vcpu_id;
-               kvm_x86_ops.get_exit_info(vcpu, &__entry->info1,
-                                          &__entry->info2);
-       ),
-
-       TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx",
-                 __entry->vcpu_id,
-                 kvm_print_exit_reason(__entry->exit_reason, __entry->isa),
-                 __entry->guest_rip, __entry->info1, __entry->info2)
-);
+TRACE_EVENT_KVM_EXIT(kvm_exit);
 
 /*
  * Tracepoint for kvm interrupt injection:
@@ -544,63 +555,38 @@ TRACE_EVENT(kvm_nested_vmrun,
 );
 
 TRACE_EVENT(kvm_nested_intercepts,
-           TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
-           TP_ARGS(cr_read, cr_write, exceptions, intercept),
+           TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions,
+                    __u32 intercept1, __u32 intercept2, __u32 intercept3),
+           TP_ARGS(cr_read, cr_write, exceptions, intercept1,
+                   intercept2, intercept3),
 
        TP_STRUCT__entry(
                __field(        __u16,          cr_read         )
                __field(        __u16,          cr_write        )
                __field(        __u32,          exceptions      )
-               __field(        __u64,          intercept       )
+               __field(        __u32,          intercept1      )
+               __field(        __u32,          intercept2      )
+               __field(        __u32,          intercept3      )
        ),
 
        TP_fast_assign(
                __entry->cr_read        = cr_read;
                __entry->cr_write       = cr_write;
                __entry->exceptions     = exceptions;
-               __entry->intercept      = intercept;
+               __entry->intercept1     = intercept1;
+               __entry->intercept2     = intercept2;
+               __entry->intercept3     = intercept3;
        ),
 
-       TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
-               __entry->cr_read, __entry->cr_write, __entry->exceptions,
-               __entry->intercept)
+       TP_printk("cr_read: %04x cr_write: %04x excp: %08x "
+                 "intercepts: %08x %08x %08x",
+                 __entry->cr_read, __entry->cr_write, __entry->exceptions,
+                 __entry->intercept1, __entry->intercept2, __entry->intercept3)
 );
 /*
  * Tracepoint for #VMEXIT while nested
  */
-TRACE_EVENT(kvm_nested_vmexit,
-           TP_PROTO(__u64 rip, __u32 exit_code,
-                    __u64 exit_info1, __u64 exit_info2,
-                    __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
-           TP_ARGS(rip, exit_code, exit_info1, exit_info2,
-                   exit_int_info, exit_int_info_err, isa),
-
-       TP_STRUCT__entry(
-               __field(        __u64,          rip                     )
-               __field(        __u32,          exit_code               )
-               __field(        __u64,          exit_info1              )
-               __field(        __u64,          exit_info2              )
-               __field(        __u32,          exit_int_info           )
-               __field(        __u32,          exit_int_info_err       )
-               __field(        __u32,          isa                     )
-       ),
-
-       TP_fast_assign(
-               __entry->rip                    = rip;
-               __entry->exit_code              = exit_code;
-               __entry->exit_info1             = exit_info1;
-               __entry->exit_info2             = exit_info2;
-               __entry->exit_int_info          = exit_int_info;
-               __entry->exit_int_info_err      = exit_int_info_err;
-               __entry->isa                    = isa;
-       ),
-       TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx "
-                 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
-                 __entry->rip,
-                 kvm_print_exit_reason(__entry->exit_code, __entry->isa),
-                 __entry->exit_info1, __entry->exit_info2,
-                 __entry->exit_int_info, __entry->exit_int_info_err)
-);
+TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit);
 
 /*
  * Tracepoint for #VMEXIT reinjected to the guest
index 4bbd8b4..3a18614 100644 (file)
@@ -151,7 +151,7 @@ static inline bool vmx_umip_emulated(void)
 static inline bool cpu_has_vmx_rdtscp(void)
 {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
-               SECONDARY_EXEC_RDTSCP;
+               SECONDARY_EXEC_ENABLE_RDTSCP;
 }
 
 static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
@@ -196,7 +196,7 @@ static inline bool cpu_has_vmx_ple(void)
                SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 }
 
-static inline bool vmx_rdrand_supported(void)
+static inline bool cpu_has_vmx_rdrand(void)
 {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
                SECONDARY_EXEC_RDRAND_EXITING;
@@ -233,7 +233,7 @@ static inline bool cpu_has_vmx_encls_vmexit(void)
                SECONDARY_EXEC_ENCLS_EXITING;
 }
 
-static inline bool vmx_rdseed_supported(void)
+static inline bool cpu_has_vmx_rdseed(void)
 {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
                SECONDARY_EXEC_RDSEED_EXITING;
@@ -244,13 +244,13 @@ static inline bool cpu_has_vmx_pml(void)
        return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
 }
 
-static inline bool vmx_xsaves_supported(void)
+static inline bool cpu_has_vmx_xsaves(void)
 {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
                SECONDARY_EXEC_XSAVES;
 }
 
-static inline bool vmx_waitpkg_supported(void)
+static inline bool cpu_has_vmx_waitpkg(void)
 {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
                SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
index 23b58c2..6eca8a7 100644 (file)
@@ -233,6 +233,44 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
        vmx->nested.hv_evmcs = NULL;
 }
 
+static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
+                                    struct loaded_vmcs *prev)
+{
+       struct vmcs_host_state *dest, *src;
+
+       if (unlikely(!vmx->guest_state_loaded))
+               return;
+
+       src = &prev->host_state;
+       dest = &vmx->loaded_vmcs->host_state;
+
+       vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
+       dest->ldt_sel = src->ldt_sel;
+#ifdef CONFIG_X86_64
+       dest->ds_sel = src->ds_sel;
+       dest->es_sel = src->es_sel;
+#endif
+}
+
+static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct loaded_vmcs *prev;
+       int cpu;
+
+       if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
+               return;
+
+       cpu = get_cpu();
+       prev = vmx->loaded_vmcs;
+       vmx->loaded_vmcs = vmcs;
+       vmx_vcpu_load_vmcs(vcpu, cpu, prev);
+       vmx_sync_vmcs_host_state(vmx, prev);
+       put_cpu();
+
+       vmx_register_cache_reset(vcpu);
+}
+
 /*
  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
  * just stops using VMX.
@@ -241,10 +279,13 @@ static void free_nested(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+       if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
+               vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+
        if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
                return;
 
-       kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
        vmx->nested.vmxon = false;
        vmx->nested.smm.vmxon = false;
@@ -277,44 +318,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
        free_loaded_vmcs(&vmx->nested.vmcs02);
 }
 
-static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
-                                    struct loaded_vmcs *prev)
-{
-       struct vmcs_host_state *dest, *src;
-
-       if (unlikely(!vmx->guest_state_loaded))
-               return;
-
-       src = &prev->host_state;
-       dest = &vmx->loaded_vmcs->host_state;
-
-       vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
-       dest->ldt_sel = src->ldt_sel;
-#ifdef CONFIG_X86_64
-       dest->ds_sel = src->ds_sel;
-       dest->es_sel = src->es_sel;
-#endif
-}
-
-static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct loaded_vmcs *prev;
-       int cpu;
-
-       if (vmx->loaded_vmcs == vmcs)
-               return;
-
-       cpu = get_cpu();
-       prev = vmx->loaded_vmcs;
-       vmx->loaded_vmcs = vmcs;
-       vmx_vcpu_load_vmcs(vcpu, cpu, prev);
-       vmx_sync_vmcs_host_state(vmx, prev);
-       put_cpu();
-
-       vmx_register_cache_reset(vcpu);
-}
-
 /*
  * Ensure that the current vmcs of the logical processor is the
  * vmcs01 of the vcpu before calling free_nested().
@@ -323,8 +326,6 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
        vcpu_load(vcpu);
        vmx_leave_nested(vcpu);
-       vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
-       free_nested(vcpu);
        vcpu_put(vcpu);
 }
 
@@ -938,11 +939,11 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
         * VM-exit in L0, use the more accurate value.
         */
        if (msr_index == MSR_IA32_TSC) {
-               int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
-                                              MSR_IA32_TSC);
+               int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
+                                                   MSR_IA32_TSC);
 
-               if (index >= 0) {
-                       u64 val = vmx->msr_autostore.guest.val[index].value;
+               if (i >= 0) {
+                       u64 val = vmx->msr_autostore.guest.val[i].value;
 
                        *data = kvm_read_l1_tsc(vcpu, val);
                        return true;
@@ -1031,16 +1032,16 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
        bool in_vmcs12_store_list;
-       int msr_autostore_index;
+       int msr_autostore_slot;
        bool in_autostore_list;
        int last;
 
-       msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
-       in_autostore_list = msr_autostore_index >= 0;
+       msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
+       in_autostore_list = msr_autostore_slot >= 0;
        in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
 
        if (in_vmcs12_store_list && !in_autostore_list) {
-               if (autostore->nr == NR_LOADSTORE_MSRS) {
+               if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
                        /*
                         * Emulated VMEntry does not fail here.  Instead a less
                         * accurate value will be returned by
@@ -1057,7 +1058,7 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
                autostore->val[last].index = msr_index;
        } else if (!in_vmcs12_store_list && in_autostore_list) {
                last = --autostore->nr;
-               autostore->val[msr_autostore_index] = autostore->val[last];
+               autostore->val[msr_autostore_slot] = autostore->val[last];
        }
 }
 
@@ -2286,7 +2287,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                /* Take the following fields only from vmcs12 */
                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                                  SECONDARY_EXEC_ENABLE_INVPCID |
-                                 SECONDARY_EXEC_RDTSCP |
+                                 SECONDARY_EXEC_ENABLE_RDTSCP |
                                  SECONDARY_EXEC_XSAVES |
                                  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -2314,6 +2315,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                        vmcs_write16(GUEST_INTR_STATUS,
                                vmcs12->guest_intr_status);
 
+               if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+                   exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+
                secondary_exec_controls_set(vmx, exec_control);
        }
 
@@ -2408,6 +2412,8 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
                vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
                vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+               vmx->segment_cache.bitmask = 0;
        }
 
        if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -2571,7 +2577,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
         * which means L1 attempted VMEntry to L2 with invalid state.
         * Fail the VMEntry.
         */
-       if (vmx->emulation_required) {
+       if (CC(!vmx_guest_state_valid(vcpu))) {
                *entry_failure_code = ENTRY_FAIL_DEFAULT;
                return -EINVAL;
        }
@@ -3344,8 +3350,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
        prepare_vmcs02_early(vmx, vmcs12);
 
        if (from_vmentry) {
-               if (unlikely(!nested_get_vmcs12_pages(vcpu)))
+               if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
+                       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
                        return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
+               }
 
                if (nested_vmx_check_vmentry_hw(vcpu)) {
                        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
@@ -3387,7 +3395,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
                 * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
                 * have already been set at vmentry time and should not be reset.
                 */
-               kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+               kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
        }
 
        /*
@@ -3468,11 +3476,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        if (evmptrld_status == EVMPTRLD_ERROR) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
-       } else if (evmptrld_status == EVMPTRLD_VMFAIL) {
+       } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
                return nested_vmx_failInvalid(vcpu);
        }
 
-       if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
+       if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull))
                return nested_vmx_failInvalid(vcpu);
 
        vmcs12 = get_vmcs12(vcpu);
@@ -3483,7 +3491,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         * rather than RFLAGS.ZF, and no error number is stored to the
         * VM-instruction error field.
         */
-       if (vmcs12->hdr.shadow_vmcs)
+       if (CC(vmcs12->hdr.shadow_vmcs))
                return nested_vmx_failInvalid(vcpu);
 
        if (vmx->nested.hv_evmcs) {
@@ -3504,10 +3512,10 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         * for misconfigurations which will anyway be caught by the processor
         * when using the merged vmcs02.
         */
-       if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
+       if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
                return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
 
-       if (vmcs12->launch_state == launch)
+       if (CC(vmcs12->launch_state == launch))
                return nested_vmx_fail(vcpu,
                        launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
                               : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
@@ -3528,6 +3536,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        if (unlikely(status != NVMX_VMENTRY_SUCCESS))
                goto vmentry_failed;
 
+       /* Emulate processing of posted interrupts on VM-Enter. */
+       if (nested_cpu_has_posted_intr(vmcs12) &&
+           kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
+               vmx->nested.pi_pending = true;
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+               kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
+       }
+
        /* Hide L1D cache contents from the nested guest.  */
        vmx->vcpu.arch.l1tf_flush_l1d = true;
 
@@ -4257,7 +4273,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 
 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
 {
-       struct shared_msr_entry *efer_msr;
+       struct vmx_uret_msr *efer_msr;
        unsigned int i;
 
        if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
@@ -4271,7 +4287,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
                        return vmx->msr_autoload.guest.val[i].value;
        }
 
-       efer_msr = find_msr_entry(vmx, MSR_EFER);
+       efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
        if (efer_msr)
                return efer_msr->data;
 
@@ -4404,6 +4420,14 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
                kvm_vcpu_flush_tlb_current(vcpu);
 
+       /*
+        * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
+        * now and the new vmentry.  Ensure that the VMCS02 PDPTR fields are
+        * up-to-date before switching to L1.
+        */
+       if (enable_ept && is_pae_paging(vcpu))
+               vmx_ept_load_pdptrs(vcpu);
+
        leave_guest_mode(vcpu);
 
        if (nested_cpu_has_preemption_timer(vmcs12))
@@ -4668,7 +4692,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
                vmx->nested.msrs.entry_ctls_high &=
                                ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
                vmx->nested.msrs.exit_ctls_high &=
-                               ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+                               ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
        }
 }
 
@@ -4688,7 +4712,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
 
        r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
        if (r != X86EMUL_CONTINUE) {
-               *ret = vmx_handle_memory_failure(vcpu, r, &e);
+               *ret = kvm_handle_memory_failure(vcpu, r, &e);
                return -EINVAL;
        }
 
@@ -4752,7 +4776,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 
        if (vmx_pt_mode_is_host_guest()) {
                vmx->pt_desc.guest.ctl = 0;
-               pt_update_intercept_for_msr(vmx);
+               pt_update_intercept_for_msr(vcpu);
        }
 
        return 0;
@@ -4995,7 +5019,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                /* _system ok, nested_vmx_check_permission has verified cpl=0 */
                r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
                if (r != X86EMUL_CONTINUE)
-                       return vmx_handle_memory_failure(vcpu, r, &e);
+                       return kvm_handle_memory_failure(vcpu, r, &e);
        }
 
        return nested_vmx_succeed(vcpu);
@@ -5068,7 +5092,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                        return 1;
                r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
                if (r != X86EMUL_CONTINUE)
-                       return vmx_handle_memory_failure(vcpu, r, &e);
+                       return kvm_handle_memory_failure(vcpu, r, &e);
        }
 
        field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
@@ -5230,7 +5254,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
        r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
                                        sizeof(gpa_t), &e);
        if (r != X86EMUL_CONTINUE)
-               return vmx_handle_memory_failure(vcpu, r, &e);
+               return kvm_handle_memory_failure(vcpu, r, &e);
 
        return nested_vmx_succeed(vcpu);
 }
@@ -5283,7 +5307,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
                return 1;
        r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
        if (r != X86EMUL_CONTINUE)
-               return vmx_handle_memory_failure(vcpu, r, &e);
+               return kvm_handle_memory_failure(vcpu, r, &e);
 
        /*
         * Nested EPT roots are always held through guest_mmu,
@@ -5365,7 +5389,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                return 1;
        r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
        if (r != X86EMUL_CONTINUE)
-               return vmx_handle_memory_failure(vcpu, r, &e);
+               return kvm_handle_memory_failure(vcpu, r, &e);
 
        if (operand.vpid >> 16)
                return nested_vmx_fail(vcpu,
@@ -5910,13 +5934,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
                goto reflect_vmexit;
        }
 
-       exit_intr_info = vmx_get_intr_info(vcpu);
-       exit_qual = vmx_get_exit_qual(vcpu);
-
-       trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual,
-                               vmx->idt_vectoring_info, exit_intr_info,
-                               vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
-                               KVM_ISA_VMX);
+       trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX);
 
        /* If L0 (KVM) wants the exit, it trumps L1's desires. */
        if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
@@ -5932,14 +5950,14 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
         * need to be synthesized by querying the in-kernel LAPIC, but external
         * interrupts are never reflected to L1 so it's a non-issue.
         */
-       if ((exit_intr_info &
-            (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
-           (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+       exit_intr_info = vmx_get_intr_info(vcpu);
+       if (is_exception_with_error_code(exit_intr_info)) {
                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
                vmcs12->vm_exit_intr_error_code =
                        vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
        }
+       exit_qual = vmx_get_exit_qual(vcpu);
 
 reflect_vmexit:
        nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
@@ -6174,7 +6192,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
                 * restored yet. EVMCS will be mapped from
                 * nested_get_vmcs12_pages().
                 */
-               kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+               kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
        } else {
                return -EINVAL;
        }
@@ -6310,7 +6328,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
 #ifdef CONFIG_X86_64
                VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+               VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
        msrs->exit_ctls_high |=
                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
@@ -6329,7 +6348,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
 #ifdef CONFIG_X86_64
                VM_ENTRY_IA32E_MODE |
 #endif
-               VM_ENTRY_LOAD_IA32_PAT;
+               VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
+               VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
        msrs->entry_ctls_high |=
                (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
 
@@ -6383,7 +6403,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
        msrs->secondary_ctls_low = 0;
        msrs->secondary_ctls_high &=
                SECONDARY_EXEC_DESC |
-               SECONDARY_EXEC_RDTSCP |
+               SECONDARY_EXEC_ENABLE_RDTSCP |
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
                SECONDARY_EXEC_WBINVD_EXITING |
                SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -6553,7 +6573,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
        .hv_timer_pending = nested_vmx_preemption_timer_pending,
        .get_state = vmx_get_nested_state,
        .set_state = vmx_set_nested_state,
-       .get_vmcs12_pages = nested_get_vmcs12_pages,
+       .get_nested_state_pages = nested_get_vmcs12_pages,
        .write_log_dirty = nested_vmx_write_pml_buffer,
        .enable_evmcs = nested_enable_evmcs,
        .get_evmcs_version = nested_get_evmcs_version,
diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
deleted file mode 100644 (file)
index 692b0c3..0000000
+++ /dev/null
@@ -1,320 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __KVM_X86_VMX_INSN_H
-#define __KVM_X86_VMX_INSN_H
-
-#include <linux/nospec.h>
-
-#include <asm/kvm_host.h>
-#include <asm/vmx.h>
-
-#include "evmcs.h"
-#include "vmcs.h"
-
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
-asmlinkage void vmread_error(unsigned long field, bool fault);
-__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
-                                                        bool fault);
-void vmwrite_error(unsigned long field, unsigned long value);
-void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
-void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
-void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
-void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
-
-static __always_inline void vmcs_check16(unsigned long field)
-{
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
-                        "16-bit accessor invalid for 64-bit field");
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
-                        "16-bit accessor invalid for 64-bit high field");
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
-                        "16-bit accessor invalid for 32-bit high field");
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
-                        "16-bit accessor invalid for natural width field");
-}
-
-static __always_inline void vmcs_check32(unsigned long field)
-{
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
-                        "32-bit accessor invalid for 16-bit field");
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
-                        "32-bit accessor invalid for natural width field");
-}
-
-static __always_inline void vmcs_check64(unsigned long field)
-{
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
-                        "64-bit accessor invalid for 16-bit field");
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
-                        "64-bit accessor invalid for 64-bit high field");
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
-                        "64-bit accessor invalid for 32-bit field");
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
-                        "64-bit accessor invalid for natural width field");
-}
-
-static __always_inline void vmcs_checkl(unsigned long field)
-{
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
-                        "Natural width accessor invalid for 16-bit field");
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
-                        "Natural width accessor invalid for 64-bit field");
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
-                        "Natural width accessor invalid for 64-bit high field");
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
-                        "Natural width accessor invalid for 32-bit field");
-}
-
-static __always_inline unsigned long __vmcs_readl(unsigned long field)
-{
-       unsigned long value;
-
-       asm volatile("1: vmread %2, %1\n\t"
-                    ".byte 0x3e\n\t" /* branch taken hint */
-                    "ja 3f\n\t"
-
-                    /*
-                     * VMREAD failed.  Push '0' for @fault, push the failing
-                     * @field, and bounce through the trampoline to preserve
-                     * volatile registers.
-                     */
-                    "push $0\n\t"
-                    "push %2\n\t"
-                    "2:call vmread_error_trampoline\n\t"
-
-                    /*
-                     * Unwind the stack.  Note, the trampoline zeros out the
-                     * memory for @fault so that the result is '0' on error.
-                     */
-                    "pop %2\n\t"
-                    "pop %1\n\t"
-                    "3:\n\t"
-
-                    /* VMREAD faulted.  As above, except push '1' for @fault. */
-                    ".pushsection .fixup, \"ax\"\n\t"
-                    "4: push $1\n\t"
-                    "push %2\n\t"
-                    "jmp 2b\n\t"
-                    ".popsection\n\t"
-                    _ASM_EXTABLE(1b, 4b)
-                    : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc");
-       return value;
-}
-
-static __always_inline u16 vmcs_read16(unsigned long field)
-{
-       vmcs_check16(field);
-       if (static_branch_unlikely(&enable_evmcs))
-               return evmcs_read16(field);
-       return __vmcs_readl(field);
-}
-
-static __always_inline u32 vmcs_read32(unsigned long field)
-{
-       vmcs_check32(field);
-       if (static_branch_unlikely(&enable_evmcs))
-               return evmcs_read32(field);
-       return __vmcs_readl(field);
-}
-
-static __always_inline u64 vmcs_read64(unsigned long field)
-{
-       vmcs_check64(field);
-       if (static_branch_unlikely(&enable_evmcs))
-               return evmcs_read64(field);
-#ifdef CONFIG_X86_64
-       return __vmcs_readl(field);
-#else
-       return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
-#endif
-}
-
-static __always_inline unsigned long vmcs_readl(unsigned long field)
-{
-       vmcs_checkl(field);
-       if (static_branch_unlikely(&enable_evmcs))
-               return evmcs_read64(field);
-       return __vmcs_readl(field);
-}
-
-#define vmx_asm1(insn, op1, error_args...)                             \
-do {                                                                   \
-       asm_volatile_goto("1: " __stringify(insn) " %0\n\t"             \
-                         ".byte 0x2e\n\t" /* branch not taken hint */  \
-                         "jna %l[error]\n\t"                           \
-                         _ASM_EXTABLE(1b, %l[fault])                   \
-                         : : op1 : "cc" : error, fault);               \
-       return;                                                         \
-error:                                                                 \
-       instrumentation_begin();                                        \
-       insn##_error(error_args);                                       \
-       instrumentation_end();                                          \
-       return;                                                         \
-fault:                                                                 \
-       kvm_spurious_fault();                                           \
-} while (0)
-
-#define vmx_asm2(insn, op1, op2, error_args...)                                \
-do {                                                                   \
-       asm_volatile_goto("1: "  __stringify(insn) " %1, %0\n\t"        \
-                         ".byte 0x2e\n\t" /* branch not taken hint */  \
-                         "jna %l[error]\n\t"                           \
-                         _ASM_EXTABLE(1b, %l[fault])                   \
-                         : : op1, op2 : "cc" : error, fault);          \
-       return;                                                         \
-error:                                                                 \
-       instrumentation_begin();                                        \
-       insn##_error(error_args);                                       \
-       instrumentation_end();                                          \
-       return;                                                         \
-fault:                                                                 \
-       kvm_spurious_fault();                                           \
-} while (0)
-
-static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
-{
-       vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value);
-}
-
-static __always_inline void vmcs_write16(unsigned long field, u16 value)
-{
-       vmcs_check16(field);
-       if (static_branch_unlikely(&enable_evmcs))
-               return evmcs_write16(field, value);
-
-       __vmcs_writel(field, value);
-}
-
-static __always_inline void vmcs_write32(unsigned long field, u32 value)
-{
-       vmcs_check32(field);
-       if (static_branch_unlikely(&enable_evmcs))
-               return evmcs_write32(field, value);
-
-       __vmcs_writel(field, value);
-}
-
-static __always_inline void vmcs_write64(unsigned long field, u64 value)
-{
-       vmcs_check64(field);
-       if (static_branch_unlikely(&enable_evmcs))
-               return evmcs_write64(field, value);
-
-       __vmcs_writel(field, value);
-#ifndef CONFIG_X86_64
-       __vmcs_writel(field+1, value >> 32);
-#endif
-}
-
-static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
-{
-       vmcs_checkl(field);
-       if (static_branch_unlikely(&enable_evmcs))
-               return evmcs_write64(field, value);
-
-       __vmcs_writel(field, value);
-}
-
-static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
-{
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
-                        "vmcs_clear_bits does not support 64-bit fields");
-       if (static_branch_unlikely(&enable_evmcs))
-               return evmcs_write32(field, evmcs_read32(field) & ~mask);
-
-       __vmcs_writel(field, __vmcs_readl(field) & ~mask);
-}
-
-static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
-{
-       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
-                        "vmcs_set_bits does not support 64-bit fields");
-       if (static_branch_unlikely(&enable_evmcs))
-               return evmcs_write32(field, evmcs_read32(field) | mask);
-
-       __vmcs_writel(field, __vmcs_readl(field) | mask);
-}
-
-static inline void vmcs_clear(struct vmcs *vmcs)
-{
-       u64 phys_addr = __pa(vmcs);
-
-       vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr);
-}
-
-static inline void vmcs_load(struct vmcs *vmcs)
-{
-       u64 phys_addr = __pa(vmcs);
-
-       if (static_branch_unlikely(&enable_evmcs))
-               return evmcs_load(phys_addr);
-
-       vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr);
-}
-
-static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
-{
-       struct {
-               u64 vpid : 16;
-               u64 rsvd : 48;
-               u64 gva;
-       } operand = { vpid, 0, gva };
-
-       vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva);
-}
-
-static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
-{
-       struct {
-               u64 eptp, gpa;
-       } operand = {eptp, gpa};
-
-       vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
-}
-
-static inline void vpid_sync_vcpu_single(int vpid)
-{
-       if (vpid == 0)
-               return;
-
-       __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
-}
-
-static inline void vpid_sync_vcpu_global(void)
-{
-       __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
-}
-
-static inline void vpid_sync_context(int vpid)
-{
-       if (cpu_has_vmx_invvpid_single())
-               vpid_sync_vcpu_single(vpid);
-       else if (vpid != 0)
-               vpid_sync_vcpu_global();
-}
-
-static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr)
-{
-       if (vpid == 0)
-               return;
-
-       if (cpu_has_vmx_invvpid_individual_addr())
-               __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
-       else
-               vpid_sync_context(vpid);
-}
-
-static inline void ept_sync_global(void)
-{
-       __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
-}
-
-static inline void ept_sync_context(u64 eptp)
-{
-       if (cpu_has_vmx_invept_context())
-               __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
-       else
-               ept_sync_global();
-}
-
-#endif /* __KVM_X86_VMX_INSN_H */
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
new file mode 100644 (file)
index 0000000..e4e7adf
--- /dev/null
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kvm_host.h>
+
+#include <asm/irq_remapping.h>
+#include <asm/cpu.h>
+
+#include "lapic.h"
+#include "posted_intr.h"
+#include "trace.h"
+#include "vmx.h"
+
+/*
+ * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
+ * can find which vCPU should be woken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
+static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+       return &(to_vmx(vcpu)->pi_desc);
+}
+
+void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+       struct pi_desc old, new;
+       unsigned int dest;
+
+       /*
+        * In case of hot-plug or hot-unplug, we may have to undo
+        * vmx_vcpu_pi_put even if there is no assigned device.  And we
+        * always keep PI.NDST up to date for simplicity: it makes the
+        * code easier, and CPU migration is not a fast path.
+        */
+       if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+               return;
+
+       /*
+        * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+        * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
+        * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
+        * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
+        * correctly.
+        */
+       if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
+               pi_clear_sn(pi_desc);
+               goto after_clear_sn;
+       }
+
+       /* The full case.  */
+       do {
+               old.control = new.control = pi_desc->control;
+
+               dest = cpu_physical_id(cpu);
+
+               if (x2apic_enabled())
+                       new.ndst = dest;
+               else
+                       new.ndst = (dest << 8) & 0xFF00;
+
+               new.sn = 0;
+       } while (cmpxchg64(&pi_desc->control, old.control,
+                          new.control) != old.control);
+
+after_clear_sn:
+
+       /*
+        * Clear SN before reading the bitmap.  The VT-d firmware
+        * writes the bitmap and reads SN atomically (5.2.3 in the
+        * spec), so it doesn't really have a memory barrier that
+        * pairs with this, but we cannot do that and we need one.
+        */
+       smp_mb__after_atomic();
+
+       if (!pi_is_pir_empty(pi_desc))
+               pi_set_on(pi_desc);
+}
+
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
+               !kvm_vcpu_apicv_active(vcpu))
+               return;
+
+       /* Set SN when the vCPU is preempted */
+       if (vcpu->preempted)
+               pi_set_sn(pi_desc);
+}
+
+static void __pi_post_block(struct kvm_vcpu *vcpu)
+{
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+       struct pi_desc old, new;
+       unsigned int dest;
+
+       do {
+               old.control = new.control = pi_desc->control;
+               WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
+                    "Wakeup handler not enabled while the VCPU is blocked\n");
+
+               dest = cpu_physical_id(vcpu->cpu);
+
+               if (x2apic_enabled())
+                       new.ndst = dest;
+               else
+                       new.ndst = (dest << 8) & 0xFF00;
+
+               /* set 'NV' to 'notification vector' */
+               new.nv = POSTED_INTR_VECTOR;
+       } while (cmpxchg64(&pi_desc->control, old.control,
+                          new.control) != old.control);
+
+       if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
+               spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+               list_del(&vcpu->blocked_vcpu_list);
+               spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+               vcpu->pre_pcpu = -1;
+       }
+}
+
+/*
+ * This routine does the following things for a vCPU that is about to
+ * block, if VT-d PI is enabled:
+ * - Store the vCPU on the wakeup list, so that when an interrupt
+ *   arrives we can find the right vCPU to wake up.
+ * - Change the posted-interrupt descriptor as below:
+ *      'NDST' <-- vcpu->pre_pcpu
+ *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
+ * - If 'ON' is set during this process, meaning at least one interrupt
+ *   is posted for this vCPU, we cannot block it; in that case return 1,
+ *   otherwise return 0.
+ */
+int pi_pre_block(struct kvm_vcpu *vcpu)
+{
+       unsigned int dest;
+       struct pi_desc old, new;
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
+               !kvm_vcpu_apicv_active(vcpu))
+               return 0;
+
+       WARN_ON(irqs_disabled());
+       local_irq_disable();
+       if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
+               vcpu->pre_pcpu = vcpu->cpu;
+               spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+               list_add_tail(&vcpu->blocked_vcpu_list,
+                             &per_cpu(blocked_vcpu_on_cpu,
+                                      vcpu->pre_pcpu));
+               spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+       }
+
+       do {
+               old.control = new.control = pi_desc->control;
+
+               WARN((pi_desc->sn == 1),
+                    "Warning: SN field of posted-interrupts "
+                    "is set before blocking\n");
+
+               /*
+                * Since the vCPU can be preempted during this process,
+                * vcpu->cpu could differ from pre_pcpu.  Set pre_pcpu as
+                * the destination of the wakeup notification event, so
+                * that the wakeup handler can find the right vCPU to
+                * wake up if an interrupt arrives while the vCPU is
+                * blocked.
+                */
+               dest = cpu_physical_id(vcpu->pre_pcpu);
+
+               if (x2apic_enabled())
+                       new.ndst = dest;
+               else
+                       new.ndst = (dest << 8) & 0xFF00;
+
+               /* set 'NV' to 'wakeup vector' */
+               new.nv = POSTED_INTR_WAKEUP_VECTOR;
+       } while (cmpxchg64(&pi_desc->control, old.control,
+                          new.control) != old.control);
+
+       /* We should not block the vCPU if an interrupt is posted for it.  */
+       if (pi_test_on(pi_desc) == 1)
+               __pi_post_block(vcpu);
+
+       local_irq_enable();
+       return (vcpu->pre_pcpu == -1);
+}
+
+void pi_post_block(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->pre_pcpu == -1)
+               return;
+
+       WARN_ON(irqs_disabled());
+       local_irq_disable();
+       __pi_post_block(vcpu);
+       local_irq_enable();
+}
+
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+void pi_wakeup_handler(void)
+{
+       struct kvm_vcpu *vcpu;
+       int cpu = smp_processor_id();
+
+       spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+       list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+                       blocked_vcpu_list) {
+               struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+               if (pi_test_on(pi_desc) == 1)
+                       kvm_vcpu_kick(vcpu);
+       }
+       spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
+void __init pi_init(int cpu)
+{
+       INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+       spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
+bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+       return pi_test_on(pi_desc) ||
+               (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
+}
+
+/*
+ * pi_update_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
+                  bool set)
+{
+       struct kvm_kernel_irq_routing_entry *e;
+       struct kvm_irq_routing_table *irq_rt;
+       struct kvm_lapic_irq irq;
+       struct kvm_vcpu *vcpu;
+       struct vcpu_data vcpu_info;
+       int idx, ret = 0;
+
+       if (!kvm_arch_has_assigned_device(kvm) ||
+           !irq_remapping_cap(IRQ_POSTING_CAP) ||
+           !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+               return 0;
+
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+       if (guest_irq >= irq_rt->nr_rt_entries ||
+           hlist_empty(&irq_rt->map[guest_irq])) {
+               pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+                            guest_irq, irq_rt->nr_rt_entries);
+               goto out;
+       }
+
+       hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+               if (e->type != KVM_IRQ_ROUTING_MSI)
+                       continue;
+               /*
+                * VT-d PI cannot post multicast/broadcast interrupts to a
+                * vCPU, so we still use interrupt remapping for that kind
+                * of interrupt.
+                *
+                * For lowest-priority interrupts, we only support those
+                * with a single CPU as the destination, e.g. when the user
+                * configures the interrupts via /proc/irq or uses
+                * irqbalance to make the interrupts single-CPU.
+                *
+                * Full lowest-priority interrupt support may be added later.
+                *
+                * In addition, we can only inject generic interrupts using
+                * the PI mechanism; refuse to route others through it.
+                */
+
+               kvm_set_msi_irq(kvm, e, &irq);
+               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+                   !kvm_irq_is_postable(&irq)) {
+                       /*
+                        * Make sure the IRTE is in remapped mode if
+                        * we don't handle it in posted mode.
+                        */
+                       ret = irq_set_vcpu_affinity(host_irq, NULL);
+                       if (ret < 0) {
+                               printk(KERN_INFO
+                                  "failed to fall back to remapped mode, irq: %u\n",
+                                  host_irq);
+                               goto out;
+                       }
+
+                       continue;
+               }
+
+               vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc);
+               vcpu_info.vector = irq.vector;
+
+               trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
+                               vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+               if (set)
+                       ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+               else
+                       ret = irq_set_vcpu_affinity(host_irq, NULL);
+
+               if (ret < 0) {
+                       printk(KERN_INFO "%s: failed to update PI IRTE\n",
+                                       __func__);
+                       goto out;
+               }
+       }
+
+       ret = 0;
+out:
+       srcu_read_unlock(&kvm->irq_srcu, idx);
+       return ret;
+}
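
The pi_pre_block() comment above describes the protocol this file implements: a vCPU about to block is queued on the per-CPU blocked_vcpu_on_cpu list and the descriptor's notification vector is switched to POSTED_INTR_WAKEUP_VECTOR, so that pi_wakeup_handler() can kick it when a posted interrupt arrives. The following is a minimal, self-contained userspace sketch of that "park on a list, kick on notification" pattern only; it does not use the kernel's per-CPU or locking primitives, and the demo_* names are invented for illustration.

/*
 * Standalone illustration of the per-CPU blocked-vCPU list used by the
 * posted-interrupt wakeup path.  Userspace sketch, not kernel code.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_vcpu {
	int id;
	bool has_pending;          /* stands in for pi_desc ON */
	struct demo_vcpu *next;    /* stands in for blocked_vcpu_list */
};

static struct demo_vcpu *blocked_list;                 /* per-CPU in KVM */
static pthread_mutex_t blocked_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called before a vCPU blocks: make it findable by the wakeup path. */
static void demo_pre_block(struct demo_vcpu *v)
{
	pthread_mutex_lock(&blocked_lock);
	v->next = blocked_list;
	blocked_list = v;
	pthread_mutex_unlock(&blocked_lock);
}

/* Wakeup-vector handler: kick every blocked vCPU with a pending event. */
static void demo_wakeup_handler(void)
{
	struct demo_vcpu *v;

	pthread_mutex_lock(&blocked_lock);
	for (v = blocked_list; v; v = v->next)
		if (v->has_pending)
			printf("kick vCPU %d\n", v->id);
	pthread_mutex_unlock(&blocked_lock);
}

int main(void)
{
	struct demo_vcpu v = { .id = 0, .has_pending = true };

	demo_pre_block(&v);
	demo_wakeup_handler();   /* would be triggered by the wakeup vector */
	return 0;
}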
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
new file mode 100644 (file)
index 0000000..e53b97f
--- /dev/null
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_POSTED_INTR_H
+#define __KVM_X86_VMX_POSTED_INTR_H
+
+#define POSTED_INTR_ON  0
+#define POSTED_INTR_SN  1
+
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+       u32 pir[8];     /* Posted interrupt requested */
+       union {
+               struct {
+                               /* bit 256 - Outstanding Notification */
+                       u16     on      : 1,
+                               /* bit 257 - Suppress Notification */
+                               sn      : 1,
+                               /* bit 271:258 - Reserved */
+                               rsvd_1  : 14;
+                               /* bit 279:272 - Notification Vector */
+                       u8      nv;
+                               /* bit 287:280 - Reserved */
+                       u8      rsvd_2;
+                               /* bit 319:288 - Notification Destination */
+                       u32     ndst;
+               };
+               u64 control;
+       };
+       u32 rsvd[6];
+} __aligned(64);
+
+static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+       return test_and_set_bit(POSTED_INTR_ON,
+                       (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+       return test_and_clear_bit(POSTED_INTR_ON,
+                       (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+{
+       return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
+static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
+{
+       return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+       set_bit(POSTED_INTR_SN,
+               (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_on(struct pi_desc *pi_desc)
+{
+       set_bit(POSTED_INTR_ON,
+               (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+       clear_bit(POSTED_INTR_ON,
+               (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+       clear_bit(POSTED_INTR_SN,
+               (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_on(struct pi_desc *pi_desc)
+{
+       return test_bit(POSTED_INTR_ON,
+                       (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_sn(struct pi_desc *pi_desc)
+{
+       return test_bit(POSTED_INTR_SN,
+                       (unsigned long *)&pi_desc->control);
+}
+
+void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
+int pi_pre_block(struct kvm_vcpu *vcpu);
+void pi_post_block(struct kvm_vcpu *vcpu);
+void pi_wakeup_handler(void);
+void __init pi_init(int cpu);
+bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
+int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
+                  bool set);
+
+#endif /* __KVM_X86_VMX_POSTED_INTR_H */
\ No newline at end of file
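
The helpers above manipulate individual bits of the 64-byte descriptor with atomic bitops on pir[] and control. A sender posting a vector typically sets the vector bit in PIR first and only then sets ON, sending a notification only if ON was previously clear. The sketch below illustrates that ordering in standalone userspace C; it uses GCC/Clang __atomic builtins rather than the kernel bitops, simplifies the descriptor layout, and all demo_* names are invented.

/*
 * Userspace sketch of the "set PIR bit, then set ON, then notify" ordering
 * a sender follows when posting an interrupt.  Not the in-tree delivery path.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_pi_desc {
	uint64_t pir[4];	/* 256 vector bits */
	uint64_t control;	/* bit 0 = ON (simplified) */
};

static int demo_test_and_set_bit(int nr, uint64_t *addr)
{
	uint64_t mask = 1ULL << (nr % 64);
	uint64_t old = __atomic_fetch_or(&addr[nr / 64], mask, __ATOMIC_SEQ_CST);

	return !!(old & mask);
}

static void demo_post_vector(struct demo_pi_desc *pi, int vector)
{
	/* 1. Mark the vector as requested. */
	if (demo_test_and_set_bit(vector, pi->pir))
		return;			/* already pending */

	/* 2. Set ON; only send a notification if it was previously clear. */
	if (!demo_test_and_set_bit(0, &pi->control))
		printf("send notification IPI for vector %d\n", vector);
}

int main(void)
{
	struct demo_pi_desc pi = { 0 };

	demo_post_vector(&pi, 0x20);
	demo_post_vector(&pi, 0x20);	/* second post: no extra IPI */
	return 0;
}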
index 7a3675f..1472c6c 100644 (file)
@@ -138,6 +138,13 @@ static inline bool is_external_intr(u32 intr_info)
        return is_intr_type(intr_info, INTR_TYPE_EXT_INTR);
 }
 
+static inline bool is_exception_with_error_code(u32 intr_info)
+{
+       const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK;
+
+       return (intr_info & mask) == mask;
+}
+
 enum vmcs_field_width {
        VMCS_FIELD_WIDTH_U16 = 0,
        VMCS_FIELD_WIDTH_U64 = 1,
index 799db08..90ad7a6 100644 (file)
@@ -4,6 +4,7 @@
 #include <asm/bitsperlong.h>
 #include <asm/kvm_vcpu_regs.h>
 #include <asm/nospec-branch.h>
+#include <asm/segment.h>
 
 #define WORD_SIZE (BITS_PER_LONG / 8)
 
@@ -294,3 +295,36 @@ SYM_FUNC_START(vmread_error_trampoline)
 
        ret
 SYM_FUNC_END(vmread_error_trampoline)
+
+SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
+       /*
+        * Unconditionally create a stack frame.  Getting the correct RSP on
+        * the stack (for x86-64) would take two instructions anyway, and RBP
+        * can be used to restore RSP to make objtool happy (see below).
+        */
+       push %_ASM_BP
+       mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+       /*
+        * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+        * creating the synthetic interrupt stack frame for the IRQ/NMI.
+        */
+       and  $-16, %rsp
+       push $__KERNEL_DS
+       push %rbp
+#endif
+       pushf
+       push $__KERNEL_CS
+       CALL_NOSPEC _ASM_ARG1
+
+       /*
+        * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+        * the correct value.  objtool doesn't know the callee will IRET and,
+        * without the explicit restore, thinks the stack is getting walloped.
+        * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+        */
+       mov %_ASM_BP, %_ASM_SP
+       pop %_ASM_BP
+       ret
+SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
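
On x86-64 the stub above hand-builds the five-quadword frame the CPU would normally push for a hardware interrupt, so the handler reached via CALL_NOSPEC can finish with IRET even though it was invoked as an ordinary call. The struct below is only an illustration of that frame, lowest address first; the type and field names are invented, and only the layout is implied by the pushes in the assembly above.

/* Illustrative layout of the synthetic 64-bit interrupt frame. */
#include <stdint.h>
#include <stdio.h>

struct demo_irq_frame {
	uint64_t rip;		/* return address pushed by CALL_NOSPEC */
	uint64_t cs;		/* push $__KERNEL_CS */
	uint64_t rflags;	/* pushf */
	uint64_t rsp;		/* push %rbp (entry RSP saved in RBP) */
	uint64_t ss;		/* push $__KERNEL_DS */
};

int main(void)
{
	/* IRET pops these five fields in exactly this order. */
	printf("frame size: %zu bytes\n", sizeof(struct demo_irq_frame));
	return 0;
}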
index 819c185..4797ec9 100644 (file)
@@ -56,7 +56,6 @@
 #include "lapic.h"
 #include "mmu.h"
 #include "nested.h"
-#include "ops.h"
 #include "pmu.h"
 #include "trace.h"
 #include "vmcs.h"
@@ -146,8 +145,25 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
        RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
        RTIT_STATUS_BYTECNT))
 
-#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
-       (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
+/*
+ * List of MSRs that can be directly passed to the guest.
+ * In addition to these, x2APIC and PT MSRs are handled specially.
+ */
+static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
+       MSR_IA32_SPEC_CTRL,
+       MSR_IA32_PRED_CMD,
+       MSR_IA32_TSC,
+       MSR_FS_BASE,
+       MSR_GS_BASE,
+       MSR_KERNEL_GS_BASE,
+       MSR_IA32_SYSENTER_CS,
+       MSR_IA32_SYSENTER_ESP,
+       MSR_IA32_SYSENTER_EIP,
+       MSR_CORE_C1_RES,
+       MSR_CORE_C3_RESIDENCY,
+       MSR_CORE_C6_RESIDENCY,
+       MSR_CORE_C7_RESIDENCY,
+};
 
 /*
  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
@@ -341,9 +357,8 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = {
 };
 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 
-static bool guest_state_valid(struct kvm_vcpu *vcpu);
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
                                                          u32 msr, int type);
 
 void vmx_vmexit(void);
@@ -398,13 +413,6 @@ DEFINE_PER_CPU(struct vmcs *, current_vmcs);
  */
 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 
-/*
- * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
- * can find which vCPU should be waken up.
- */
-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
-
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);
 
@@ -447,9 +455,9 @@ static unsigned long host_idt_base;
  * will emulate SYSCALL in legacy mode if the vendor string in guest
  * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
  * support this emulation, IA32_STAR must always be included in
- * vmx_msr_index[], even in i386 builds.
+ * vmx_uret_msrs_list[], even in i386 builds.
  */
-const u32 vmx_msr_index[] = {
+static const u32 vmx_uret_msrs_list[] = {
 #ifdef CONFIG_X86_64
        MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
 #endif
@@ -623,36 +631,71 @@ static inline bool report_flexpriority(void)
        return flexpriority_enabled;
 }
 
-static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
+static int possible_passthrough_msr_slot(u32 msr)
+{
+       u32 i;
+
+       for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
+               if (vmx_possible_passthrough_msrs[i] == msr)
+                       return i;
+
+       return -ENOENT;
+}
+
+static bool is_valid_passthrough_msr(u32 msr)
+{
+       bool r;
+
+       switch (msr) {
+       case 0x800 ... 0x8ff:
+               /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
+               return true;
+       case MSR_IA32_RTIT_STATUS:
+       case MSR_IA32_RTIT_OUTPUT_BASE:
+       case MSR_IA32_RTIT_OUTPUT_MASK:
+       case MSR_IA32_RTIT_CR3_MATCH:
+       case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+               /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
+               return true;
+       }
+
+       r = possible_passthrough_msr_slot(msr) != -ENOENT;
+
+       WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
+
+       return r;
+}
+
+static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
 {
        int i;
 
-       for (i = 0; i < vmx->nmsrs; ++i)
-               if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
+       for (i = 0; i < vmx->nr_uret_msrs; ++i)
+               if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
                        return i;
        return -1;
 }
 
-struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
 {
        int i;
 
-       i = __find_msr_index(vmx, msr);
+       i = __vmx_find_uret_msr(vmx, msr);
        if (i >= 0)
-               return &vmx->guest_msrs[i];
+               return &vmx->guest_uret_msrs[i];
        return NULL;
 }
 
-static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
+static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
+                                 struct vmx_uret_msr *msr, u64 data)
 {
        int ret = 0;
 
        u64 old_msr_data = msr->data;
        msr->data = data;
-       if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
+       if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
                preempt_disable();
-               ret = kvm_set_shared_msr(msr->index, msr->data,
-                                        msr->mask);
+               ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
                preempt_enable();
                if (ret)
                        msr->data = old_msr_data;
@@ -825,7 +868,7 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
        vm_exit_controls_clearbit(vmx, exit);
 }
 
-int vmx_find_msr_index(struct vmx_msrs *m, u32 msr)
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
 {
        unsigned int i;
 
@@ -859,7 +902,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
                }
                break;
        }
-       i = vmx_find_msr_index(&m->guest, msr);
+       i = vmx_find_loadstore_msr_slot(&m->guest, msr);
        if (i < 0)
                goto skip_guest;
        --m->guest.nr;
@@ -867,7 +910,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
 
 skip_guest:
-       i = vmx_find_msr_index(&m->host, msr);
+       i = vmx_find_loadstore_msr_slot(&m->host, msr);
        if (i < 0)
                return;
 
@@ -926,12 +969,12 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
        }
 
-       i = vmx_find_msr_index(&m->guest, msr);
+       i = vmx_find_loadstore_msr_slot(&m->guest, msr);
        if (!entry_only)
-               j = vmx_find_msr_index(&m->host, msr);
+               j = vmx_find_loadstore_msr_slot(&m->host, msr);
 
-       if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) ||
-               (j < 0 &&  m->host.nr == NR_LOADSTORE_MSRS)) {
+       if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
+           (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
                printk_once(KERN_WARNING "Not enough msr switch entries. "
                                "Can't add msr %x\n", msr);
                return;
@@ -954,10 +997,11 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
        m->host.val[j].value = host_val;
 }
 
-static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
+static bool update_transition_efer(struct vcpu_vmx *vmx)
 {
        u64 guest_efer = vmx->vcpu.arch.efer;
        u64 ignore_bits = 0;
+       int i;
 
        /* Shadow paging assumes NX to be available.  */
        if (!enable_ept)
@@ -989,17 +1033,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
                else
                        clear_atomic_switch_msr(vmx, MSR_EFER);
                return false;
-       } else {
-               clear_atomic_switch_msr(vmx, MSR_EFER);
+       }
 
-               guest_efer &= ~ignore_bits;
-               guest_efer |= host_efer & ignore_bits;
+       i = __vmx_find_uret_msr(vmx, MSR_EFER);
+       if (i < 0)
+               return false;
 
-               vmx->guest_msrs[efer_offset].data = guest_efer;
-               vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+       clear_atomic_switch_msr(vmx, MSR_EFER);
 
-               return true;
-       }
+       guest_efer &= ~ignore_bits;
+       guest_efer |= host_efer & ignore_bits;
+
+       vmx->guest_uret_msrs[i].data = guest_efer;
+       vmx->guest_uret_msrs[i].mask = ~ignore_bits;
+
+       return true;
 }
 
 #ifdef CONFIG_X86_32
@@ -1037,6 +1085,12 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
               !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
 }
 
+static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
+{
+       /* The base must be 128-byte aligned and a legal physical address. */
+       return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f);
+}
+
 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
 {
        u32 i;
@@ -1141,12 +1195,12 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
         * when guest state is loaded. This happens when guest transitions
         * to/from long-mode by setting MSR_EFER.LMA.
         */
-       if (!vmx->guest_msrs_ready) {
-               vmx->guest_msrs_ready = true;
-               for (i = 0; i < vmx->save_nmsrs; ++i)
-                       kvm_set_shared_msr(vmx->guest_msrs[i].index,
-                                          vmx->guest_msrs[i].data,
-                                          vmx->guest_msrs[i].mask);
+       if (!vmx->guest_uret_msrs_loaded) {
+               vmx->guest_uret_msrs_loaded = true;
+               for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
+                       kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
+                                               vmx->guest_uret_msrs[i].data,
+                                               vmx->guest_uret_msrs[i].mask);
 
        }
 
@@ -1230,7 +1284,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
 #endif
        load_fixmap_gdt(raw_smp_processor_id());
        vmx->guest_state_loaded = false;
-       vmx->guest_msrs_ready = false;
+       vmx->guest_uret_msrs_loaded = false;
 }
 
 #ifdef CONFIG_X86_64
@@ -1253,62 +1307,6 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
 }
 #endif
 
-static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
-{
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-       struct pi_desc old, new;
-       unsigned int dest;
-
-       /*
-        * In case of hot-plug or hot-unplug, we may have to undo
-        * vmx_vcpu_pi_put even if there is no assigned device.  And we
-        * always keep PI.NDST up to date for simplicity: it makes the
-        * code easier, and CPU migration is not a fast path.
-        */
-       if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
-               return;
-
-       /*
-        * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
-        * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
-        * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
-        * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
-        * correctly.
-        */
-       if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
-               pi_clear_sn(pi_desc);
-               goto after_clear_sn;
-       }
-
-       /* The full case.  */
-       do {
-               old.control = new.control = pi_desc->control;
-
-               dest = cpu_physical_id(cpu);
-
-               if (x2apic_enabled())
-                       new.ndst = dest;
-               else
-                       new.ndst = (dest << 8) & 0xFF00;
-
-               new.sn = 0;
-       } while (cmpxchg64(&pi_desc->control, old.control,
-                          new.control) != old.control);
-
-after_clear_sn:
-
-       /*
-        * Clear SN before reading the bitmap.  The VT-d firmware
-        * writes the bitmap and reads SN atomically (5.2.3 in the
-        * spec), so it doesn't really have a memory barrier that
-        * pairs with this, but we cannot do that and we need one.
-        */
-       smp_mb__after_atomic();
-
-       if (!pi_is_pir_empty(pi_desc))
-               pi_set_on(pi_desc);
-}
-
 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
                        struct loaded_vmcs *buddy)
 {
@@ -1392,20 +1390,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        vmx->host_debugctlmsr = get_debugctlmsr();
 }
 
-static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
-{
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
-               !kvm_vcpu_apicv_active(vcpu))
-               return;
-
-       /* Set SN when the vCPU is preempted */
-       if (vcpu->preempted)
-               pi_set_sn(pi_desc);
-}
-
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
        vmx_vcpu_pi_put(vcpu);
@@ -1415,7 +1399,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 
 static bool emulation_required(struct kvm_vcpu *vcpu)
 {
-       return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+       return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
 }
 
 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -1441,7 +1425,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long old_rflags;
 
-       if (enable_unrestricted_guest) {
+       if (is_unrestricted_guest(vcpu)) {
                kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
                vmx->rflags = rflags;
                vmcs_writel(GUEST_RFLAGS, rflags);
@@ -1561,6 +1545,11 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
        return 0;
 }
 
+static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+{
+       return true;
+}
+
 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
        unsigned long rip, orig_rip;
@@ -1598,33 +1587,6 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-/*
- * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
- * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
- * indicates whether exit to userspace is needed.
- */
-int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
-                             struct x86_exception *e)
-{
-       if (r == X86EMUL_PROPAGATE_FAULT) {
-               kvm_inject_emulated_page_fault(vcpu, e);
-               return 1;
-       }
-
-       /*
-        * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
-        * while handling a VMX instruction KVM could've handled the request
-        * correctly by exiting to userspace and performing I/O but there
-        * doesn't seem to be a real use-case behind such requests, just return
-        * KVM_EXIT_INTERNAL_ERROR for now.
-        */
-       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-       vcpu->run->internal.ndata = 0;
-
-       return 0;
-}
-
 /*
  * Recognizes a pending MTF VM-exit and records the nested state for later
  * delivery.
@@ -1708,16 +1670,19 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
        vmx_clear_hlt(vcpu);
 }
 
-/*
- * Swap MSR entry in host/guest MSR entry array.
- */
-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
 {
-       struct shared_msr_entry tmp;
+       struct vmx_uret_msr tmp;
+       int from, to;
 
-       tmp = vmx->guest_msrs[to];
-       vmx->guest_msrs[to] = vmx->guest_msrs[from];
-       vmx->guest_msrs[from] = tmp;
+       from = __vmx_find_uret_msr(vmx, msr);
+       if (from < 0)
+               return;
+       to = vmx->nr_active_uret_msrs++;
+
+       tmp = vmx->guest_uret_msrs[to];
+       vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
+       vmx->guest_uret_msrs[from] = tmp;
 }
 
 /*
@@ -1727,38 +1692,26 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
  */
 static void setup_msrs(struct vcpu_vmx *vmx)
 {
-       int save_nmsrs, index;
-
-       save_nmsrs = 0;
+       vmx->guest_uret_msrs_loaded = false;
+       vmx->nr_active_uret_msrs = 0;
 #ifdef CONFIG_X86_64
        /*
         * The SYSCALL MSRs are only needed on long mode guests, and only
         * when EFER.SCE is set.
         */
        if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
-               index = __find_msr_index(vmx, MSR_STAR);
-               if (index >= 0)
-                       move_msr_up(vmx, index, save_nmsrs++);
-               index = __find_msr_index(vmx, MSR_LSTAR);
-               if (index >= 0)
-                       move_msr_up(vmx, index, save_nmsrs++);
-               index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
-               if (index >= 0)
-                       move_msr_up(vmx, index, save_nmsrs++);
+               vmx_setup_uret_msr(vmx, MSR_STAR);
+               vmx_setup_uret_msr(vmx, MSR_LSTAR);
+               vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
        }
 #endif
-       index = __find_msr_index(vmx, MSR_EFER);
-       if (index >= 0 && update_transition_efer(vmx, index))
-               move_msr_up(vmx, index, save_nmsrs++);
-       index = __find_msr_index(vmx, MSR_TSC_AUX);
-       if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
-               move_msr_up(vmx, index, save_nmsrs++);
-       index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
-       if (index >= 0)
-               move_msr_up(vmx, index, save_nmsrs++);
-
-       vmx->save_nmsrs = save_nmsrs;
-       vmx->guest_msrs_ready = false;
+       if (update_transition_efer(vmx))
+               vmx_setup_uret_msr(vmx, MSR_EFER);
+
+       if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
+               vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
+
+       vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
 
        if (cpu_has_vmx_msr_bitmap())
                vmx_update_msr_bitmap(&vmx->vcpu);
@@ -1828,7 +1781,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct shared_msr_entry *msr;
+       struct vmx_uret_msr *msr;
        u32 index;
 
        switch (msr_info->index) {
@@ -1849,7 +1802,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (!msr_info->host_initiated &&
                    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
                        return 1;
-               goto find_shared_msr;
+               goto find_uret_msr;
        case MSR_IA32_UMWAIT_CONTROL:
                if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
                        return 1;
@@ -1956,10 +1909,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
                        return 1;
-               goto find_shared_msr;
+               goto find_uret_msr;
        default:
-       find_shared_msr:
-               msr = find_msr_entry(vmx, msr_info->index);
+       find_uret_msr:
+               msr = vmx_find_uret_msr(vmx, msr_info->index);
                if (msr) {
                        msr_info->data = msr->data;
                        break;
@@ -1988,7 +1941,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct shared_msr_entry *msr;
+       struct vmx_uret_msr *msr;
        int ret = 0;
        u32 msr_index = msr_info->index;
        u64 data = msr_info->data;
@@ -2082,7 +2035,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 * in the merging. We update the vmcs01 here for L1 as well
                 * since it will end up touching the MSR anyway now.
                 */
-               vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
+               vmx_disable_intercept_for_msr(vcpu,
                                              MSR_IA32_SPEC_CTRL,
                                              MSR_TYPE_RW);
                break;
@@ -2092,7 +2045,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
                        return 1;
-               goto find_shared_msr;
+               goto find_uret_msr;
        case MSR_IA32_PRED_CMD:
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -2118,8 +2071,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 * vmcs02.msr_bitmap here since it gets completely overwritten
                 * in the merging.
                 */
-               vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
-                                             MSR_TYPE_W);
+               vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
                break;
        case MSR_IA32_CR_PAT:
                if (!kvm_pat_valid(data))
@@ -2169,7 +2121,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                vmcs_write64(GUEST_IA32_RTIT_CTL, data);
                vmx->pt_desc.guest.ctl = data;
-               pt_update_intercept_for_msr(vmx);
+               pt_update_intercept_for_msr(vcpu);
                break;
        case MSR_IA32_RTIT_STATUS:
                if (!pt_can_write_msr(vmx))
@@ -2194,7 +2146,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                    !intel_pt_validate_cap(vmx->pt_desc.caps,
                                           PT_CAP_single_range_output))
                        return 1;
-               if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)
+               if (!pt_output_base_valid(vcpu, data))
                        return 1;
                vmx->pt_desc.guest.output_base = data;
                break;
@@ -2229,13 +2181,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                /* Check reserved bit, higher 32 bits should be zero */
                if ((data >> 32) != 0)
                        return 1;
-               goto find_shared_msr;
+               goto find_uret_msr;
 
        default:
-       find_shared_msr:
-               msr = find_msr_entry(vmx, msr_index);
+       find_uret_msr:
+               msr = vmx_find_uret_msr(vmx, msr_index);
                if (msr)
-                       ret = vmx_set_guest_msr(vmx, msr, data);
+                       ret = vmx_set_guest_uret_msr(vmx, msr, data);
                else
                        ret = kvm_set_msr_common(vcpu, msr_info);
        }
@@ -2267,7 +2219,8 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
                vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
                break;
        case VCPU_EXREG_CR3:
-               if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
+               if (is_unrestricted_guest(vcpu) ||
+                   (enable_ept && is_paging(vcpu)))
                        vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
                break;
        case VCPU_EXREG_CR4:
@@ -2448,7 +2401,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                        SECONDARY_EXEC_UNRESTRICTED_GUEST |
                        SECONDARY_EXEC_PAUSE_LOOP_EXITING |
                        SECONDARY_EXEC_DESC |
-                       SECONDARY_EXEC_RDTSCP |
+                       SECONDARY_EXEC_ENABLE_RDTSCP |
                        SECONDARY_EXEC_ENABLE_INVPCID |
                        SECONDARY_EXEC_APIC_REGISTER_VIRT |
                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -2865,7 +2818,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+       struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER);
 
        if (!msr)
                return;
@@ -2971,7 +2924,7 @@ static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
        vpid_sync_context(to_vmx(vcpu)->vpid);
 }
 
-static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
 
@@ -3033,7 +2986,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        unsigned long hw_cr0;
 
        hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
-       if (enable_unrestricted_guest)
+       if (is_unrestricted_guest(vcpu))
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
        else {
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
@@ -3054,7 +3007,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        }
 #endif
 
-       if (enable_ept && !enable_unrestricted_guest)
+       if (enable_ept && !is_unrestricted_guest(vcpu))
                ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
 
        vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -3114,7 +3067,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
                        guest_cr3 = vcpu->arch.cr3;
                else /* vmcs01.GUEST_CR3 is already up-to-date. */
                        update_guest_cr3 = false;
-               ept_load_pdptrs(vcpu);
+               vmx_ept_load_pdptrs(vcpu);
        } else {
                guest_cr3 = pgd;
        }
@@ -3134,7 +3087,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        unsigned long hw_cr4;
 
        hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
-       if (enable_unrestricted_guest)
+       if (is_unrestricted_guest(vcpu))
                hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
        else if (vmx->rmode.vm86_active)
                hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
@@ -3169,7 +3122,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        vcpu->arch.cr4 = cr4;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
 
-       if (!enable_unrestricted_guest) {
+       if (!is_unrestricted_guest(vcpu)) {
                if (enable_ept) {
                        if (!is_paging(vcpu)) {
                                hw_cr4 &= ~X86_CR4_PAE;
@@ -3309,7 +3262,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
         * tree. Newer qemu binaries with that qemu fix would not need this
         * kvm hack.
         */
-       if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+       if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
                var->type |= 0x1; /* Accessed */
 
        vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
@@ -3498,11 +3451,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
  * not.
  * We assume that registers are always usable
  */
-static bool guest_state_valid(struct kvm_vcpu *vcpu)
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
 {
-       if (enable_unrestricted_guest)
-               return true;
-
        /* real mode guest state checks */
        if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
                if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
@@ -3688,11 +3638,52 @@ void free_vpid(int vpid)
        spin_unlock(&vmx_vpid_lock);
 }
 
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
-                                                         u32 msr, int type)
+static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
 {
        int f = sizeof(unsigned long);
 
+       if (msr <= 0x1fff)
+               __clear_bit(msr, msr_bitmap + 0x000 / f);
+       else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+               __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+}
+
+static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
+{
+       int f = sizeof(unsigned long);
+
+       if (msr <= 0x1fff)
+               __clear_bit(msr, msr_bitmap + 0x800 / f);
+       else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+               __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+}
+
+static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+{
+       int f = sizeof(unsigned long);
+
+       if (msr <= 0x1fff)
+               __set_bit(msr, msr_bitmap + 0x000 / f);
+       else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+               __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+}
+
+static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
+{
+       int f = sizeof(unsigned long);
+
+       if (msr <= 0x1fff)
+               __set_bit(msr, msr_bitmap + 0x800 / f);
+       else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+               __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+}
+
+static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+                                                         u32 msr, int type)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+
        if (!cpu_has_vmx_msr_bitmap())
                return;
 
@@ -3700,36 +3691,44 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit
                evmcs_touch_msr_bitmap();
 
        /*
-        * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
-        * have the write-low and read-high bitmap offsets the wrong way round.
-        * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
-        */
-       if (msr <= 0x1fff) {
-               if (type & MSR_TYPE_R)
-                       /* read-low */
-                       __clear_bit(msr, msr_bitmap + 0x000 / f);
+        * Mark the desired intercept state in the shadow bitmap; this is
+        * needed for resync when the MSR filters change.
+        */
+       if (is_valid_passthrough_msr(msr)) {
+               int idx = possible_passthrough_msr_slot(msr);
+
+               if (idx != -ENOENT) {
+                       if (type & MSR_TYPE_R)
+                               clear_bit(idx, vmx->shadow_msr_intercept.read);
+                       if (type & MSR_TYPE_W)
+                               clear_bit(idx, vmx->shadow_msr_intercept.write);
+               }
+       }
 
-               if (type & MSR_TYPE_W)
-                       /* write-low */
-                       __clear_bit(msr, msr_bitmap + 0x800 / f);
+       if ((type & MSR_TYPE_R) &&
+           !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
+               vmx_set_msr_bitmap_read(msr_bitmap, msr);
+               type &= ~MSR_TYPE_R;
+       }
 
-       } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
-               msr &= 0x1fff;
-               if (type & MSR_TYPE_R)
-                       /* read-high */
-                       __clear_bit(msr, msr_bitmap + 0x400 / f);
+       if ((type & MSR_TYPE_W) &&
+           !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
+               vmx_set_msr_bitmap_write(msr_bitmap, msr);
+               type &= ~MSR_TYPE_W;
+       }
 
-               if (type & MSR_TYPE_W)
-                       /* write-high */
-                       __clear_bit(msr, msr_bitmap + 0xc00 / f);
+       if (type & MSR_TYPE_R)
+               vmx_clear_msr_bitmap_read(msr_bitmap, msr);
 
-       }
+       if (type & MSR_TYPE_W)
+               vmx_clear_msr_bitmap_write(msr_bitmap, msr);
 }
 
-static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
                                                         u32 msr, int type)
 {
-       int f = sizeof(unsigned long);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
 
        if (!cpu_has_vmx_msr_bitmap())
                return;
@@ -3738,39 +3737,34 @@ static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitm
                evmcs_touch_msr_bitmap();
 
        /*
-        * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
-        * have the write-low and read-high bitmap offsets the wrong way round.
-        * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
-        */
-       if (msr <= 0x1fff) {
-               if (type & MSR_TYPE_R)
-                       /* read-low */
-                       __set_bit(msr, msr_bitmap + 0x000 / f);
-
-               if (type & MSR_TYPE_W)
-                       /* write-low */
-                       __set_bit(msr, msr_bitmap + 0x800 / f);
-
-       } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
-               msr &= 0x1fff;
-               if (type & MSR_TYPE_R)
-                       /* read-high */
-                       __set_bit(msr, msr_bitmap + 0x400 / f);
+        * Mark the desired intercept state in the shadow bitmap; this is
+        * needed for resync when the MSR filter changes.
+        */
+       if (is_valid_passthrough_msr(msr)) {
+               int idx = possible_passthrough_msr_slot(msr);
+
+               if (idx != -ENOENT) {
+                       if (type & MSR_TYPE_R)
+                               set_bit(idx, vmx->shadow_msr_intercept.read);
+                       if (type & MSR_TYPE_W)
+                               set_bit(idx, vmx->shadow_msr_intercept.write);
+               }
+       }
 
-               if (type & MSR_TYPE_W)
-                       /* write-high */
-                       __set_bit(msr, msr_bitmap + 0xc00 / f);
+       if (type & MSR_TYPE_R)
+               vmx_set_msr_bitmap_read(msr_bitmap, msr);
 
-       }
+       if (type & MSR_TYPE_W)
+               vmx_set_msr_bitmap_write(msr_bitmap, msr);
 }
 
-static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
-                                                     u32 msr, int type, bool value)
+static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+                                                     u32 msr, int type, bool value)
 {
        if (value)
-               vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+               vmx_enable_intercept_for_msr(vcpu, msr, type);
        else
-               vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+               vmx_disable_intercept_for_msr(vcpu, msr, type);
 }
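possible_passthrough_msr_slot() and vmx_possible_passthrough_msrs[] are introduced elsewhere in this series. Judging only from how they are used above (a slot index on success, -ENOENT otherwise, with the slot doubling as the bit position in the shadow_msr_intercept bitmaps), the lookup is plausibly a linear scan along these lines (sketch, not the in-tree code):

static int possible_passthrough_msr_slot_sketch(u32 msr)
{
        u32 i;

        /* The MSR's index in the passthrough list is its shadow-bitmap bit. */
        for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
                if (vmx_possible_passthrough_msrs[i] == msr)
                        return i;
        }

        return -ENOENT;
}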
 
 static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
@@ -3788,15 +3782,15 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
        return mode;
 }
 
-static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
-                                        u8 mode)
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
 {
        int msr;
 
-       for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
-               unsigned word = msr / BITS_PER_LONG;
-               msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
-               msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+       for (msr = 0x800; msr <= 0x8ff; msr++) {
+               bool apicv = !!(mode & MSR_BITMAP_MODE_X2APIC_APICV);
+
+               vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, !apicv);
+               vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, true);
        }
 
        if (mode & MSR_BITMAP_MODE_X2APIC) {
@@ -3804,11 +3798,11 @@ static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
                 * TPR reads and writes can be virtualized even if virtual interrupt
                 * delivery is not in use.
                 */
-               vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
+               vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
                if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
-                       vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
-                       vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
-                       vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+                       vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
+                       vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+                       vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
                }
        }
 }
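As a worked example of the X2APIC_MSR() mapping used in the loop above (the macro appears in vmx.h later in this diff): APIC_TASKPRI is MMIO offset 0x80, so its x2APIC MSR is 0x800 + (0x80 >> 4) = 0x808, which falls inside the 0x800-0x8ff range being walked. A build-time check of that arithmetic would look like the line below (illustrative only, not part of the patch):

static_assert(X2APIC_MSR(APIC_TASKPRI) == 0x808);	/* x2APIC TPR MSR */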
@@ -3816,7 +3810,6 @@ static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
        u8 mode = vmx_msr_bitmap_mode(vcpu);
        u8 changed = mode ^ vmx->msr_bitmap_mode;
 
@@ -3824,30 +3817,24 @@ void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
                return;
 
        if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
-               vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
+               vmx_update_msr_bitmap_x2apic(vcpu, mode);
 
        vmx->msr_bitmap_mode = mode;
 }
 
-void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
 {
-       unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
        u32 i;
 
-       vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
-                                                       MSR_TYPE_RW, flag);
-       vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
-                                                       MSR_TYPE_RW, flag);
-       vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
-                                                       MSR_TYPE_RW, flag);
-       vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
-                                                       MSR_TYPE_RW, flag);
+       vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
+       vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
+       vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
+       vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
        for (i = 0; i < vmx->pt_desc.addr_range; i++) {
-               vmx_set_intercept_for_msr(msr_bitmap,
-                       MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
-               vmx_set_intercept_for_msr(msr_bitmap,
-                       MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
+               vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
+               vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
        }
 }
 
@@ -3871,6 +3858,29 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
        return ((rvi & 0xf0) > (vppr & 0xf0));
 }
 
+static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 i;
+
+       /*
+        * Set intercept permissions again for all potentially passed-through
+        * MSRs.  They will automatically get filtered through the MSR filter,
+        * so we are back in sync after this.
+        */
+       for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
+               u32 msr = vmx_possible_passthrough_msrs[i];
+               bool read = test_bit(i, vmx->shadow_msr_intercept.read);
+               bool write = test_bit(i, vmx->shadow_msr_intercept.write);
+
+               vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);
+               vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);
+       }
+
+       pt_update_intercept_for_msr(vcpu);
+       vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu));
+}
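vmx_msr_filter_changed() is exported through the new .msr_filter_changed hook at the bottom of this file; how the hook gets driven is not part of this hunk. A hedged sketch of the expected call site, assuming a KVM_REQ_MSR_FILTER_CHANGED-style vCPU request raised by the common MSR-filter ioctl (the request name and placement are assumptions):

/* Sketch only: the real request name and call site live in x86 common code. */
static void service_msr_filter_change_sketch(struct kvm_vcpu *vcpu)
{
        if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
                kvm_x86_ops.msr_filter_changed(vcpu);   /* -> vmx_msr_filter_changed() */
}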
+
 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
                                                     bool nested)
 {
@@ -4099,6 +4109,61 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx)
        return exec_control;
 }
 
+/*
+ * Adjust a single secondary execution control bit to intercept/allow an
+ * instruction in the guest.  This is usually done based on whether or not a
+ * feature has been exposed to the guest in order to correctly emulate faults.
+ */
+static inline void
+vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
+                                 u32 control, bool enabled, bool exiting)
+{
+       /*
+        * If the control is for an opt-in feature, clear the control if the
+        * feature is not exposed to the guest, i.e. not enabled.  If the
+        * control is opt-out, i.e. an exiting control, clear the control if
+        * the feature _is_ exposed to the guest, i.e. exiting/interception is
+        * disabled for the associated instruction.  Note, the caller is
+        * responsible for presetting exec_control to set all supported bits.
+        */
+       if (enabled == exiting)
+               *exec_control &= ~control;
+
+       /*
+        * Update the nested MSR settings so that a nested VMM can/can't set
+        * controls for features that are/aren't exposed to the guest.
+        */
+       if (nested) {
+               if (enabled)
+                       vmx->nested.msrs.secondary_ctls_high |= control;
+               else
+                       vmx->nested.msrs.secondary_ctls_high &= ~control;
+       }
+}
+
+/*
+ * Wrapper macro for the common case of adjusting a secondary execution control
+ * based on a single guest CPUID bit, with a dedicated feature bit.  This also
+ * verifies that the control is actually supported by KVM and hardware.
+ */
+#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
+({                                                                      \
+       bool __enabled;                                                  \
+                                                                        \
+       if (cpu_has_vmx_##name()) {                                      \
+               __enabled = guest_cpuid_has(&(vmx)->vcpu,                \
+                                           X86_FEATURE_##feat_name);    \
+               vmx_adjust_secondary_exec_control(vmx, exec_control,     \
+                       SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
+       }                                                                \
+})
+
+/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
+#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
+       vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
+
+#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
+       vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
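Expanding one opt-in user by hand shows what the macros above generate: the vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP) call added further down becomes roughly the following (using the SECONDARY_EXEC_ENABLE_RDTSCP name this series switches to):

if (cpu_has_vmx_rdtscp()) {
        bool __enabled = guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP);

        vmx_adjust_secondary_exec_control(vmx, &exec_control,
                                          SECONDARY_EXEC_ENABLE_RDTSCP,
                                          __enabled, false);
}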
 
 static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 {
@@ -4139,7 +4204,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        if (!enable_pml)
                exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 
-       if (vmx_xsaves_supported()) {
+       if (cpu_has_vmx_xsaves()) {
                /* Exposing XSAVES only when XSAVE is exposed */
                bool xsaves_enabled =
                        boot_cpu_has(X86_FEATURE_XSAVE) &&
@@ -4148,101 +4213,29 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 
                vcpu->arch.xsaves_enabled = xsaves_enabled;
 
-               if (!xsaves_enabled)
-                       exec_control &= ~SECONDARY_EXEC_XSAVES;
-
-               if (nested) {
-                       if (xsaves_enabled)
-                               vmx->nested.msrs.secondary_ctls_high |=
-                                       SECONDARY_EXEC_XSAVES;
-                       else
-                               vmx->nested.msrs.secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_XSAVES;
-               }
-       }
-
-       if (cpu_has_vmx_rdtscp()) {
-               bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
-               if (!rdtscp_enabled)
-                       exec_control &= ~SECONDARY_EXEC_RDTSCP;
-
-               if (nested) {
-                       if (rdtscp_enabled)
-                               vmx->nested.msrs.secondary_ctls_high |=
-                                       SECONDARY_EXEC_RDTSCP;
-                       else
-                               vmx->nested.msrs.secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_RDTSCP;
-               }
-       }
-
-       if (cpu_has_vmx_invpcid()) {
-               /* Exposing INVPCID only when PCID is exposed */
-               bool invpcid_enabled =
-                       guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
-                       guest_cpuid_has(vcpu, X86_FEATURE_PCID);
-
-               if (!invpcid_enabled) {
-                       exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
-                       guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
-               }
-
-               if (nested) {
-                       if (invpcid_enabled)
-                               vmx->nested.msrs.secondary_ctls_high |=
-                                       SECONDARY_EXEC_ENABLE_INVPCID;
-                       else
-                               vmx->nested.msrs.secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_ENABLE_INVPCID;
-               }
+               vmx_adjust_secondary_exec_control(vmx, &exec_control,
+                                                 SECONDARY_EXEC_XSAVES,
+                                                 xsaves_enabled, false);
        }
 
-       if (vmx_rdrand_supported()) {
-               bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
-               if (rdrand_enabled)
-                       exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
+       vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
 
-               if (nested) {
-                       if (rdrand_enabled)
-                               vmx->nested.msrs.secondary_ctls_high |=
-                                       SECONDARY_EXEC_RDRAND_EXITING;
-                       else
-                               vmx->nested.msrs.secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_RDRAND_EXITING;
-               }
-       }
+       /*
+        * Expose INVPCID if and only if PCID is also exposed to the guest.
+        * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
+        * if CR4.PCIDE=0.  Enumerating CPUID.INVPCID=1 would lead to incorrect
+        * behavior from the guest perspective (it would expect #GP or #PF).
+        */
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
+               guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
+       vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
 
-       if (vmx_rdseed_supported()) {
-               bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
-               if (rdseed_enabled)
-                       exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
 
-               if (nested) {
-                       if (rdseed_enabled)
-                               vmx->nested.msrs.secondary_ctls_high |=
-                                       SECONDARY_EXEC_RDSEED_EXITING;
-                       else
-                               vmx->nested.msrs.secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_RDSEED_EXITING;
-               }
-       }
-
-       if (vmx_waitpkg_supported()) {
-               bool waitpkg_enabled =
-                       guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG);
+       vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
+       vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
 
-               if (!waitpkg_enabled)
-                       exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
-
-               if (nested) {
-                       if (waitpkg_enabled)
-                               vmx->nested.msrs.secondary_ctls_high |=
-                                       SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
-                       else
-                               vmx->nested.msrs.secondary_ctls_high &=
-                                       ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
-               }
-       }
+       vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
+                                   ENABLE_USR_WAIT_PAUSE, false);
 
        vmx->secondary_exec_control = exec_control;
 }
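The opt-out variant flips the polarity: vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND) above expands to roughly the block below, so when the guest is given RDRAND (enabled == exiting == true) the _EXITING control is cleared and the instruction is not intercepted, matching the removed open-coded logic:

if (cpu_has_vmx_rdrand()) {
        bool __enabled = guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDRAND);

        vmx_adjust_secondary_exec_control(vmx, &exec_control,
                                          SECONDARY_EXEC_RDRAND_EXITING,
                                          __enabled, true);
}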
@@ -4335,7 +4328,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
        if (vmx->vpid != 0)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
-       if (vmx_xsaves_supported())
+       if (cpu_has_vmx_xsaves())
                vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
 
        if (enable_pml) {
@@ -5148,7 +5141,8 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
 
 static int handle_invd(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_instruction(vcpu, 0);
+       /* Treat an INVD instruction as a NOP and just skip it. */
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
 static int handle_invlpg(struct kvm_vcpu *vcpu)
@@ -5331,7 +5325,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
         * would also use advanced VM-exit information for EPT violations to
         * reconstruct the page fault error code.
         */
-       if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+       if (unlikely(kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
                return kvm_emulate_instruction(vcpu, 0);
 
        return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@ -5442,25 +5436,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
        }
 }
 
-/*
- * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
- */
-static void wakeup_handler(void)
-{
-       struct kvm_vcpu *vcpu;
-       int cpu = smp_processor_id();
-
-       spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-       list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
-                       blocked_vcpu_list) {
-               struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-               if (pi_test_on(pi_desc) == 1)
-                       kvm_vcpu_kick(vcpu);
-       }
-       spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-}
-
 static void vmx_enable_tdp(void)
 {
        kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -5524,16 +5499,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
 {
        u32 vmx_instruction_info;
        unsigned long type;
-       bool pcid_enabled;
        gva_t gva;
-       struct x86_exception e;
-       unsigned i;
-       unsigned long roots_to_free = 0;
        struct {
                u64 pcid;
                u64 gla;
        } operand;
-       int r;
 
        if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
                kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5556,68 +5526,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
                                sizeof(operand), &gva))
                return 1;
 
-       r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
-       if (r != X86EMUL_CONTINUE)
-               return vmx_handle_memory_failure(vcpu, r, &e);
-
-       if (operand.pcid >> 12 != 0) {
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
-
-       pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
-
-       switch (type) {
-       case INVPCID_TYPE_INDIV_ADDR:
-               if ((!pcid_enabled && (operand.pcid != 0)) ||
-                   is_noncanonical_address(operand.gla, vcpu)) {
-                       kvm_inject_gp(vcpu, 0);
-                       return 1;
-               }
-               kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
-               return kvm_skip_emulated_instruction(vcpu);
-
-       case INVPCID_TYPE_SINGLE_CTXT:
-               if (!pcid_enabled && (operand.pcid != 0)) {
-                       kvm_inject_gp(vcpu, 0);
-                       return 1;
-               }
-
-               if (kvm_get_active_pcid(vcpu) == operand.pcid) {
-                       kvm_mmu_sync_roots(vcpu);
-                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-               }
-
-               for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
-                           == operand.pcid)
-                               roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
-               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
-               /*
-                * If neither the current cr3 nor any of the prev_roots use the
-                * given PCID, then nothing needs to be done here because a
-                * resync will happen anyway before switching to any other CR3.
-                */
-
-               return kvm_skip_emulated_instruction(vcpu);
-
-       case INVPCID_TYPE_ALL_NON_GLOBAL:
-               /*
-                * Currently, KVM doesn't mark global entries in the shadow
-                * page tables, so a non-global flush just degenerates to a
-                * global flush. If needed, we could optimize this later by
-                * keeping track of global entries in shadow page tables.
-                */
-
-               fallthrough;
-       case INVPCID_TYPE_ALL_INCL_GLOBAL:
-               kvm_mmu_unload(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
-
-       default:
-               BUG(); /* We have already checked above that type <= 3 */
-       }
+       return kvm_handle_invpcid(vcpu, type, gva);
 }
 
 static int handle_pml_full(struct kvm_vcpu *vcpu)
@@ -5746,10 +5655,24 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 static const int kvm_vmx_max_exit_handlers =
        ARRAY_SIZE(kvm_vmx_exit_handlers);
 
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+                             u32 *intr_info, u32 *error_code)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
        *info1 = vmx_get_exit_qual(vcpu);
-       *info2 = vmx_get_intr_info(vcpu);
+       if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+               *info2 = vmx->idt_vectoring_info;
+               *intr_info = vmx_get_intr_info(vcpu);
+               if (is_exception_with_error_code(*intr_info))
+                       *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+               else
+                       *error_code = 0;
+       } else {
+               *info2 = 0;
+               *intr_info = 0;
+               *error_code = 0;
+       }
 }
 
 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
@@ -6054,6 +5977,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
                        (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
                        exit_reason != EXIT_REASON_EPT_VIOLATION &&
                        exit_reason != EXIT_REASON_PML_FULL &&
+                       exit_reason != EXIT_REASON_APIC_ACCESS &&
                        exit_reason != EXIT_REASON_TASK_SWITCH)) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
@@ -6382,14 +6306,6 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
        return max_irr;
 }
 
-static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-       return pi_test_on(pi_desc) ||
-               (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
-}
-
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
        if (!kvm_vcpu_apicv_active(vcpu))
@@ -6409,70 +6325,43 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
        memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
 }
 
+void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
+
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+{
+       unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+       gate_desc *desc = (gate_desc *)host_idt_base + vector;
+
+       kvm_before_interrupt(vcpu);
+       vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+       kvm_after_interrupt(vcpu);
+}
+
 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
 {
        u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
 
        /* if exit due to PF check for async PF */
-       if (is_page_fault(intr_info)) {
+       if (is_page_fault(intr_info))
                vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
        /* Handle machine checks before interrupts are enabled */
-       } else if (is_machine_check(intr_info)) {
+       else if (is_machine_check(intr_info))
                kvm_machine_check();
        /* We need to handle NMIs before interrupts are enabled */
-       } else if (is_nmi(intr_info)) {
-               kvm_before_interrupt(&vmx->vcpu);
-               asm("int $2");
-               kvm_after_interrupt(&vmx->vcpu);
-       }
+       else if (is_nmi(intr_info))
+               handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
 }
 
 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 {
-       unsigned int vector;
-       unsigned long entry;
-#ifdef CONFIG_X86_64
-       unsigned long tmp;
-#endif
-       gate_desc *desc;
        u32 intr_info = vmx_get_intr_info(vcpu);
 
        if (WARN_ONCE(!is_external_intr(intr_info),
            "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
                return;
 
-       vector = intr_info & INTR_INFO_VECTOR_MASK;
-       desc = (gate_desc *)host_idt_base + vector;
-       entry = gate_offset(desc);
-
-       kvm_before_interrupt(vcpu);
-
-       asm volatile(
-#ifdef CONFIG_X86_64
-               "mov %%rsp, %[sp]\n\t"
-               "and $-16, %%rsp\n\t"
-               "push %[ss]\n\t"
-               "push %[sp]\n\t"
-#endif
-               "pushf\n\t"
-               "push %[cs]\n\t"
-               CALL_NOSPEC
-               :
-#ifdef CONFIG_X86_64
-               [sp]"=&r"(tmp),
-#endif
-               ASM_CALL_CONSTRAINT
-               :
-               [thunk_target]"r"(entry),
-#ifdef CONFIG_X86_64
-               [ss]"i"(__KERNEL_DS),
-#endif
-               [cs]"i"(__KERNEL_CS)
-       );
-
-       kvm_after_interrupt(vcpu);
+       handle_interrupt_nmi_irqoff(vcpu, intr_info);
 }
-STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
 
 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
@@ -6799,9 +6688,7 @@ reenter_guest:
        if (enable_preemption_timer)
                vmx_update_hv_timer(vcpu);
 
-       if (lapic_in_kernel(vcpu) &&
-               vcpu->arch.apic->lapic_timer.timer_advance_ns)
-               kvm_wait_lapic_expire(vcpu);
+       kvm_wait_lapic_expire(vcpu);
 
        /*
         * If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -6945,20 +6832,20 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
                        goto free_vpid;
        }
 
-       BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS);
+       BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
 
-       for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
-               u32 index = vmx_msr_index[i];
+       for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
+               u32 index = vmx_uret_msrs_list[i];
                u32 data_low, data_high;
-               int j = vmx->nmsrs;
+               int j = vmx->nr_uret_msrs;
 
                if (rdmsr_safe(index, &data_low, &data_high) < 0)
                        continue;
                if (wrmsr_safe(index, data_low, data_high) < 0)
                        continue;
 
-               vmx->guest_msrs[j].index = i;
-               vmx->guest_msrs[j].data = 0;
+               vmx->guest_uret_msrs[j].slot = i;
+               vmx->guest_uret_msrs[j].data = 0;
                switch (index) {
                case MSR_IA32_TSX_CTRL:
                        /*
@@ -6966,32 +6853,36 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
                         * let's avoid changing CPUID bits under the host
                         * kernel's feet.
                         */
-                       vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+                       vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
                        break;
                default:
-                       vmx->guest_msrs[j].mask = -1ull;
+                       vmx->guest_uret_msrs[j].mask = -1ull;
                        break;
                }
-               ++vmx->nmsrs;
+               ++vmx->nr_uret_msrs;
        }
 
        err = alloc_loaded_vmcs(&vmx->vmcs01);
        if (err < 0)
                goto free_pml;
 
+       /* The MSR bitmap starts with all ones */
+       bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+       bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+
        msr_bitmap = vmx->vmcs01.msr_bitmap;
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
-       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+       vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
        if (kvm_cstate_in_guest(vcpu->kvm)) {
-               vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
-               vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
-               vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
-               vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+               vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+               vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+               vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+               vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
        }
        vmx->msr_bitmap_mode = 0;
 
@@ -7015,8 +6906,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        }
 
        if (nested)
-               nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
-                                          vmx_capability.ept);
+               memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
        else
                memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
 
@@ -7336,11 +7226,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                update_intel_pt_cfg(vcpu);
 
        if (boot_cpu_has(X86_FEATURE_RTM)) {
-               struct shared_msr_entry *msr;
-               msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
+               struct vmx_uret_msr *msr;
+               msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
                if (msr) {
                        bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
-                       vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
+                       vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
                }
        }
 }
@@ -7366,14 +7256,14 @@ static __init void vmx_set_cpu_caps(void)
 
        /* CPUID 0xD.1 */
        supported_xss = 0;
-       if (!vmx_xsaves_supported())
+       if (!cpu_has_vmx_xsaves())
                kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
 
        /* CPUID 0x80000001 */
        if (!cpu_has_vmx_rdtscp())
                kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
 
-       if (vmx_waitpkg_supported())
+       if (cpu_has_vmx_waitpkg())
                kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
 }
 
@@ -7429,7 +7319,7 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
         * Because it is marked as EmulateOnUD, we need to intercept it here.
         */
        case x86_intercept_rdtscp:
-               if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+               if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
                        exception->vector = UD_VECTOR;
                        exception->error_code_valid = false;
                        return X86EMUL_PROPAGATE_FAULT;
@@ -7561,107 +7451,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
        kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
 }
 
-static void __pi_post_block(struct kvm_vcpu *vcpu)
-{
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-       struct pi_desc old, new;
-       unsigned int dest;
-
-       do {
-               old.control = new.control = pi_desc->control;
-               WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
-                    "Wakeup handler not enabled while the VCPU is blocked\n");
-
-               dest = cpu_physical_id(vcpu->cpu);
-
-               if (x2apic_enabled())
-                       new.ndst = dest;
-               else
-                       new.ndst = (dest << 8) & 0xFF00;
-
-               /* set 'NV' to 'notification vector' */
-               new.nv = POSTED_INTR_VECTOR;
-       } while (cmpxchg64(&pi_desc->control, old.control,
-                          new.control) != old.control);
-
-       if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
-               spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-               list_del(&vcpu->blocked_vcpu_list);
-               spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-               vcpu->pre_pcpu = -1;
-       }
-}
-
-/*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- *   we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- *      'NDST' <-- vcpu->pre_pcpu
- *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- *   interrupt is posted for this vCPU, we cannot block it, in
- *   this case, return 1, otherwise, return 0.
- *
- */
-static int pi_pre_block(struct kvm_vcpu *vcpu)
-{
-       unsigned int dest;
-       struct pi_desc old, new;
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
-               !kvm_vcpu_apicv_active(vcpu))
-               return 0;
-
-       WARN_ON(irqs_disabled());
-       local_irq_disable();
-       if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
-               vcpu->pre_pcpu = vcpu->cpu;
-               spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-               list_add_tail(&vcpu->blocked_vcpu_list,
-                             &per_cpu(blocked_vcpu_on_cpu,
-                                      vcpu->pre_pcpu));
-               spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-       }
-
-       do {
-               old.control = new.control = pi_desc->control;
-
-               WARN((pi_desc->sn == 1),
-                    "Warning: SN field of posted-interrupts "
-                    "is set before blocking\n");
-
-               /*
-                * Since vCPU can be preempted during this process,
-                * vcpu->cpu could be different with pre_pcpu, we
-                * need to set pre_pcpu as the destination of wakeup
-                * notification event, then we can find the right vCPU
-                * to wakeup in wakeup handler if interrupts happen
-                * when the vCPU is in blocked state.
-                */
-               dest = cpu_physical_id(vcpu->pre_pcpu);
-
-               if (x2apic_enabled())
-                       new.ndst = dest;
-               else
-                       new.ndst = (dest << 8) & 0xFF00;
-
-               /* set 'NV' to 'wakeup vector' */
-               new.nv = POSTED_INTR_WAKEUP_VECTOR;
-       } while (cmpxchg64(&pi_desc->control, old.control,
-                          new.control) != old.control);
-
-       /* We should not block the vCPU if an interrupt is posted for it.  */
-       if (pi_test_on(pi_desc) == 1)
-               __pi_post_block(vcpu);
-
-       local_irq_enable();
-       return (vcpu->pre_pcpu == -1);
-}
-
 static int vmx_pre_block(struct kvm_vcpu *vcpu)
 {
        if (pi_pre_block(vcpu))
@@ -7673,17 +7462,6 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void pi_post_block(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->pre_pcpu == -1)
-               return;
-
-       WARN_ON(irqs_disabled());
-       local_irq_disable();
-       __pi_post_block(vcpu);
-       local_irq_enable();
-}
-
 static void vmx_post_block(struct kvm_vcpu *vcpu)
 {
        if (kvm_x86_ops.set_hv_timer)
@@ -7692,100 +7470,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
        pi_post_block(vcpu);
 }
 
-/*
- * vmx_update_pi_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
-                             uint32_t guest_irq, bool set)
-{
-       struct kvm_kernel_irq_routing_entry *e;
-       struct kvm_irq_routing_table *irq_rt;
-       struct kvm_lapic_irq irq;
-       struct kvm_vcpu *vcpu;
-       struct vcpu_data vcpu_info;
-       int idx, ret = 0;
-
-       if (!kvm_arch_has_assigned_device(kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP) ||
-               !kvm_vcpu_apicv_active(kvm->vcpus[0]))
-               return 0;
-
-       idx = srcu_read_lock(&kvm->irq_srcu);
-       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-       if (guest_irq >= irq_rt->nr_rt_entries ||
-           hlist_empty(&irq_rt->map[guest_irq])) {
-               pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
-                            guest_irq, irq_rt->nr_rt_entries);
-               goto out;
-       }
-
-       hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
-               if (e->type != KVM_IRQ_ROUTING_MSI)
-                       continue;
-               /*
-                * VT-d PI cannot support posting multicast/broadcast
-                * interrupts to a vCPU, we still use interrupt remapping
-                * for these kind of interrupts.
-                *
-                * For lowest-priority interrupts, we only support
-                * those with single CPU as the destination, e.g. user
-                * configures the interrupts via /proc/irq or uses
-                * irqbalance to make the interrupts single-CPU.
-                *
-                * We will support full lowest-priority interrupt later.
-                *
-                * In addition, we can only inject generic interrupts using
-                * the PI mechanism, refuse to route others through it.
-                */
-
-               kvm_set_msi_irq(kvm, e, &irq);
-               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
-                   !kvm_irq_is_postable(&irq)) {
-                       /*
-                        * Make sure the IRTE is in remapped mode if
-                        * we don't handle it in posted mode.
-                        */
-                       ret = irq_set_vcpu_affinity(host_irq, NULL);
-                       if (ret < 0) {
-                               printk(KERN_INFO
-                                  "failed to back to remapped mode, irq: %u\n",
-                                  host_irq);
-                               goto out;
-                       }
-
-                       continue;
-               }
-
-               vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
-               vcpu_info.vector = irq.vector;
-
-               trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
-                               vcpu_info.vector, vcpu_info.pi_desc_addr, set);
-
-               if (set)
-                       ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
-               else
-                       ret = irq_set_vcpu_affinity(host_irq, NULL);
-
-               if (ret < 0) {
-                       printk(KERN_INFO "%s: failed to update PI IRTE\n",
-                                       __func__);
-                       goto out;
-               }
-       }
-
-       ret = 0;
-out:
-       srcu_read_unlock(&kvm->irq_srcu, idx);
-       return ret;
-}
-
 static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 {
        if (vcpu->arch.mcg_cap & MCG_LMCE_P)
@@ -7843,11 +7527,6 @@ static void enable_smi_window(struct kvm_vcpu *vcpu)
        /* RSM will cause a vmexit anyway.  */
 }
 
-static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
-{
-       return false;
-}
-
 static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
 {
        return to_vmx(vcpu)->nested.vmxon;
@@ -7954,7 +7633,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
        .sync_pir_to_irr = vmx_sync_pir_to_irr,
        .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
-       .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
+       .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
 
        .set_tss_addr = vmx_set_tss_addr,
        .set_identity_map_addr = vmx_set_identity_map_addr,
@@ -7988,7 +7667,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .pmu_ops = &intel_pmu_ops,
        .nested_ops = &vmx_nested_ops,
 
-       .update_pi_irte = vmx_update_pi_irte,
+       .update_pi_irte = pi_update_irte,
 
 #ifdef CONFIG_X86_64
        .set_hv_timer = vmx_set_hv_timer,
@@ -8002,9 +7681,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .pre_leave_smm = vmx_pre_leave_smm,
        .enable_smi_window = enable_smi_window,
 
-       .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
+       .can_emulate_instruction = vmx_can_emulate_instruction,
        .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
        .migrate_timers = vmx_migrate_timers,
+
+       .msr_filter_changed = vmx_msr_filter_changed,
 };
 
 static __init int hardware_setup(void)
@@ -8016,8 +7697,8 @@ static __init int hardware_setup(void)
        store_idt(&dt);
        host_idt_base = dt.address;
 
-       for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
-               kvm_define_shared_msr(i, vmx_msr_index[i]);
+       for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+               kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
 
        if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
                return -EIO;
@@ -8154,7 +7835,7 @@ static __init int hardware_setup(void)
                vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
        }
 
-       kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+       kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
 
        kvm_mce_cap_supported |= MCG_LMCE_P;
 
@@ -8293,8 +7974,8 @@ static int __init vmx_init(void)
 
        for_each_possible_cpu(cpu) {
                INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
-               INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
-               spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+
+               pi_init(cpu);
        }
 
 #ifdef CONFIG_KEXEC_CORE
index 26175a4..5961cb8 100644 (file)
@@ -9,8 +9,9 @@
 
 #include "capabilities.h"
 #include "kvm_cache_regs.h"
-#include "ops.h"
+#include "posted_intr.h"
 #include "vmcs.h"
+#include "vmx_ops.h"
 #include "cpuid.h"
 
 extern const u32 vmx_msr_index[];
@@ -22,20 +23,20 @@ extern const u32 vmx_msr_index[];
 #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
 
 #ifdef CONFIG_X86_64
-#define NR_SHARED_MSRS 7
+#define MAX_NR_USER_RETURN_MSRS        7
 #else
-#define NR_SHARED_MSRS 4
+#define MAX_NR_USER_RETURN_MSRS        4
 #endif
 
-#define NR_LOADSTORE_MSRS 8
+#define MAX_NR_LOADSTORE_MSRS  8
 
 struct vmx_msrs {
        unsigned int            nr;
-       struct vmx_msr_entry    val[NR_LOADSTORE_MSRS];
+       struct vmx_msr_entry    val[MAX_NR_LOADSTORE_MSRS];
 };
 
-struct shared_msr_entry {
-       unsigned index;
+struct vmx_uret_msr {
+       unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */
        u64 data;
        u64 mask;
 };
@@ -49,29 +50,6 @@ enum segment_cache_field {
        SEG_FIELD_NR = 4
 };
 
-/* Posted-Interrupt Descriptor */
-struct pi_desc {
-       u32 pir[8];     /* Posted interrupt requested */
-       union {
-               struct {
-                               /* bit 256 - Outstanding Notification */
-                       u16     on      : 1,
-                               /* bit 257 - Suppress Notification */
-                               sn      : 1,
-                               /* bit 271:258 - Reserved */
-                               rsvd_1  : 14;
-                               /* bit 279:272 - Notification Vector */
-                       u8      nv;
-                               /* bit 287:280 - Reserved */
-                       u8      rsvd_2;
-                               /* bit 319:288 - Notification Destination */
-                       u32     ndst;
-               };
-               u64 control;
-       };
-       u32 rsvd[6];
-} __aligned(64);
-
 #define RTIT_ADDR_RANGE                4
 
 struct pt_ctx {
@@ -218,10 +196,10 @@ struct vcpu_vmx {
        u32                   idt_vectoring_info;
        ulong                 rflags;
 
-       struct shared_msr_entry guest_msrs[NR_SHARED_MSRS];
-       int                   nmsrs;
-       int                   save_nmsrs;
-       bool                  guest_msrs_ready;
+       struct vmx_uret_msr   guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
+       int                   nr_uret_msrs;
+       int                   nr_active_uret_msrs;
+       bool                  guest_uret_msrs_loaded;
 #ifdef CONFIG_X86_64
        u64                   msr_host_kernel_gs_base;
        u64                   msr_guest_kernel_gs_base;
@@ -301,6 +279,13 @@ struct vcpu_vmx {
        u64 ept_pointer;
 
        struct pt_desc pt_desc;
+
+       /* Save desired MSR intercept (read: pass-through) state */
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS  13
+       struct {
+               DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+               DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+       } shadow_msr_intercept;
 };
 
 enum ept_pointers_status {
@@ -343,6 +328,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
                   int root_level);
+
 void update_exception_bitmap(struct kvm_vcpu *vcpu);
 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
@@ -350,73 +336,11 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
-struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr);
-void pt_update_intercept_for_msr(struct vcpu_vmx *vmx);
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr);
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu);
 void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
-int vmx_find_msr_index(struct vmx_msrs *m, u32 msr);
-int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
-                             struct x86_exception *e);
-
-#define POSTED_INTR_ON  0
-#define POSTED_INTR_SN  1
-
-static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
-{
-       return test_and_set_bit(POSTED_INTR_ON,
-                       (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
-{
-       return test_and_clear_bit(POSTED_INTR_ON,
-                       (unsigned long *)&pi_desc->control);
-}
-
-static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
-{
-       return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
-}
-
-static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
-{
-       return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
-}
-
-static inline void pi_set_sn(struct pi_desc *pi_desc)
-{
-       set_bit(POSTED_INTR_SN,
-               (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_set_on(struct pi_desc *pi_desc)
-{
-       set_bit(POSTED_INTR_ON,
-               (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_on(struct pi_desc *pi_desc)
-{
-       clear_bit(POSTED_INTR_ON,
-               (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_sn(struct pi_desc *pi_desc)
-{
-       clear_bit(POSTED_INTR_SN,
-               (unsigned long *)&pi_desc->control);
-}
-
-static inline int pi_test_on(struct pi_desc *pi_desc)
-{
-       return test_bit(POSTED_INTR_ON,
-                       (unsigned long *)&pi_desc->control);
-}
-
-static inline int pi_test_sn(struct pi_desc *pi_desc)
-{
-       return test_bit(POSTED_INTR_SN,
-                       (unsigned long *)&pi_desc->control);
-}
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
 
 static inline u8 vmx_get_rvi(void)
 {
@@ -498,11 +422,6 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
        return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
-static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
-{
-       return &(to_vmx(vcpu)->pi_desc);
-}
-
 static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -554,6 +473,19 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
        return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
 }
 
+static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
+{
+       return enable_unrestricted_guest && (!is_guest_mode(vcpu) ||
+           (secondary_exec_controls_get(to_vmx(vcpu)) &
+           SECONDARY_EXEC_UNRESTRICTED_GUEST));
+}
+
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu);
+static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
+{
+       return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu);
+}
+
 void dump_vmcs(void);
 
 #endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
new file mode 100644 (file)
index 0000000..692b0c3
--- /dev/null
@@ -0,0 +1,320 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_INSN_H
+#define __KVM_X86_VMX_INSN_H
+
+#include <linux/nospec.h>
+
+#include <asm/kvm_host.h>
+#include <asm/vmx.h>
+
+#include "evmcs.h"
+#include "vmcs.h"
+
+#define __ex(x) __kvm_handle_fault_on_reboot(x)
+
+asmlinkage void vmread_error(unsigned long field, bool fault);
+__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
+                                                        bool fault);
+void vmwrite_error(unsigned long field, unsigned long value);
+void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
+void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
+void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
+void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
+
+static __always_inline void vmcs_check16(unsigned long field)
+{
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+                        "16-bit accessor invalid for 64-bit field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+                        "16-bit accessor invalid for 64-bit high field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+                        "16-bit accessor invalid for 32-bit high field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+                        "16-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check32(unsigned long field)
+{
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+                        "32-bit accessor invalid for 16-bit field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+                        "32-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check64(unsigned long field)
+{
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+                        "64-bit accessor invalid for 16-bit field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+                        "64-bit accessor invalid for 64-bit high field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+                        "64-bit accessor invalid for 32-bit field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+                        "64-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_checkl(unsigned long field)
+{
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+                        "Natural width accessor invalid for 16-bit field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+                        "Natural width accessor invalid for 64-bit field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+                        "Natural width accessor invalid for 64-bit high field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+                        "Natural width accessor invalid for 32-bit field");
+}
+
+static __always_inline unsigned long __vmcs_readl(unsigned long field)
+{
+       unsigned long value;
+
+       asm volatile("1: vmread %2, %1\n\t"
+                    ".byte 0x3e\n\t" /* branch taken hint */
+                    "ja 3f\n\t"
+
+                    /*
+                     * VMREAD failed.  Push '0' for @fault, push the failing
+                     * @field, and bounce through the trampoline to preserve
+                     * volatile registers.
+                     */
+                    "push $0\n\t"
+                    "push %2\n\t"
+                    "2:call vmread_error_trampoline\n\t"
+
+                    /*
+                     * Unwind the stack.  Note, the trampoline zeros out the
+                     * memory for @fault so that the result is '0' on error.
+                     */
+                    "pop %2\n\t"
+                    "pop %1\n\t"
+                    "3:\n\t"
+
+                    /* VMREAD faulted.  As above, except push '1' for @fault. */
+                    ".pushsection .fixup, \"ax\"\n\t"
+                    "4: push $1\n\t"
+                    "push %2\n\t"
+                    "jmp 2b\n\t"
+                    ".popsection\n\t"
+                    _ASM_EXTABLE(1b, 4b)
+                    : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc");
+       return value;
+}
+
+static __always_inline u16 vmcs_read16(unsigned long field)
+{
+       vmcs_check16(field);
+       if (static_branch_unlikely(&enable_evmcs))
+               return evmcs_read16(field);
+       return __vmcs_readl(field);
+}
+
+static __always_inline u32 vmcs_read32(unsigned long field)
+{
+       vmcs_check32(field);
+       if (static_branch_unlikely(&enable_evmcs))
+               return evmcs_read32(field);
+       return __vmcs_readl(field);
+}
+
+static __always_inline u64 vmcs_read64(unsigned long field)
+{
+       vmcs_check64(field);
+       if (static_branch_unlikely(&enable_evmcs))
+               return evmcs_read64(field);
+#ifdef CONFIG_X86_64
+       return __vmcs_readl(field);
+#else
+       return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
+#endif
+}
+
+static __always_inline unsigned long vmcs_readl(unsigned long field)
+{
+       vmcs_checkl(field);
+       if (static_branch_unlikely(&enable_evmcs))
+               return evmcs_read64(field);
+       return __vmcs_readl(field);
+}
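A short usage sketch of the accessors this header exposes: the accessor width has to match the VMCS field encoding, which the vmcs_check*() helpers above enforce at build time for compile-time-constant fields. The field names are the standard encodings from asm/vmx.h; the wrapper function itself is illustrative and not part of this file:

static inline void vmcs_accessor_usage_sketch(void)
{
        u32 exit_reason = vmcs_read32(VM_EXIT_REASON);           /* 32-bit field  */
        u64 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);           /* 64-bit field  */
        unsigned long rip = vmcs_readl(GUEST_RIP);               /* natural width */

        if (exit_reason || gpa || rip)                           /* consume the reads */
                vmcs_write32(EXCEPTION_BITMAP, 0);
}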
+
+#define vmx_asm1(insn, op1, error_args...)                             \
+do {                                                                   \
+       asm_volatile_goto("1: " __stringify(insn) " %0\n\t"             \
+                         ".byte 0x2e\n\t" /* branch not taken hint */  \
+                         "jna %l[error]\n\t"                           \
+                         _ASM_EXTABLE(1b, %l[fault])                   \
+                         : : op1 : "cc" : error, fault);               \
+       return;                                                         \
+error:                                                                 \
+       instrumentation_begin();                                        \
+       insn##_error(error_args);                                       \
+       instrumentation_end();                                          \
+       return;                                                         \
+fault:                                                                 \
+       kvm_spurious_fault();                                           \
+} while (0)
+
+#define vmx_asm2(insn, op1, op2, error_args...)                                \
+do {                                                                   \
+       asm_volatile_goto("1: "  __stringify(insn) " %1, %0\n\t"        \
+                         ".byte 0x2e\n\t" /* branch not taken hint */  \
+                         "jna %l[error]\n\t"                           \
+                         _ASM_EXTABLE(1b, %l[fault])                   \
+                         : : op1, op2 : "cc" : error, fault);          \
+       return;                                                         \
+error:                                                                 \
+       instrumentation_begin();                                        \
+       insn##_error(error_args);                                       \
+       instrumentation_end();                                          \
+       return;                                                         \
+fault:                                                                 \
+       kvm_spurious_fault();                                           \
+} while (0)
+
+static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
+{
+       vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value);
+}
+
+static __always_inline void vmcs_write16(unsigned long field, u16 value)
+{
+       vmcs_check16(field);
+       if (static_branch_unlikely(&enable_evmcs))
+               return evmcs_write16(field, value);
+
+       __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_write32(unsigned long field, u32 value)
+{
+       vmcs_check32(field);
+       if (static_branch_unlikely(&enable_evmcs))
+               return evmcs_write32(field, value);
+
+       __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_write64(unsigned long field, u64 value)
+{
+       vmcs_check64(field);
+       if (static_branch_unlikely(&enable_evmcs))
+               return evmcs_write64(field, value);
+
+       __vmcs_writel(field, value);
+#ifndef CONFIG_X86_64
+       __vmcs_writel(field+1, value >> 32);
+#endif
+}
+
+static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
+{
+       vmcs_checkl(field);
+       if (static_branch_unlikely(&enable_evmcs))
+               return evmcs_write64(field, value);
+
+       __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
+{
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+                        "vmcs_clear_bits does not support 64-bit fields");
+       if (static_branch_unlikely(&enable_evmcs))
+               return evmcs_write32(field, evmcs_read32(field) & ~mask);
+
+       __vmcs_writel(field, __vmcs_readl(field) & ~mask);
+}
+
+static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
+{
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+                        "vmcs_set_bits does not support 64-bit fields");
+       if (static_branch_unlikely(&enable_evmcs))
+               return evmcs_write32(field, evmcs_read32(field) | mask);
+
+       __vmcs_writel(field, __vmcs_readl(field) | mask);
+}
+
+static inline void vmcs_clear(struct vmcs *vmcs)
+{
+       u64 phys_addr = __pa(vmcs);
+
+       vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr);
+}
+
+static inline void vmcs_load(struct vmcs *vmcs)
+{
+       u64 phys_addr = __pa(vmcs);
+
+       if (static_branch_unlikely(&enable_evmcs))
+               return evmcs_load(phys_addr);
+
+       vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr);
+}
+
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
+{
+       struct {
+               u64 vpid : 16;
+               u64 rsvd : 48;
+               u64 gva;
+       } operand = { vpid, 0, gva };
+
+       vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva);
+}
+
+static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
+{
+       struct {
+               u64 eptp, gpa;
+       } operand = {eptp, gpa};
+
+       vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
+}
+
+static inline void vpid_sync_vcpu_single(int vpid)
+{
+       if (vpid == 0)
+               return;
+
+       __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
+}
+
+static inline void vpid_sync_vcpu_global(void)
+{
+       __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
+static inline void vpid_sync_context(int vpid)
+{
+       if (cpu_has_vmx_invvpid_single())
+               vpid_sync_vcpu_single(vpid);
+       else if (vpid != 0)
+               vpid_sync_vcpu_global();
+}
+
+static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr)
+{
+       if (vpid == 0)
+               return;
+
+       if (cpu_has_vmx_invvpid_individual_addr())
+               __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
+       else
+               vpid_sync_context(vpid);
+}
+
+static inline void ept_sync_global(void)
+{
+       __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+}
+
+static inline void ept_sync_context(u64 eptp)
+{
+       if (cpu_has_vmx_invept_context())
+               __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+       else
+               ept_sync_global();
+}
+
+#endif /* __KVM_X86_VMX_INSN_H */
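
A minimal usage sketch for the accessors above (illustrative only, not part of the patch; it assumes a VMCS is currently loaded and uses the standard field encodings GUEST_RIP and VM_EXIT_INSTRUCTION_LEN from asm/vmx.h):

    /* Illustrative sketch: advance the guest RIP past an emulated instruction. */
    static void example_skip_emulated_instruction(void)
    {
            unsigned long rip = vmcs_readl(GUEST_RIP);
            u32 len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);

            /* A failed VMWRITE lands in vmwrite_error(); a faulting one in kvm_spurious_fault(). */
            vmcs_writel(GUEST_RIP, rip + len);
    }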
index 7527022..c4015a4 100644 (file)
@@ -71,6 +71,7 @@
 #include <asm/irq_remapping.h>
 #include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
 #include <asm/intel_pt.h>
 #include <asm/emulate_prefix.h>
 #include <clocksource/hyperv_timer.h>
@@ -161,24 +162,29 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
 int __read_mostly pi_inject_timer = -1;
 module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
 
-#define KVM_NR_SHARED_MSRS 16
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
 
-struct kvm_shared_msrs_global {
+struct kvm_user_return_msrs_global {
        int nr;
-       u32 msrs[KVM_NR_SHARED_MSRS];
+       u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
 };
 
-struct kvm_shared_msrs {
+struct kvm_user_return_msrs {
        struct user_return_notifier urn;
        bool registered;
-       struct kvm_shared_msr_values {
+       struct kvm_user_return_msr_values {
                u64 host;
                u64 curr;
-       } values[KVM_NR_SHARED_MSRS];
+       } values[KVM_MAX_NR_USER_RETURN_MSRS];
 };
 
-static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static struct kvm_shared_msrs __percpu *shared_msrs;
+static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+static struct kvm_user_return_msrs __percpu *user_return_msrs;
 
 #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
                                | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
@@ -266,7 +272,7 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
        } else {
                vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n",
                                       op, msr, data);
-               return 1;
+               return -ENOENT;
        }
 }
 
@@ -293,9 +299,9 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
        unsigned slot;
-       struct kvm_shared_msrs *locals
-               = container_of(urn, struct kvm_shared_msrs, urn);
-       struct kvm_shared_msr_values *values;
+       struct kvm_user_return_msrs *msrs
+               = container_of(urn, struct kvm_user_return_msrs, urn);
+       struct kvm_user_return_msr_values *values;
        unsigned long flags;
 
        /*
@@ -303,73 +309,73 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
         * interrupted and executed through kvm_arch_hardware_disable()
         */
        local_irq_save(flags);
-       if (locals->registered) {
-               locals->registered = false;
+       if (msrs->registered) {
+               msrs->registered = false;
                user_return_notifier_unregister(urn);
        }
        local_irq_restore(flags);
-       for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
-               values = &locals->values[slot];
+       for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+               values = &msrs->values[slot];
                if (values->host != values->curr) {
-                       wrmsrl(shared_msrs_global.msrs[slot], values->host);
+                       wrmsrl(user_return_msrs_global.msrs[slot], values->host);
                        values->curr = values->host;
                }
        }
 }
 
-void kvm_define_shared_msr(unsigned slot, u32 msr)
+void kvm_define_user_return_msr(unsigned slot, u32 msr)
 {
-       BUG_ON(slot >= KVM_NR_SHARED_MSRS);
-       shared_msrs_global.msrs[slot] = msr;
-       if (slot >= shared_msrs_global.nr)
-               shared_msrs_global.nr = slot + 1;
+       BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
+       user_return_msrs_global.msrs[slot] = msr;
+       if (slot >= user_return_msrs_global.nr)
+               user_return_msrs_global.nr = slot + 1;
 }
-EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
+EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
 
-static void kvm_shared_msr_cpu_online(void)
+static void kvm_user_return_msr_cpu_online(void)
 {
        unsigned int cpu = smp_processor_id();
-       struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+       struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
        u64 value;
        int i;
 
-       for (i = 0; i < shared_msrs_global.nr; ++i) {
-               rdmsrl_safe(shared_msrs_global.msrs[i], &value);
-               smsr->values[i].host = value;
-               smsr->values[i].curr = value;
+       for (i = 0; i < user_return_msrs_global.nr; ++i) {
+               rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+               msrs->values[i].host = value;
+               msrs->values[i].curr = value;
        }
 }
 
-int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
 {
        unsigned int cpu = smp_processor_id();
-       struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+       struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
        int err;
 
-       value = (value & mask) | (smsr->values[slot].host & ~mask);
-       if (value == smsr->values[slot].curr)
+       value = (value & mask) | (msrs->values[slot].host & ~mask);
+       if (value == msrs->values[slot].curr)
                return 0;
-       err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+       err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
        if (err)
                return 1;
 
-       smsr->values[slot].curr = value;
-       if (!smsr->registered) {
-               smsr->urn.on_user_return = kvm_on_user_return;
-               user_return_notifier_register(&smsr->urn);
-               smsr->registered = true;
+       msrs->values[slot].curr = value;
+       if (!msrs->registered) {
+               msrs->urn.on_user_return = kvm_on_user_return;
+               user_return_notifier_register(&msrs->urn);
+               msrs->registered = true;
        }
        return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
 
 static void drop_user_return_notifiers(void)
 {
        unsigned int cpu = smp_processor_id();
-       struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+       struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 
-       if (smsr->registered)
-               kvm_on_user_return(&smsr->urn);
+       if (msrs->registered)
+               kvm_on_user_return(&msrs->urn);
 }
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
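
The renamed helpers keep the previous calling convention; a hedged sketch of how vendor code is expected to use them (the slot number and the MSR_TSC_AUX choice below are illustrative, not taken from this patch):

    /* Illustrative sketch: register a user-return MSR once, switch it on vCPU load. */
    #define EXAMPLE_URET_SLOT 0

    static void example_hardware_setup(void)
    {
            kvm_define_user_return_msr(EXAMPLE_URET_SLOT, MSR_TSC_AUX);
    }

    static void example_prepare_switch_to_guest(u64 guest_val)
    {
            /* The host value is restored lazily by kvm_on_user_return(). */
            kvm_set_user_return_msr(EXAMPLE_URET_SLOT, guest_val, -1ull);
    }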
@@ -1482,6 +1488,40 @@ void kvm_enable_efer_bits(u64 mask)
 }
 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
+       u32 count = kvm->arch.msr_filter.count;
+       u32 i;
+       bool r = kvm->arch.msr_filter.default_allow;
+       int idx;
+
+       /* MSR filtering not set up, allow everything */
+       if (!count)
+               return true;
+
+       /* Prevent collision with set_msr_filter */
+       idx = srcu_read_lock(&kvm->srcu);
+
+       for (i = 0; i < count; i++) {
+               u32 start = ranges[i].base;
+               u32 end = start + ranges[i].nmsrs;
+               u32 flags = ranges[i].flags;
+               unsigned long *bitmap = ranges[i].bitmap;
+
+               if ((index >= start) && (index < end) && (flags & type)) {
+                       r = !!test_bit(index - start, bitmap);
+                       break;
+               }
+       }
+
+       srcu_read_unlock(&kvm->srcu, idx);
+
+       return r;
+}
+EXPORT_SYMBOL_GPL(kvm_msr_allowed);
+
 /*
  * Write @data into the MSR specified by @index.  Select MSR specific fault
  * checks are bypassed if @host_initiated is %true.
@@ -1493,6 +1533,9 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
 {
        struct msr_data msr;
 
+       if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
+               return -EPERM;
+
        switch (index) {
        case MSR_FS_BASE:
        case MSR_GS_BASE:
@@ -1549,6 +1592,9 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
        struct msr_data msr;
        int ret;
 
+       if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
+               return -EPERM;
+
        msr.index = index;
        msr.host_initiated = host_initiated;
 
@@ -1584,12 +1630,91 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
 }
 EXPORT_SYMBOL_GPL(kvm_set_msr);
 
+static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
+{
+       if (vcpu->run->msr.error) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       } else if (is_read) {
+               kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
+               kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
+       }
+
+       return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+       return complete_emulated_msr(vcpu, true);
+}
+
+static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
+{
+       return complete_emulated_msr(vcpu, false);
+}
+
+static u64 kvm_msr_reason(int r)
+{
+       switch (r) {
+       case -ENOENT:
+               return KVM_MSR_EXIT_REASON_UNKNOWN;
+       case -EPERM:
+               return KVM_MSR_EXIT_REASON_FILTER;
+       default:
+               return KVM_MSR_EXIT_REASON_INVAL;
+       }
+}
+
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
+                             u32 exit_reason, u64 data,
+                             int (*completion)(struct kvm_vcpu *vcpu),
+                             int r)
+{
+       u64 msr_reason = kvm_msr_reason(r);
+
+       /* Check if the user wanted to know about this MSR fault */
+       if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
+               return 0;
+
+       vcpu->run->exit_reason = exit_reason;
+       vcpu->run->msr.error = 0;
+       memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
+       vcpu->run->msr.reason = msr_reason;
+       vcpu->run->msr.index = index;
+       vcpu->run->msr.data = data;
+       vcpu->arch.complete_userspace_io = completion;
+
+       return 1;
+}
+
+static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
+{
+       return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
+                                  complete_emulated_rdmsr, r);
+}
+
+static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
+{
+       return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
+                                  complete_emulated_wrmsr, r);
+}
+
 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
 {
        u32 ecx = kvm_rcx_read(vcpu);
        u64 data;
+       int r;
 
-       if (kvm_get_msr(vcpu, ecx, &data)) {
+       r = kvm_get_msr(vcpu, ecx, &data);
+
+       /* MSR read failed? See if we should ask user space */
+       if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
+               /* Bounce to user space */
+               return 0;
+       }
+
+       /* MSR read failed? Inject a #GP */
+       if (r) {
                trace_kvm_msr_read_ex(ecx);
                kvm_inject_gp(vcpu, 0);
                return 1;
@@ -1607,8 +1732,18 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 {
        u32 ecx = kvm_rcx_read(vcpu);
        u64 data = kvm_read_edx_eax(vcpu);
+       int r;
+
+       r = kvm_set_msr(vcpu, ecx, data);
+
+       /* MSR write failed? See if we should ask user space */
+       if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) {
+               /* Bounce to user space */
+               return 0;
+       }
 
-       if (kvm_set_msr(vcpu, ecx, data)) {
+       /* MSR write failed? Inject a #GP */
+       if (r) {
                trace_kvm_msr_write_ex(ecx, data);
                kvm_inject_gp(vcpu, 0);
                return 1;
@@ -1774,12 +1909,6 @@ static s64 get_kvmclock_base_ns(void)
 }
 #endif
 
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
-{
-       kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
-       kvm_vcpu_kick(vcpu);
-}
-
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
        int version;
@@ -1978,12 +2107,6 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 #endif
 }
 
-static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
-{
-       u64 curr_offset = vcpu->arch.l1_tsc_offset;
-       vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
-}
-
 /*
  * Multiply tsc by a fixed point number represented by ratio.
  *
@@ -2045,14 +2168,13 @@ static inline bool kvm_check_tsc_unstable(void)
        return check_tsc_unstable();
 }
 
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
 {
        struct kvm *kvm = vcpu->kvm;
        u64 offset, ns, elapsed;
        unsigned long flags;
        bool matched;
        bool already_matched;
-       u64 data = msr->data;
        bool synchronizing = false;
 
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -2061,7 +2183,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
        elapsed = ns - kvm->arch.last_tsc_nsec;
 
        if (vcpu->arch.virtual_tsc_khz) {
-               if (data == 0 && msr->host_initiated) {
+               if (data == 0) {
                        /*
                         * detection of vcpu initialization -- need to sync
                         * with other vCPUs. This particularly helps to keep
@@ -2131,9 +2253,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
        vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
        vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
 
-       if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
-               update_ia32_tsc_adjust_msr(vcpu, offset);
-
        kvm_vcpu_write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
@@ -2148,8 +2267,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
        spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
 }
 
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
-
 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
                                           s64 adjustment)
 {
@@ -2731,7 +2848,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
                return 1;
 
        if (!lapic_in_kernel(vcpu))
-               return 1;
+               return data ? 1 : 0;
 
        vcpu->arch.apf.msr_en_val = data;
 
@@ -2944,7 +3061,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vcpu->arch.msr_ia32_power_ctl = data;
                break;
        case MSR_IA32_TSC:
-               kvm_write_tsc(vcpu, msr_info);
+               if (msr_info->host_initiated) {
+                       kvm_synchronize_tsc(vcpu, data);
+               } else {
+                       u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+                       adjust_tsc_offset_guest(vcpu, adj);
+                       vcpu->arch.ia32_tsc_adjust_msr += adj;
+               }
                break;
        case MSR_IA32_XSS:
                if (!msr_info->host_initiated &&
@@ -3221,9 +3344,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_POWER_CTL:
                msr_info->data = vcpu->arch.msr_ia32_power_ctl;
                break;
-       case MSR_IA32_TSC:
-               msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+       case MSR_IA32_TSC: {
+               /*
+                * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
+                * even when not intercepted. AMD's manual doesn't explicitly
+                * state this, but the hardware appears to behave the same.
+                *
+                * Unconditionally return L1's TSC offset on userspace reads
+                * so that userspace reads and writes always operate on L1's
+                * offset, e.g. to ensure deterministic behavior for migration.
+                */
+               u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
+                                                           vcpu->arch.tsc_offset;
+
+               msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
                break;
+       }
        case MSR_MTRRcap:
        case 0x200 ... 0x2ff:
                return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -3513,6 +3649,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_EXCEPTION_PAYLOAD:
        case KVM_CAP_SET_GUEST_DEBUG:
        case KVM_CAP_LAST_CPU:
+       case KVM_CAP_X86_USER_SPACE_MSR:
+       case KVM_CAP_X86_MSR_FILTER:
                r = 1;
                break;
        case KVM_CAP_SYNC_REGS:
@@ -5033,6 +5171,10 @@ split_irqchip_unlock:
                kvm->arch.exception_payload_enabled = cap->args[0];
                r = 0;
                break;
+       case KVM_CAP_X86_USER_SPACE_MSR:
+               kvm->arch.user_space_msr_mask = cap->args[0];
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
@@ -5040,6 +5182,103 @@ split_irqchip_unlock:
        return r;
 }
 
+static void kvm_clear_msr_filter(struct kvm *kvm)
+{
+       u32 i;
+       u32 count = kvm->arch.msr_filter.count;
+       struct msr_bitmap_range ranges[16];
+
+       mutex_lock(&kvm->lock);
+       kvm->arch.msr_filter.count = 0;
+       memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0]));
+       mutex_unlock(&kvm->lock);
+       synchronize_srcu(&kvm->srcu);
+
+       for (i = 0; i < count; i++)
+               kfree(ranges[i].bitmap);
+}
+
+static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range)
+{
+       struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
+       struct msr_bitmap_range range;
+       unsigned long *bitmap = NULL;
+       size_t bitmap_size;
+       int r;
+
+       if (!user_range->nmsrs)
+               return 0;
+
+       bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
+       if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
+               return -EINVAL;
+
+       bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
+       if (IS_ERR(bitmap))
+               return PTR_ERR(bitmap);
+
+       range = (struct msr_bitmap_range) {
+               .flags = user_range->flags,
+               .base = user_range->base,
+               .nmsrs = user_range->nmsrs,
+               .bitmap = bitmap,
+       };
+
+       if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
+               r = -EINVAL;
+               goto err;
+       }
+
+       if (!range.flags) {
+               r = -EINVAL;
+               goto err;
+       }
+
+       /* Everything ok, add this range identifier to our global pool */
+       ranges[kvm->arch.msr_filter.count] = range;
+       /* Make sure we filled the array before we tell anyone to walk it */
+       smp_wmb();
+       kvm->arch.msr_filter.count++;
+
+       return 0;
+err:
+       kfree(bitmap);
+       return r;
+}
+
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
+{
+       struct kvm_msr_filter __user *user_msr_filter = argp;
+       struct kvm_msr_filter filter;
+       bool default_allow;
+       int r = 0;
+       u32 i;
+
+       if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
+               return -EFAULT;
+
+       kvm_clear_msr_filter(kvm);
+
+       default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY);
+       kvm->arch.msr_filter.default_allow = default_allow;
+
+       /*
+        * Protect from concurrent calls to this function that could trigger
+        * a TOCTOU violation on kvm->arch.msr_filter.count.
+        */
+       mutex_lock(&kvm->lock);
+       for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
+               r = kvm_add_msr_filter(kvm, &filter.ranges[i]);
+               if (r)
+                       break;
+       }
+
+       kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
+       mutex_unlock(&kvm->lock);
+
+       return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
@@ -5346,6 +5585,9 @@ set_pit2_out:
        case KVM_SET_PMU_EVENT_FILTER:
                r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
                break;
+       case KVM_X86_SET_MSR_FILTER:
+               r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
+               break;
        default:
                r = -ENOTTY;
        }
@@ -5707,6 +5949,9 @@ int handle_ud(struct kvm_vcpu *vcpu)
        char sig[5]; /* ud2; .ascii "kvm" */
        struct x86_exception e;
 
+       if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
+               return 1;
+
        if (force_emulation_prefix &&
            kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
                                sig, sizeof(sig), &e) == 0 &&
@@ -6362,13 +6607,33 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
                            u32 msr_index, u64 *pdata)
 {
-       return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+       int r;
+
+       r = kvm_get_msr(vcpu, msr_index, pdata);
+
+       if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
+               /* Bounce to user space */
+               return X86EMUL_IO_NEEDED;
+       }
+
+       return r;
 }
 
 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
                            u32 msr_index, u64 data)
 {
-       return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+       int r;
+
+       r = kvm_set_msr(vcpu, msr_index, data);
+
+       if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
+               /* Bounce to user space */
+               return X86EMUL_IO_NEEDED;
+       }
+
+       return r;
 }
 
 static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
@@ -6912,7 +7177,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        int r;
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        bool writeback = true;
-       bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+       bool write_fault_to_spt;
+
+       if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
+               return 1;
 
        vcpu->arch.l1tf_flush_l1d = true;
 
@@ -6920,6 +7188,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
         * Clear write_fault_to_shadow_pgtable here to ensure it is
         * never reused.
         */
+       write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
        vcpu->arch.write_fault_to_shadow_pgtable = false;
        kvm_clear_exception_queue(vcpu);
 
@@ -7514,9 +7783,9 @@ int kvm_arch_init(void *opaque)
                goto out_free_x86_fpu_cache;
        }
 
-       shared_msrs = alloc_percpu(struct kvm_shared_msrs);
-       if (!shared_msrs) {
-               printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
+       user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
+       if (!user_return_msrs) {
+               printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
                goto out_free_x86_emulator_cache;
        }
 
@@ -7549,7 +7818,7 @@ int kvm_arch_init(void *opaque)
        return 0;
 
 out_free_percpu:
-       free_percpu(shared_msrs);
+       free_percpu(user_return_msrs);
 out_free_x86_emulator_cache:
        kmem_cache_destroy(x86_emulator_cache);
 out_free_x86_fpu_cache:
@@ -7576,7 +7845,7 @@ void kvm_arch_exit(void)
 #endif
        kvm_x86_ops.hardware_enable = NULL;
        kvm_mmu_module_exit();
-       free_percpu(shared_msrs);
+       free_percpu(user_return_msrs);
        kmem_cache_destroy(x86_fpu_cache);
 }
 
@@ -8365,8 +8634,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        bool req_immediate_exit = false;
 
        if (kvm_request_pending(vcpu)) {
-               if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
-                       if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) {
+               if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+                       if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
                                r = 0;
                                goto out;
                        }
@@ -8473,6 +8742,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        kvm_vcpu_update_apicv(vcpu);
                if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
                        kvm_check_async_pf_completion(vcpu);
+               if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
+                       kvm_x86_ops.msr_filter_changed(vcpu);
        }
 
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -8548,7 +8819,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                kvm_x86_ops.request_immediate_exit(vcpu);
        }
 
-       trace_kvm_entry(vcpu->vcpu_id);
+       trace_kvm_entry(vcpu);
 
        fpregs_assert_state_consistent();
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -9562,7 +9833,6 @@ fail_mmu_destroy:
 
 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
-       struct msr_data msr;
        struct kvm *kvm = vcpu->kvm;
 
        kvm_hv_vcpu_postcreate(vcpu);
@@ -9570,10 +9840,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
        if (mutex_lock_killable(&vcpu->mutex))
                return;
        vcpu_load(vcpu);
-       msr.data = 0x0;
-       msr.index = MSR_IA32_TSC;
-       msr.host_initiated = true;
-       kvm_write_tsc(vcpu, &msr);
+       kvm_synchronize_tsc(vcpu, 0);
        vcpu_put(vcpu);
 
        /* poll control enabled by default */
@@ -9707,7 +9974,7 @@ int kvm_arch_hardware_enable(void)
        u64 max_tsc = 0;
        bool stable, backwards_tsc = false;
 
-       kvm_shared_msr_cpu_online();
+       kvm_user_return_msr_cpu_online();
        ret = kvm_x86_ops.hardware_enable();
        if (ret != 0)
                return ret;
@@ -10025,6 +10292,8 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+       u32 i;
+
        if (current->mm == kvm->mm) {
                /*
                 * Free memory regions allocated on behalf of userspace,
@@ -10041,6 +10310,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        }
        if (kvm_x86_ops.vm_destroy)
                kvm_x86_ops.vm_destroy(kvm);
+       for (i = 0; i < kvm->arch.msr_filter.count; i++)
+               kfree(kvm->arch.msr_filter.ranges[i].bitmap);
        kvm_pic_destroy(kvm);
        kvm_ioapic_destroy(kvm);
        kvm_free_vcpus(kvm);
@@ -10771,6 +11042,111 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c
 }
 EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
 
+/*
+ * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
+ * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
+ * indicates whether exit to userspace is needed.
+ */
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+                             struct x86_exception *e)
+{
+       if (r == X86EMUL_PROPAGATE_FAULT) {
+               kvm_inject_emulated_page_fault(vcpu, e);
+               return 1;
+       }
+
+       /*
+        * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
+        * while handling a VMX instruction, KVM could've handled the request
+        * correctly by exiting to userspace and performing I/O, but there
+        * doesn't seem to be a real use-case behind such requests; just return
+        * KVM_EXIT_INTERNAL_ERROR for now.
+        */
+       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+       vcpu->run->internal.ndata = 0;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
+
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
+{
+       bool pcid_enabled;
+       struct x86_exception e;
+       unsigned i;
+       unsigned long roots_to_free = 0;
+       struct {
+               u64 pcid;
+               u64 gla;
+       } operand;
+       int r;
+
+       r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+       if (r != X86EMUL_CONTINUE)
+               return kvm_handle_memory_failure(vcpu, r, &e);
+
+       if (operand.pcid >> 12 != 0) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+       switch (type) {
+       case INVPCID_TYPE_INDIV_ADDR:
+               if ((!pcid_enabled && (operand.pcid != 0)) ||
+                   is_noncanonical_address(operand.gla, vcpu)) {
+                       kvm_inject_gp(vcpu, 0);
+                       return 1;
+               }
+               kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
+               return kvm_skip_emulated_instruction(vcpu);
+
+       case INVPCID_TYPE_SINGLE_CTXT:
+               if (!pcid_enabled && (operand.pcid != 0)) {
+                       kvm_inject_gp(vcpu, 0);
+                       return 1;
+               }
+
+               if (kvm_get_active_pcid(vcpu) == operand.pcid) {
+                       kvm_mmu_sync_roots(vcpu);
+                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+               }
+
+               for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
+                           == operand.pcid)
+                               roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
+               /*
+                * If neither the current cr3 nor any of the prev_roots use the
+                * given PCID, then nothing needs to be done here because a
+                * resync will happen anyway before switching to any other CR3.
+                */
+
+               return kvm_skip_emulated_instruction(vcpu);
+
+       case INVPCID_TYPE_ALL_NON_GLOBAL:
+               /*
+                * Currently, KVM doesn't mark global entries in the shadow
+                * page tables, so a non-global flush just degenerates to a
+                * global flush. If needed, we could optimize this later by
+                * keeping track of global entries in shadow page tables.
+                */
+
+               fallthrough;
+       case INVPCID_TYPE_ALL_INCL_GLOBAL:
+               kvm_mmu_unload(vcpu);
+               return kvm_skip_emulated_instruction(vcpu);
+
+       default:
+               BUG(); /* We have already checked above that type <= 3 */
+       }
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
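
kvm_handle_invpcid() is exported so that both vendor modules can share the emulation; a hedged sketch of the expected caller shape (a hypothetical handler — the real VMX/SVM intercepts live in their respective exit handlers and must validate the type first, per the BUG() comment above):

    /* Illustrative sketch: an INVPCID intercept built on the common helper. */
    static int example_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
    {
            if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
                    kvm_queue_exception(vcpu, UD_VECTOR);
                    return 1;
            }

            if (type > 3) {
                    kvm_inject_gp(vcpu, 0);
                    return 1;
            }

            return kvm_handle_invpcid(vcpu, type, gva);
    }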
index 995ab69..3900ab0 100644 (file)
@@ -246,7 +246,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
        return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu);
 }
 
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -372,6 +371,10 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
 int kvm_spec_ctrl_test_value(u64 value);
 int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu);
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+                             struct x86_exception *e);
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
 
 #define  KVM_MSR_RET_INVALID  2
 
index 9417a34..26cfb0f 100644 (file)
@@ -17,7 +17,7 @@
        ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL),   \
        ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\
        ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI),          \
-       ERSN(HYPERV), ERSN(ARM_NISV)
+       ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR)
 
 TRACE_EVENT(kvm_userspace_exit,
            TP_PROTO(__u32 reason, int errno),
index 3d80234..58f43aa 100644 (file)
@@ -248,6 +248,8 @@ struct kvm_hyperv_exit {
 #define KVM_EXIT_IOAPIC_EOI       26
 #define KVM_EXIT_HYPERV           27
 #define KVM_EXIT_ARM_NISV         28
+#define KVM_EXIT_X86_RDMSR        29
+#define KVM_EXIT_X86_WRMSR        30
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -413,6 +415,17 @@ struct kvm_run {
                        __u64 esr_iss;
                        __u64 fault_ipa;
                } arm_nisv;
+               /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */
+               struct {
+                       __u8 error; /* user -> kernel */
+                       __u8 pad[7];
+#define KVM_MSR_EXIT_REASON_INVAL      (1 << 0)
+#define KVM_MSR_EXIT_REASON_UNKNOWN    (1 << 1)
+#define KVM_MSR_EXIT_REASON_FILTER     (1 << 2)
+                       __u32 reason; /* kernel -> user */
+                       __u32 index; /* kernel -> user */
+                       __u64 data; /* kernel <-> user */
+               } msr;
                /* Fix the size of the union. */
                char padding[256];
        };
@@ -790,9 +803,10 @@ struct kvm_ppc_resize_hpt {
 #define KVM_VM_PPC_HV 1
 #define KVM_VM_PPC_PR 2
 
-/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */
-#define KVM_VM_MIPS_TE         0
+/* on MIPS, 0 indicates auto, 1 forces VZ ASE, 2 forces trap & emulate */
+#define KVM_VM_MIPS_AUTO       0
 #define KVM_VM_MIPS_VZ         1
+#define KVM_VM_MIPS_TE         2
 
 #define KVM_S390_SIE_PAGE_OFFSET 1
 
@@ -1036,6 +1050,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SMALLER_MAXPHYADDR 185
 #define KVM_CAP_S390_DIAG318 186
 #define KVM_CAP_STEAL_TIME 187
+#define KVM_CAP_X86_USER_SPACE_MSR 188
+#define KVM_CAP_X86_MSR_FILTER 189
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1537,6 +1553,9 @@ struct kvm_pv_cmd {
 /* Available with KVM_CAP_S390_PROTECTED */
 #define KVM_S390_PV_COMMAND            _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd)
 
+/* Available with KVM_CAP_X86_MSR_FILTER */
+#define KVM_X86_SET_MSR_FILTER _IOW(KVMIO,  0xc6, struct kvm_msr_filter)
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
        /* Guest initialization commands */
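
A hedged userspace sketch tying the additions above together (assumes <sys/ioctl.h> and <linux/kvm.h>; vm_fd and run are the usual VM file descriptor and mmap'ed struct kvm_run; emulate_rdmsr()/emulate_wrmsr() are hypothetical VMM helpers, not part of this patch):

    /* Illustrative sketch: enable userspace MSR exits, then service them in the run loop. */
    static void example_enable_user_msr_exits(int vm_fd)
    {
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_X86_USER_SPACE_MSR,
                    .args[0] = KVM_MSR_EXIT_REASON_UNKNOWN | KVM_MSR_EXIT_REASON_FILTER,
            };

            ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }

    static void example_handle_msr_exit(struct kvm_run *run)
    {
            switch (run->exit_reason) {
            case KVM_EXIT_X86_RDMSR:
                    /* A non-zero run->msr.error makes KVM inject #GP on re-entry. */
                    run->msr.error = emulate_rdmsr(run->msr.index, &run->msr.data);
                    break;
            case KVM_EXIT_X86_WRMSR:
                    run->msr.error = emulate_wrmsr(run->msr.index, run->msr.data);
                    break;
            }
    }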
index 4527871..307ceaa 100644 (file)
@@ -11,6 +11,7 @@
 /x86_64/set_sregs_test
 /x86_64/smm_test
 /x86_64/state_test
+/x86_64/user_msr_test
 /x86_64/vmx_preemption_timer_test
 /x86_64/svm_vmcall_test
 /x86_64/sync_regs_test
index 4a16658..7ebe71f 100644 (file)
@@ -55,6 +55,8 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
 TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
 TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
+TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
+TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test
 TEST_GEN_PROGS_x86_64 += clear_dirty_log_test
 TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
index 16fa21e..54d624d 100644 (file)
@@ -48,7 +48,7 @@
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
 #define SECONDARY_EXEC_ENABLE_EPT              0x00000002
 #define SECONDARY_EXEC_DESC                    0x00000004
-#define SECONDARY_EXEC_RDTSCP                  0x00000008
+#define SECONDARY_EXEC_ENABLE_RDTSCP           0x00000008
 #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE  0x00000010
 #define SECONDARY_EXEC_ENABLE_VPID             0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING          0x00000040
diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
new file mode 100644 (file)
index 0000000..f8e7611
--- /dev/null
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests for MSR_IA32_TSC and MSR_IA32_TSC_ADJUST.
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+#define UNITY                  (1ull << 30)
+#define HOST_ADJUST            (UNITY * 64)
+#define GUEST_STEP             (UNITY * 4)
+#define ROUND(x)               ((x + UNITY / 2) & -UNITY)
+#define rounded_rdmsr(x)       ROUND(rdmsr(x))
+#define rounded_host_rdmsr(x)  ROUND(vcpu_get_msr(vm, 0, x))
+
+#define GUEST_ASSERT_EQ(a, b) do {                             \
+       __typeof(a) _a = (a);                                   \
+       __typeof(b) _b = (b);                                   \
+       if (_a != _b)                                           \
+                ucall(UCALL_ABORT, 4,                          \
+                        "Failed guest assert: "                        \
+                        #a " == " #b, __LINE__, _a, _b);       \
+  } while(0)
+
+static void guest_code(void)
+{
+       u64 val = 0;
+
+       GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+       GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+       /* Guest: writes to MSR_IA32_TSC affect both MSRs.  */
+       val = 1ull * GUEST_STEP;
+       wrmsr(MSR_IA32_TSC, val);
+       GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+       GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+       /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs.  */
+       GUEST_SYNC(2);
+       val = 2ull * GUEST_STEP;
+       wrmsr(MSR_IA32_TSC_ADJUST, val);
+       GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+       GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+       /* Host: setting the TSC offset.  */
+       GUEST_SYNC(3);
+       GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+       GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+       /*
+        * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the
+        * host-side offset, but still affect both MSRs.
+        */
+       GUEST_SYNC(4);
+       val = 3ull * GUEST_STEP;
+       wrmsr(MSR_IA32_TSC_ADJUST, val);
+       GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+       GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+       /*
+        * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side
+        * offset is now visible in MSR_IA32_TSC_ADJUST.
+        */
+       GUEST_SYNC(5);
+       val = 4ull * GUEST_STEP;
+       wrmsr(MSR_IA32_TSC, val);
+       GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+       GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST);
+
+       GUEST_DONE();
+}
+
+static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
+{
+       struct ucall uc;
+
+       vcpu_args_set(vm, vcpuid, 1, vcpuid);
+
+       vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL);
+
+       switch (get_ucall(vm, vcpuid, &uc)) {
+       case UCALL_SYNC:
+               TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+                            uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
+                            stage + 1, (ulong)uc.args[1]);
+               return;
+       case UCALL_DONE:
+               return;
+       case UCALL_ABORT:
+               TEST_ASSERT(false, "%s at %s:%ld\n" \
+                           "\tvalues: %#lx, %#lx", (const char *)uc.args[0],
+                           __FILE__, uc.args[1], uc.args[2], uc.args[3]);
+       default:
+               TEST_ASSERT(false, "Unexpected exit: %s",
+                           exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
+       }
+}
+
+int main(void)
+{
+       struct kvm_vm *vm;
+       uint64_t val;
+
+       vm = vm_create_default(VCPU_ID, 0, guest_code);
+       vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+       val = 0;
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+       /* Guest: writes to MSR_IA32_TSC affect both MSRs.  */
+       run_vcpu(vm, VCPU_ID, 1);
+       val = 1ull * GUEST_STEP;
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+       /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs.  */
+       run_vcpu(vm, VCPU_ID, 2);
+       val = 2ull * GUEST_STEP;
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+       /*
+        * Host: writes to MSR_IA32_TSC set the host-side offset
+        * and therefore do not change MSR_IA32_TSC_ADJUST.
+        */
+       vcpu_set_msr(vm, 0, MSR_IA32_TSC, HOST_ADJUST + val);
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+       run_vcpu(vm, VCPU_ID, 3);
+
+       /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC.  */
+       vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, UNITY * 123456);
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+       ASSERT_EQ(vcpu_get_msr(vm, 0, MSR_IA32_TSC_ADJUST), UNITY * 123456);
+
+       /* Restore previous value.  */
+       vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, val);
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+       /*
+        * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the
+        * host-side offset, but still affect both MSRs.
+        */
+       run_vcpu(vm, VCPU_ID, 4);
+       val = 3ull * GUEST_STEP;
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+       /*
+        * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side
+        * offset is now visible in MSR_IA32_TSC_ADJUST.
+        */
+       run_vcpu(vm, VCPU_ID, 5);
+       val = 4ull * GUEST_STEP;
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+       ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST);
+
+       kvm_vm_free(vm);
+
+       return 0;
+}
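
For reference, the arithmetic this test relies on (a summary of the MSR_IA32_TSC handling added to x86.c above, not new behavior): a guest WRMSR of value V computes adj = (V - current scaled host TSC) - old L1 offset and applies adj to both the TSC offset and IA32_TSC_ADJUST, so the two MSRs move together, while a host-initiated write goes through kvm_synchronize_tsc() and changes only the offset. That is why, after the host write of HOST_ADJUST + val, the guest sees the bump in MSR_IA32_TSC but not in MSR_IA32_TSC_ADJUST, and why the guest's final wrmsr(MSR_IA32_TSC, 4 * GUEST_STEP) leaves MSR_IA32_TSC_ADJUST at val - HOST_ADJUST.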
diff --git a/tools/testing/selftests/kvm/x86_64/user_msr_test.c b/tools/testing/selftests/kvm/x86_64/user_msr_test.c
new file mode 100644 (file)
index 0000000..cbe1b08
--- /dev/null
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tests for KVM_CAP_X86_USER_SPACE_MSR and KVM_X86_SET_MSR_FILTER
+ *
+ * Copyright (C) 2020, Amazon Inc.
+ *
+ * This is a functional test to verify that we can deflect MSR events
+ * into user space.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID                  5
+
+static u32 msr_reads, msr_writes;
+
+static u8 bitmap_00000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_00000000_write[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_40000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_c0000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_c0000000_read[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_deadbeef[1] = { 0x1 };
+
+static void deny_msr(uint8_t *bitmap, u32 msr)
+{
+       u32 idx = msr & (KVM_MSR_FILTER_MAX_BITMAP_SIZE - 1);
+
+       bitmap[idx / 8] &= ~(1 << (idx % 8));
+}
+
+static void prepare_bitmaps(void)
+{
+       memset(bitmap_00000000, 0xff, sizeof(bitmap_00000000));
+       memset(bitmap_00000000_write, 0xff, sizeof(bitmap_00000000_write));
+       memset(bitmap_40000000, 0xff, sizeof(bitmap_40000000));
+       memset(bitmap_c0000000, 0xff, sizeof(bitmap_c0000000));
+       memset(bitmap_c0000000_read, 0xff, sizeof(bitmap_c0000000_read));
+
+       deny_msr(bitmap_00000000_write, MSR_IA32_POWER_CTL);
+       deny_msr(bitmap_c0000000_read, MSR_SYSCALL_MASK);
+       deny_msr(bitmap_c0000000_read, MSR_GS_BASE);
+}
+
+struct kvm_msr_filter filter = {
+       .flags = KVM_MSR_FILTER_DEFAULT_DENY,
+       .ranges = {
+               {
+                       .flags = KVM_MSR_FILTER_READ,
+                       .base = 0x00000000,
+                       .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+                       .bitmap = bitmap_00000000,
+               }, {
+                       .flags = KVM_MSR_FILTER_WRITE,
+                       .base = 0x00000000,
+                       .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+                       .bitmap = bitmap_00000000_write,
+               }, {
+                       .flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
+                       .base = 0x40000000,
+                       .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+                       .bitmap = bitmap_40000000,
+               }, {
+                       .flags = KVM_MSR_FILTER_READ,
+                       .base = 0xc0000000,
+                       .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+                       .bitmap = bitmap_c0000000_read,
+               }, {
+                       .flags = KVM_MSR_FILTER_WRITE,
+                       .base = 0xc0000000,
+                       .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+                       .bitmap = bitmap_c0000000,
+               }, {
+                       .flags = KVM_MSR_FILTER_WRITE | KVM_MSR_FILTER_READ,
+                       .base = 0xdeadbeef,
+                       .nmsrs = 1,
+                       .bitmap = bitmap_deadbeef,
+               },
+       },
+};
+
+struct kvm_msr_filter no_filter = {
+       .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+};
+
+static void guest_msr_calls(bool trapped)
+{
+       /* This goes into the in-kernel emulation */
+       wrmsr(MSR_SYSCALL_MASK, 0);
+
+       if (trapped) {
+               /* This goes into user space emulation */
+               GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) == MSR_SYSCALL_MASK);
+               GUEST_ASSERT(rdmsr(MSR_GS_BASE) == MSR_GS_BASE);
+       } else {
+               GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) != MSR_SYSCALL_MASK);
+               GUEST_ASSERT(rdmsr(MSR_GS_BASE) != MSR_GS_BASE);
+       }
+
+       /* If trapped == true, this goes into user space emulation */
+       wrmsr(MSR_IA32_POWER_CTL, 0x1234);
+
+       /* This goes into the in-kernel emulation */
+       rdmsr(MSR_IA32_POWER_CTL);
+
+       /* Invalid MSR, should always be handled by user space exit */
+       GUEST_ASSERT(rdmsr(0xdeadbeef) == 0xdeadbeef);
+       wrmsr(0xdeadbeef, 0x1234);
+}
+
+static void guest_code(void)
+{
+       guest_msr_calls(true);
+
+       /*
+        * Disable MSR filtering, so that the kernel
+        * handles everything in the next round.
+        */
+       GUEST_SYNC(0);
+
+       guest_msr_calls(false);
+
+       GUEST_DONE();
+}
+
+static int handle_ucall(struct kvm_vm *vm)
+{
+       struct ucall uc;
+
+       switch (get_ucall(vm, VCPU_ID, &uc)) {
+       case UCALL_ABORT:
+               TEST_FAIL("Guest assertion not met");
+               break;
+       case UCALL_SYNC:
+               vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &no_filter);
+               break;
+       case UCALL_DONE:
+               return 1;
+       default:
+               TEST_FAIL("Unknown ucall %lu", uc.cmd);
+       }
+
+       return 0;
+}
+
+static void handle_rdmsr(struct kvm_run *run)
+{
+       run->msr.data = run->msr.index;
+       msr_reads++;
+
+       if (run->msr.index == MSR_SYSCALL_MASK ||
+           run->msr.index == MSR_GS_BASE) {
+               TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER,
+                           "MSR read trap w/o access fault");
+       }
+
+       if (run->msr.index == 0xdeadbeef) {
+               TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN,
+                           "MSR deadbeef read trap w/o inval fault");
+       }
+}
+
+static void handle_wrmsr(struct kvm_run *run)
+{
+       /* ignore */
+       msr_writes++;
+
+       if (run->msr.index == MSR_IA32_POWER_CTL) {
+               TEST_ASSERT(run->msr.data == 0x1234,
+                           "MSR data for MSR_IA32_POWER_CTL incorrect");
+               TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER,
+                           "MSR_IA32_POWER_CTL trap w/o access fault");
+       }
+
+       if (run->msr.index == 0xdeadbeef) {
+               TEST_ASSERT(run->msr.data == 0x1234,
+                           "MSR data for deadbeef incorrect");
+               TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN,
+                           "deadbeef trap w/o inval fault");
+       }
+}
+
+int main(int argc, char *argv[])
+{
+       struct kvm_enable_cap cap = {
+               .cap = KVM_CAP_X86_USER_SPACE_MSR,
+               .args[0] = KVM_MSR_EXIT_REASON_INVAL |
+                          KVM_MSR_EXIT_REASON_UNKNOWN |
+                          KVM_MSR_EXIT_REASON_FILTER,
+       };
+       struct kvm_vm *vm;
+       struct kvm_run *run;
+       int rc;
+
+       /* Tell stdout not to buffer its content */
+       setbuf(stdout, NULL);
+
+       /* Create VM */
+       vm = vm_create_default(VCPU_ID, 0, guest_code);
+       vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+       run = vcpu_state(vm, VCPU_ID);
+
+       rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
+       TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
+       vm_enable_cap(vm, &cap);
+
+       rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER);
+       TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available");
+
+       prepare_bitmaps();
+       vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter);
+
+       while (1) {
+               rc = _vcpu_run(vm, VCPU_ID);
+
+               TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);
+
+               switch (run->exit_reason) {
+               case KVM_EXIT_X86_RDMSR:
+                       handle_rdmsr(run);
+                       break;
+               case KVM_EXIT_X86_WRMSR:
+                       handle_wrmsr(run);
+                       break;
+               case KVM_EXIT_IO:
+                       if (handle_ucall(vm))
+                               goto done;
+                       break;
+               }
+
+       }
+
+done:
+       TEST_ASSERT(msr_reads == 4, "Handled 4 rdmsr in user space");
+       TEST_ASSERT(msr_writes == 3, "Handled 3 wrmsr in user space");
+
+       kvm_vm_free(vm);
+
+       return 0;
+}
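
For reference, how the final counts come about: with the filter installed, the guest's denied rdmsr of MSR_SYSCALL_MASK and MSR_GS_BASE plus the unknown-MSR rdmsr of 0xdeadbeef bounce to user space (3 reads), and the denied wrmsr of MSR_IA32_POWER_CTL plus the unknown wrmsr of 0xdeadbeef do too (2 writes); after the filter is dropped at GUEST_SYNC(0), only the two 0xdeadbeef accesses still exit, giving the asserted totals of 4 reads and 3 writes.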
index d6408bb..c2323c2 100644 (file)
@@ -853,15 +853,17 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
        struct eventfd_ctx       *eventfd;
        struct kvm_io_bus        *bus;
        int                       ret = -ENOENT;
+       bool                      wildcard;
 
        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);
 
+       wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
+
        mutex_lock(&kvm->slots_lock);
 
        list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
-               bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
 
                if (p->bus_idx != bus_idx ||
                    p->eventfd != eventfd  ||
index 67cd0b8..68edd25 100644 (file)
@@ -4332,7 +4332,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                               struct kvm_io_device *dev)
 {
-       int i;
+       int i, j;
        struct kvm_io_bus *new_bus, *bus;
 
        bus = kvm_get_bus(kvm, bus_idx);
@@ -4349,17 +4349,20 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 
        new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
                          GFP_KERNEL_ACCOUNT);
-       if (!new_bus)  {
+       if (new_bus) {
+               memcpy(new_bus, bus, struct_size(bus, range, i));
+               new_bus->dev_count--;
+               memcpy(new_bus->range + i, bus->range + i + 1,
+                               flex_array_size(new_bus, range, new_bus->dev_count - i));
+       } else {
                pr_err("kvm: failed to shrink bus, removing it completely\n");
-               goto broken;
+               for (j = 0; j < bus->dev_count; j++) {
+                       if (j == i)
+                               continue;
+                       kvm_iodevice_destructor(bus->range[j].dev);
+               }
        }
 
-       memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
-       new_bus->dev_count--;
-       memcpy(new_bus->range + i, bus->range + i + 1,
-              (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
-
-broken:
        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        synchronize_srcu_expedited(&kvm->srcu);
        kfree(bus);