Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author     Linus Torvalds <torvalds@linux-foundation.org>  Mon, 10 May 2021 19:30:45 +0000 (12:30 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>  Mon, 10 May 2021 19:30:45 +0000 (12:30 -0700)
Pull kvm fixes from Paolo Bonzini:

 - Lots of bug fixes.

 - Fix virtualization of RDPID

 - Virtualization of DR6_BUS_LOCK, which on bare metal is new to this
   release

 - More nested virtualization migration fixes (nSVM and eVMCS)

 - Fix for KVM guest hibernation

 - Fix for warning in SEV-ES SRCU usage

 - Block KVM from loading on AMD machines with 5-level page tables, due
   to the APM not specifying exactly how host CR4.LA57 impacts the
   guest.

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (48 commits)
  KVM: SVM: Move GHCB unmapping to fix RCU warning
  KVM: SVM: Invert user pointer casting in SEV {en,de}crypt helpers
  kvm: Cap halt polling at kvm->max_halt_poll_ns
  tools/kvm_stat: Fix documentation typo
  KVM: x86: Prevent deadlock against tk_core.seq
  KVM: x86: Cancel pvclock_gtod_work on module removal
  KVM: x86: Prevent KVM SVM from loading on kernels with 5-level paging
  KVM: X86: Expose bus lock debug exception to guest
  KVM: X86: Add support for the emulation of DR6_BUS_LOCK bit
  KVM: PPC: Book3S HV: Fix conversion to gfn-based MMU notifier callbacks
  KVM: x86: Hide RDTSCP and RDPID if MSR_TSC_AUX probing failed
  KVM: x86: Tie Intel and AMD behavior for MSR_TSC_AUX to guest CPU model
  KVM: x86: Move uret MSR slot management to common x86
  KVM: x86: Export the number of uret MSRs to vendor modules
  KVM: VMX: Disable loading of TSX_CTRL MSR the more conventional way
  KVM: VMX: Use common x86's uret MSR list as the one true list
  KVM: VMX: Use flag to indicate "active" uret MSRs instead of sorting list
  KVM: VMX: Configure list of user return MSRs at module init
  KVM: x86: Add support for RDPID without RDTSCP
  KVM: SVM: Probe and load MSR_TSC_AUX regardless of RDTSCP support in host
  ...

25 files changed:
Documentation/virt/kvm/api.rst
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/kvm_para.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/kernel/kvm.c
arch/x86/kernel/kvmclock.c
arch/x86/kvm/cpuid.c
arch/x86/kvm/emulate.c
arch/x86/kvm/kvm_emulate.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/vmx/capabilities.h
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/x86.c
tools/kvm/kvm_stat/kvm_stat.txt
tools/testing/selftests/kvm/lib/x86_64/handlers.S
tools/testing/selftests/kvm/x86_64/evmcs_test.c
virt/kvm/kvm_main.c

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 22d0775..7fcb2fd 100644
@@ -4803,7 +4803,7 @@ KVM_PV_VM_VERIFY
 4.126 KVM_X86_SET_MSR_FILTER
 ----------------------------
 
-:Capability: KVM_X86_SET_MSR_FILTER
+:Capability: KVM_CAP_X86_MSR_FILTER
 :Architectures: x86
 :Type: vm ioctl
 :Parameters: struct kvm_msr_filter
@@ -6715,7 +6715,7 @@ accesses that would usually trigger a #GP by KVM into the guest will
 instead get bounced to user space through the KVM_EXIT_X86_RDMSR and
 KVM_EXIT_X86_WRMSR exit notifications.
 
-8.27 KVM_X86_SET_MSR_FILTER
+8.27 KVM_CAP_X86_MSR_FILTER
 ---------------------------
 
 :Architectures: x86
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cbbcee0..55efbac 100644
 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
 
 #define UNMAPPED_GVA (~(gpa_t)0)
+#define INVALID_GPA (~(gpa_t)0)
 
 /* KVM Hugepage definitions for x86 */
 #define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G
@@ -199,6 +200,7 @@ enum x86_intercept_stage;
 
 #define KVM_NR_DB_REGS 4
 
+#define DR6_BUS_LOCK   (1 << 11)
 #define DR6_BD         (1 << 13)
 #define DR6_BS         (1 << 14)
 #define DR6_BT         (1 << 15)
@@ -212,7 +214,7 @@ enum x86_intercept_stage;
  * DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
  */
 #define DR6_ACTIVE_LOW 0xffff0ff0
-#define DR6_VOLATILE   0x0001e00f
+#define DR6_VOLATILE   0x0001e80f
 #define DR6_FIXED_1    (DR6_ACTIVE_LOW & ~DR6_VOLATILE)
 
 #define DR7_BP_EN_MASK 0x000000ff
@@ -407,7 +409,7 @@ struct kvm_mmu {
        u32 pkru_mask;
 
        u64 *pae_root;
-       u64 *lm_root;
+       u64 *pml4_root;
 
        /*
         * check zero bits on shadow page table entries, these
@@ -1417,6 +1419,7 @@ struct kvm_arch_async_pf {
        bool direct_map;
 };
 
+extern u32 __read_mostly kvm_nr_uret_msrs;
 extern u64 __read_mostly host_efer;
 extern bool __read_mostly allow_smaller_maxphyaddr;
 extern struct kvm_x86_ops kvm_x86_ops;
@@ -1775,9 +1778,15 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
                    unsigned long ipi_bitmap_high, u32 min,
                    unsigned long icr, int op_64_bit);
 
-void kvm_define_user_return_msr(unsigned index, u32 msr);
+int kvm_add_user_return_msr(u32 msr);
+int kvm_find_user_return_msr(u32 msr);
 int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
 
+static inline bool kvm_is_supported_user_return_msr(u32 msr)
+{
+       return kvm_find_user_return_msr(msr) >= 0;
+}
+
 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
 
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 3381198..6929987 100644
@@ -7,8 +7,6 @@
 #include <linux/interrupt.h>
 #include <uapi/asm/kvm_para.h>
 
-extern void kvmclock_init(void);
-
 #ifdef CONFIG_KVM_GUEST
 bool kvm_check_and_clear_guest_paused(void);
 #else
@@ -86,13 +84,14 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 }
 
 #ifdef CONFIG_KVM_GUEST
+void kvmclock_init(void);
+void kvmclock_disable(void);
 bool kvm_para_available(void);
 unsigned int kvm_arch_para_features(void);
 unsigned int kvm_arch_para_hints(void);
 void kvm_async_pf_task_wait_schedule(u32 token);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_apf_flags(void);
-void kvm_disable_steal_time(void);
 bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
 
 DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@@ -137,11 +136,6 @@ static inline u32 kvm_read_and_reset_apf_flags(void)
        return 0;
 }
 
-static inline void kvm_disable_steal_time(void)
-{
-       return;
-}
-
 static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 {
        return false;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5a3022c..0662f64 100644
@@ -437,6 +437,8 @@ struct kvm_vmx_nested_state_hdr {
                __u16 flags;
        } smm;
 
+       __u16 pad;
+
        __u32 flags;
        __u64 preemption_timer_deadline;
 };
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d307c22..a26643d 100644
@@ -26,6 +26,7 @@
 #include <linux/kprobes.h>
 #include <linux/nmi.h>
 #include <linux/swait.h>
+#include <linux/syscore_ops.h>
 #include <asm/timer.h>
 #include <asm/cpu.h>
 #include <asm/traps.h>
@@ -37,6 +38,7 @@
 #include <asm/tlb.h>
 #include <asm/cpuidle_haltpoll.h>
 #include <asm/ptrace.h>
+#include <asm/reboot.h>
 #include <asm/svm.h>
 
 DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@@ -345,7 +347,7 @@ static void kvm_guest_cpu_init(void)
 
                wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
                __this_cpu_write(apf_reason.enabled, 1);
-               pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
+               pr_info("setup async PF for cpu %d\n", smp_processor_id());
        }
 
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
@@ -371,34 +373,17 @@ static void kvm_pv_disable_apf(void)
        wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
        __this_cpu_write(apf_reason.enabled, 0);
 
-       pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id());
+       pr_info("disable async PF for cpu %d\n", smp_processor_id());
 }
 
-static void kvm_pv_guest_cpu_reboot(void *unused)
+static void kvm_disable_steal_time(void)
 {
-       /*
-        * We disable PV EOI before we load a new kernel by kexec,
-        * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
-        * New kernel can re-enable when it boots.
-        */
-       if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
-               wrmsrl(MSR_KVM_PV_EOI_EN, 0);
-       kvm_pv_disable_apf();
-       kvm_disable_steal_time();
-}
+       if (!has_steal_clock)
+               return;
 
-static int kvm_pv_reboot_notify(struct notifier_block *nb,
-                               unsigned long code, void *unused)
-{
-       if (code == SYS_RESTART)
-               on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
-       return NOTIFY_DONE;
+       wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
 }
 
-static struct notifier_block kvm_pv_reboot_nb = {
-       .notifier_call = kvm_pv_reboot_notify,
-};
-
 static u64 kvm_steal_clock(int cpu)
 {
        u64 steal;
@@ -416,14 +401,6 @@ static u64 kvm_steal_clock(int cpu)
        return steal;
 }
 
-void kvm_disable_steal_time(void)
-{
-       if (!has_steal_clock)
-               return;
-
-       wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
-}
-
 static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
 {
        early_set_memory_decrypted((unsigned long) ptr, size);
@@ -451,6 +428,27 @@ static void __init sev_map_percpu_data(void)
        }
 }
 
+static void kvm_guest_cpu_offline(bool shutdown)
+{
+       kvm_disable_steal_time();
+       if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+               wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+       kvm_pv_disable_apf();
+       if (!shutdown)
+               apf_task_wake_all();
+       kvmclock_disable();
+}
+
+static int kvm_cpu_online(unsigned int cpu)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       kvm_guest_cpu_init();
+       local_irq_restore(flags);
+       return 0;
+}
+
 #ifdef CONFIG_SMP
 
 static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
@@ -635,31 +633,64 @@ static void __init kvm_smp_prepare_boot_cpu(void)
        kvm_spinlock_init();
 }
 
-static void kvm_guest_cpu_offline(void)
+static int kvm_cpu_down_prepare(unsigned int cpu)
 {
-       kvm_disable_steal_time();
-       if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
-               wrmsrl(MSR_KVM_PV_EOI_EN, 0);
-       kvm_pv_disable_apf();
-       apf_task_wake_all();
+       unsigned long flags;
+
+       local_irq_save(flags);
+       kvm_guest_cpu_offline(false);
+       local_irq_restore(flags);
+       return 0;
 }
 
-static int kvm_cpu_online(unsigned int cpu)
+#endif
+
+static int kvm_suspend(void)
 {
-       local_irq_disable();
-       kvm_guest_cpu_init();
-       local_irq_enable();
+       kvm_guest_cpu_offline(false);
+
        return 0;
 }
 
-static int kvm_cpu_down_prepare(unsigned int cpu)
+static void kvm_resume(void)
 {
-       local_irq_disable();
-       kvm_guest_cpu_offline();
-       local_irq_enable();
-       return 0;
+       kvm_cpu_online(raw_smp_processor_id());
+}
+
+static struct syscore_ops kvm_syscore_ops = {
+       .suspend        = kvm_suspend,
+       .resume         = kvm_resume,
+};
+
+static void kvm_pv_guest_cpu_reboot(void *unused)
+{
+       kvm_guest_cpu_offline(true);
+}
+
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+                               unsigned long code, void *unused)
+{
+       if (code == SYS_RESTART)
+               on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
+       return NOTIFY_DONE;
 }
 
+static struct notifier_block kvm_pv_reboot_nb = {
+       .notifier_call = kvm_pv_reboot_notify,
+};
+
+/*
+ * After a PV feature is registered, the host will keep writing to the
+ * registered memory location. If the guest happens to shut down, this memory
+ * won't be valid. In cases like kexec, in which you install a new kernel, this
+ * means a random memory location will keep being written.
+ */
+#ifdef CONFIG_KEXEC_CORE
+static void kvm_crash_shutdown(struct pt_regs *regs)
+{
+       kvm_guest_cpu_offline(true);
+       native_machine_crash_shutdown(regs);
+}
 #endif
 
 static void __init kvm_guest_init(void)
@@ -704,6 +735,12 @@ static void __init kvm_guest_init(void)
        kvm_guest_cpu_init();
 #endif
 
+#ifdef CONFIG_KEXEC_CORE
+       machine_ops.crash_shutdown = kvm_crash_shutdown;
+#endif
+
+       register_syscore_ops(&kvm_syscore_ops);
+
        /*
         * Hard lockup detection is enabled by default. Disable it, as guests
         * can get false positives too easily, for example if the host is
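
(The kvm_cpu_online()/kvm_cpu_down_prepare() callbacks above are consumed by code outside the hunks shown; the sketch below paraphrases the existing registration in kvm_guest_init() and is illustrative rather than an exact quote of that file.)

#include <linux/cpuhotplug.h>
#include <linux/printk.h>

/* Sketch: wire kvm_cpu_online()/kvm_cpu_down_prepare() into CPU hotplug. */
static void __init example_register_cpu_hotplug(void)
{
	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
				      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
		pr_err("failed to install cpu hotplug callbacks\n");
}
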
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d37ed4e..ad273e5 100644
@@ -20,7 +20,6 @@
 #include <asm/hypervisor.h>
 #include <asm/mem_encrypt.h>
 #include <asm/x86_init.h>
-#include <asm/reboot.h>
 #include <asm/kvmclock.h>
 
 static int kvmclock __initdata = 1;
@@ -203,28 +202,9 @@ static void kvm_setup_secondary_clock(void)
 }
 #endif
 
-/*
- * After the clock is registered, the host will keep writing to the
- * registered memory location. If the guest happens to shutdown, this memory
- * won't be valid. In cases like kexec, in which you install a new kernel, this
- * means a random memory location will be kept being written. So before any
- * kind of shutdown from our side, we unregister the clock by writing anything
- * that does not have the 'enable' bit set in the msr
- */
-#ifdef CONFIG_KEXEC_CORE
-static void kvm_crash_shutdown(struct pt_regs *regs)
-{
-       native_write_msr(msr_kvm_system_time, 0, 0);
-       kvm_disable_steal_time();
-       native_machine_crash_shutdown(regs);
-}
-#endif
-
-static void kvm_shutdown(void)
+void kvmclock_disable(void)
 {
        native_write_msr(msr_kvm_system_time, 0, 0);
-       kvm_disable_steal_time();
-       native_machine_shutdown();
 }
 
 static void __init kvmclock_init_mem(void)
@@ -351,10 +331,6 @@ void __init kvmclock_init(void)
 #endif
        x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
        x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
-       machine_ops.shutdown  = kvm_shutdown;
-#ifdef CONFIG_KEXEC_CORE
-       machine_ops.crash_shutdown  = kvm_crash_shutdown;
-#endif
        kvm_get_preset_lpj();
 
        /*
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 19606a3..9a48f13 100644
@@ -458,7 +458,7 @@ void kvm_set_cpu_caps(void)
                F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
                F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
                F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
-               F(SGX_LC)
+               F(SGX_LC) | F(BUS_LOCK_DETECT)
        );
        /* Set LA57 based on hardware capability. */
        if (cpuid_ecx(7) & F(LA57))
@@ -567,6 +567,21 @@ void kvm_set_cpu_caps(void)
                F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
                F(PMM) | F(PMM_EN)
        );
+
+       /*
+        * Hide RDTSCP and RDPID if either feature is reported as supported but
+        * probing MSR_TSC_AUX failed.  This is purely a sanity check and
+        * should never happen, but the guest will likely crash if RDTSCP or
+        * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in
+        * the past.  For example, the sanity check may fire if this instance of
+        * KVM is running as L1 on top of an older, broken KVM.
+        */
+       if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) ||
+                    kvm_cpu_cap_has(X86_FEATURE_RDPID)) &&
+                    !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) {
+               kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+               kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+       }
 }
 EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
 
@@ -637,7 +652,8 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
        case 7:
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                entry->eax = 0;
-               entry->ecx = F(RDPID);
+               if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+                       entry->ecx = F(RDPID);
                ++array->nent;
        default:
                break;
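
(As background on the RDPID change above: RDPID simply returns the current value of IA32_TSC_AUX without the serialization overhead of RDTSCP, which is what KVM must be able to emulate before advertising the bit. A small, illustrative user-space sketch of the guest-visible semantics follows; it assumes an x86-64 toolchain whose assembler knows the rdpid mnemonic.)

#include <stdint.h>
#include <stdio.h>

/* RDPID reads IA32_TSC_AUX, which the kernel typically programs with the
 * CPU (and node) number.  Requires CPUID.(EAX=7,ECX=0):ECX[22]. */
static inline uint64_t rdpid(void)
{
	uint64_t aux;

	asm volatile("rdpid %0" : "=r"(aux));
	return aux;
}

int main(void)
{
	printf("IA32_TSC_AUX = %llu\n", (unsigned long long)rdpid());
	return 0;
}
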
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 77e1c89..8a0ccdb 100644
@@ -4502,7 +4502,7 @@ static const struct opcode group8[] = {
  * from the register case of group9.
  */
 static const struct gprefix pfx_0f_c7_7 = {
-       N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp),
+       N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdpid),
 };
 
 
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 0d35911..f016838 100644
@@ -468,6 +468,7 @@ enum x86_intercept {
        x86_intercept_clgi,
        x86_intercept_skinit,
        x86_intercept_rdtscp,
+       x86_intercept_rdpid,
        x86_intercept_icebp,
        x86_intercept_wbinvd,
        x86_intercept_monitor,
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 152591f..c0ebef5 100644
@@ -1913,8 +1913,8 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
        if (!apic->lapic_timer.hv_timer_in_use)
                goto out;
        WARN_ON(rcuwait_active(&vcpu->wait));
-       cancel_hv_timer(apic);
        apic_timer_expired(apic, false);
+       cancel_hv_timer(apic);
 
        if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
                advance_periodic_target_expiration(apic);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4b3ee24..0144c40 100644
@@ -3310,12 +3310,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
        if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
                pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
-               if (WARN_ON_ONCE(!mmu->lm_root)) {
+               if (WARN_ON_ONCE(!mmu->pml4_root)) {
                        r = -EIO;
                        goto out_unlock;
                }
 
-               mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask;
+               mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
        }
 
        for (i = 0; i < 4; ++i) {
@@ -3335,7 +3335,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
        }
 
        if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
-               mmu->root_hpa = __pa(mmu->lm_root);
+               mmu->root_hpa = __pa(mmu->pml4_root);
        else
                mmu->root_hpa = __pa(mmu->pae_root);
 
@@ -3350,7 +3350,7 @@ out_unlock:
 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *mmu = vcpu->arch.mmu;
-       u64 *lm_root, *pae_root;
+       u64 *pml4_root, *pae_root;
 
        /*
         * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
@@ -3369,14 +3369,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
        if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
                return -EIO;
 
-       if (mmu->pae_root && mmu->lm_root)
+       if (mmu->pae_root && mmu->pml4_root)
                return 0;
 
        /*
         * The special roots should always be allocated in concert.  Yell and
         * bail if KVM ends up in a state where only one of the roots is valid.
         */
-       if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root))
+       if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
                return -EIO;
 
        /*
@@ -3387,14 +3387,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
        if (!pae_root)
                return -ENOMEM;
 
-       lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
-       if (!lm_root) {
+       pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+       if (!pml4_root) {
                free_page((unsigned long)pae_root);
                return -ENOMEM;
        }
 
        mmu->pae_root = pae_root;
-       mmu->lm_root = lm_root;
+       mmu->pml4_root = pml4_root;
 
        return 0;
 }
@@ -5261,7 +5261,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
        if (!tdp_enabled && mmu->pae_root)
                set_memory_encrypted((unsigned long)mmu->pae_root, 1);
        free_page((unsigned long)mmu->pae_root);
-       free_page((unsigned long)mmu->lm_root);
+       free_page((unsigned long)mmu->pml4_root);
 }
 
 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 88f69a6..95eeb5a 100644
@@ -388,7 +388,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
 }
 
 /**
- * handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * __handle_changed_spte - handle bookkeeping associated with an SPTE change
  * @kvm: kvm instance
  * @as_id: the address space of the paging structure the SPTE was a part of
  * @gfn: the base GFN that was mapped by the SPTE
@@ -444,6 +444,13 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 
        trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
 
+       if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
+               if (is_large_pte(old_spte))
+                       atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
+               else
+                       atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
+       }
+
        /*
         * The only times a SPTE should be changed from a non-present to
         * non-present state is when an MMIO entry is installed/modified/
@@ -1009,6 +1016,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                }
 
                if (!is_shadow_present_pte(iter.old_spte)) {
+                       /*
+                        * If SPTE has been frozen by another thread, just
+                        * give up and retry, avoiding unnecessary page table
+                        * allocation and free.
+                        */
+                       if (is_removed_spte(iter.old_spte))
+                               break;
+
                        sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
                        child_pt = sp->spt;
 
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 540d43b..5e8d844 100644
@@ -764,7 +764,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
 
        svm_switch_vmcb(svm, &svm->vmcb01);
-       WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN);
 
        /*
         * On vmexit the  GIF is set to false and
@@ -872,6 +871,15 @@ void svm_free_nested(struct vcpu_svm *svm)
        __free_page(virt_to_page(svm->nested.vmcb02.ptr));
        svm->nested.vmcb02.ptr = NULL;
 
+       /*
+        * When last_vmcb12_gpa matches the current vmcb12 gpa,
+        * some vmcb12 fields are not loaded if they are marked clean
+        * in the vmcb12, since in this case they are up to date already.
+        *
+        * When the vmcb02 is freed, this optimization becomes invalid.
+        */
+       svm->nested.last_vmcb12_gpa = INVALID_GPA;
+
        svm->nested.initialized = false;
 }
 
@@ -884,9 +892,11 @@ void svm_leave_nested(struct vcpu_svm *svm)
 
        if (is_guest_mode(vcpu)) {
                svm->nested.nested_run_pending = 0;
+               svm->nested.vmcb12_gpa = INVALID_GPA;
+
                leave_guest_mode(vcpu);
 
-               svm_switch_vmcb(svm, &svm->nested.vmcb02);
+               svm_switch_vmcb(svm, &svm->vmcb01);
 
                nested_svm_uninit_mmu_context(vcpu);
                vmcb_mark_all_dirty(svm->vmcb);
@@ -1298,12 +1308,17 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
         * L2 registers if needed are moved from the current VMCB to VMCB02.
         */
 
+       if (is_guest_mode(vcpu))
+               svm_leave_nested(svm);
+       else
+               svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+       svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+
        svm->nested.nested_run_pending =
                !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
 
        svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
-       if (svm->current_vmcb == &svm->vmcb01)
-               svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
 
        svm->vmcb01.ptr->save.es = save->es;
        svm->vmcb01.ptr->save.cs = save->cs;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 1356ee0..5bc887e 100644
@@ -763,7 +763,7 @@ static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
 }
 
 static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
-                                 unsigned long __user dst_uaddr,
+                                 void __user *dst_uaddr,
                                  unsigned long dst_paddr,
                                  int size, int *err)
 {
@@ -787,8 +787,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
 
        if (tpage) {
                offset = paddr & 15;
-               if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
-                                page_address(tpage) + offset, size))
+               if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
                        ret = -EFAULT;
        }
 
@@ -800,9 +799,9 @@ e_free:
 }
 
 static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
-                                 unsigned long __user vaddr,
+                                 void __user *vaddr,
                                  unsigned long dst_paddr,
-                                 unsigned long __user dst_vaddr,
+                                 void __user *dst_vaddr,
                                  int size, int *error)
 {
        struct page *src_tpage = NULL;
@@ -810,13 +809,12 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
        int ret, len = size;
 
        /* If source buffer is not aligned then use an intermediate buffer */
-       if (!IS_ALIGNED(vaddr, 16)) {
+       if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
                src_tpage = alloc_page(GFP_KERNEL);
                if (!src_tpage)
                        return -ENOMEM;
 
-               if (copy_from_user(page_address(src_tpage),
-                               (void __user *)(uintptr_t)vaddr, size)) {
+               if (copy_from_user(page_address(src_tpage), vaddr, size)) {
                        __free_page(src_tpage);
                        return -EFAULT;
                }
@@ -830,7 +828,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
         *   - copy the source buffer in an intermediate buffer
         *   - use the intermediate buffer as source buffer
         */
-       if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+       if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
                int dst_offset;
 
                dst_tpage = alloc_page(GFP_KERNEL);
@@ -855,7 +853,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
                               page_address(src_tpage), size);
                else {
                        if (copy_from_user(page_address(dst_tpage) + dst_offset,
-                                          (void __user *)(uintptr_t)vaddr, size)) {
+                                          vaddr, size)) {
                                ret = -EFAULT;
                                goto e_free;
                        }
@@ -935,15 +933,15 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
                if (dec)
                        ret = __sev_dbg_decrypt_user(kvm,
                                                     __sme_page_pa(src_p[0]) + s_off,
-                                                    dst_vaddr,
+                                                    (void __user *)dst_vaddr,
                                                     __sme_page_pa(dst_p[0]) + d_off,
                                                     len, &argp->error);
                else
                        ret = __sev_dbg_encrypt_user(kvm,
                                                     __sme_page_pa(src_p[0]) + s_off,
-                                                    vaddr,
+                                                    (void __user *)vaddr,
                                                     __sme_page_pa(dst_p[0]) + d_off,
-                                                    dst_vaddr,
+                                                    (void __user *)dst_vaddr,
                                                     len, &argp->error);
 
                sev_unpin_memory(kvm, src_p, n);
@@ -1764,7 +1762,8 @@ e_mirror_unlock:
 e_source_unlock:
        mutex_unlock(&source_kvm->lock);
 e_source_put:
-       fput(source_kvm_file);
+       if (source_kvm_file)
+               fput(source_kvm_file);
        return ret;
 }
 
@@ -2198,7 +2197,7 @@ vmgexit_err:
        return -EINVAL;
 }
 
-static void pre_sev_es_run(struct vcpu_svm *svm)
+void sev_es_unmap_ghcb(struct vcpu_svm *svm)
 {
        if (!svm->ghcb)
                return;
@@ -2234,9 +2233,6 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
        int asid = sev_get_asid(svm->vcpu.kvm);
 
-       /* Perform any SEV-ES pre-run actions */
-       pre_sev_es_run(svm);
-
        /* Assign the asid allocated with this SEV guest */
        svm->asid = asid;
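
(The SEV debug helpers above now take sparse-annotated pointers instead of casting an unsigned long at every call site. A minimal sketch of the pattern, with a hypothetical helper name:)

#include <linux/errno.h>
#include <linux/uaccess.h>

/* With a __user-annotated destination, sparse flags any direct dereference
 * or accidental mixing with kernel pointers, and callers no longer need the
 * (void __user *)(uintptr_t) casts at each copy_to_user() site. */
static int example_copy_dbg_to_user(void __user *dst, const void *src, int size)
{
	if (copy_to_user(dst, src, size))
		return -EFAULT;
	return 0;
}
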
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b649f92..dfa351e 100644
@@ -212,7 +212,7 @@ DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
  * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
  * defer the restoration of TSC_AUX until the CPU returns to userspace.
  */
-#define TSC_AUX_URET_SLOT      0
+static int tsc_aux_uret_slot __read_mostly = -1;
 
 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 
@@ -447,6 +447,11 @@ static int has_svm(void)
                return 0;
        }
 
+       if (pgtable_l5_enabled()) {
+               pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
+               return 0;
+       }
+
        return 1;
 }
 
@@ -959,8 +964,7 @@ static __init int svm_hardware_setup(void)
                kvm_tsc_scaling_ratio_frac_bits = 32;
        }
 
-       if (boot_cpu_has(X86_FEATURE_RDTSCP))
-               kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX);
+       tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
 
        /* Check for pause filtering support */
        if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
@@ -1100,7 +1104,9 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        return svm->vmcb->control.tsc_offset;
 }
 
-static void svm_check_invpcid(struct vcpu_svm *svm)
+/* Evaluate instruction intercepts that depend on guest CPUID features. */
+static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
+                                             struct vcpu_svm *svm)
 {
        /*
         * Intercept INVPCID if shadow paging is enabled to sync/free shadow
@@ -1113,6 +1119,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm)
                else
                        svm_clr_intercept(svm, INTERCEPT_INVPCID);
        }
+
+       if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
+               if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+                       svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+               else
+                       svm_set_intercept(svm, INTERCEPT_RDTSCP);
+       }
 }
 
 static void init_vmcb(struct kvm_vcpu *vcpu)
@@ -1235,8 +1248,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
        svm->current_vmcb->asid_generation = 0;
        svm->asid = 0;
 
-       svm->nested.vmcb12_gpa = 0;
-       svm->nested.last_vmcb12_gpa = 0;
+       svm->nested.vmcb12_gpa = INVALID_GPA;
+       svm->nested.last_vmcb12_gpa = INVALID_GPA;
        vcpu->arch.hflags = 0;
 
        if (!kvm_pause_in_guest(vcpu->kvm)) {
@@ -1248,7 +1261,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
                svm_clr_intercept(svm, INTERCEPT_PAUSE);
        }
 
-       svm_check_invpcid(svm);
+       svm_recalc_instruction_intercepts(vcpu, svm);
 
        /*
         * If the host supports V_SPEC_CTRL then disable the interception
@@ -1424,6 +1437,9 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
 
+       if (sev_es_guest(vcpu->kvm))
+               sev_es_unmap_ghcb(svm);
+
        if (svm->guest_state_loaded)
                return;
 
@@ -1445,8 +1461,8 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
                }
        }
 
-       if (static_cpu_has(X86_FEATURE_RDTSCP))
-               kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull);
+       if (likely(tsc_aux_uret_slot >= 0))
+               kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
 
        svm->guest_state_loaded = true;
 }
@@ -2655,11 +2671,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
                break;
        case MSR_TSC_AUX:
-               if (!boot_cpu_has(X86_FEATURE_RDTSCP))
-                       return 1;
-               if (!msr_info->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
-                       return 1;
                msr_info->data = svm->tsc_aux;
                break;
        /*
@@ -2876,30 +2887,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
                break;
        case MSR_TSC_AUX:
-               if (!boot_cpu_has(X86_FEATURE_RDTSCP))
-                       return 1;
-
-               if (!msr->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
-                       return 1;
-
-               /*
-                * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
-                * incomplete and conflicting architectural behavior.  Current
-                * AMD CPUs completely ignore bits 63:32, i.e. they aren't
-                * reserved and always read as zeros.  Emulate AMD CPU behavior
-                * to avoid explosions if the vCPU is migrated from an AMD host
-                * to an Intel host.
-                */
-               data = (u32)data;
-
                /*
                 * TSC_AUX is usually changed only during boot and never read
                 * directly.  Intercept TSC_AUX instead of exposing it to the
                 * guest via direct_access_msrs, and switch it via user return.
                 */
                preempt_disable();
-               r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
+               r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
                preempt_enable();
                if (r)
                        return 1;
@@ -3084,6 +3078,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [SVM_EXIT_STGI]                         = stgi_interception,
        [SVM_EXIT_CLGI]                         = clgi_interception,
        [SVM_EXIT_SKINIT]                       = skinit_interception,
+       [SVM_EXIT_RDTSCP]                       = kvm_handle_invalid_op,
        [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
        [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
        [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
@@ -3972,8 +3967,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
        svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
                             guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
 
-       /* Check again if INVPCID interception if required */
-       svm_check_invpcid(svm);
+       svm_recalc_instruction_intercepts(vcpu, svm);
 
        /* For sev guests, the memory encryption bit is not reserved in CR3.  */
        if (sev_guest(vcpu->kvm)) {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 84b3133..e44567c 100644
@@ -581,6 +581,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm);
 void sev_es_create_vcpu(struct vcpu_svm *svm);
 void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
 void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
+void sev_es_unmap_ghcb(struct vcpu_svm *svm);
 
 /* vmenter.S */
 
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index d1d7798..8dee8a5 100644
@@ -398,6 +398,9 @@ static inline u64 vmx_supported_debugctl(void)
 {
        u64 debugctl = 0;
 
+       if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+               debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+
        if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
                debugctl |= DEBUGCTLMSR_LBR_MASK;
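
(With BUS_LOCK_DETECT now reported in the supported DEBUGCTL bits above, a guest kernel can opt in by setting IA32_DEBUGCTL[2]; a detected bus lock then raises #DB with DR6 bit 11 (DR6_BUS_LOCK) reported in the active-low style handled elsewhere in this series. The sketch below is guest-side and illustrative only, assuming the DEBUGCTLMSR_BUS_LOCK_DETECT definition from msr-index.h.)

#include <asm/msr.h>
#include <asm/msr-index.h>   /* MSR_IA32_DEBUGCTLMSR, DEBUGCTLMSR_BUS_LOCK_DETECT */

/* Guest-side sketch: enable bus-lock detection via IA32_DEBUGCTL. */
static void example_enable_bus_lock_detect(void)
{
	u64 debugctl;

	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | DEBUGCTLMSR_BUS_LOCK_DETECT);
}
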
 
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index bced766..6058a65 100644
@@ -3098,15 +3098,8 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
                        nested_vmx_handle_enlightened_vmptrld(vcpu, false);
 
                if (evmptrld_status == EVMPTRLD_VMFAIL ||
-                   evmptrld_status == EVMPTRLD_ERROR) {
-                       pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
-                                            __func__);
-                       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-                       vcpu->run->internal.suberror =
-                               KVM_INTERNAL_ERROR_EMULATION;
-                       vcpu->run->internal.ndata = 0;
+                   evmptrld_status == EVMPTRLD_ERROR)
                        return false;
-               }
        }
 
        return true;
@@ -3194,8 +3187,16 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 
 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 {
-       if (!nested_get_evmcs_page(vcpu))
+       if (!nested_get_evmcs_page(vcpu)) {
+               pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
+                                    __func__);
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror =
+                       KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
+
                return false;
+       }
 
        if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
                return false;
@@ -4435,7 +4436,15 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        /* Similarly, triple faults in L2 should never escape. */
        WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
 
-       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+       if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+               /*
+                * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
+                * Enlightened VMCS after migration and we still need to
+                * do that when something is forcing L2->L1 exit prior to
+                * the first L2 run.
+                */
+               (void)nested_get_evmcs_page(vcpu);
+       }
 
        /* Service the TLB flush request for L2 before switching to L1. */
        if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d000cdd..4bceb5c 100644
@@ -455,21 +455,6 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
 
 static unsigned long host_idt_base;
 
-/*
- * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
- * will emulate SYSCALL in legacy mode if the vendor string in guest
- * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
- * support this emulation, IA32_STAR must always be included in
- * vmx_uret_msrs_list[], even in i386 builds.
- */
-static const u32 vmx_uret_msrs_list[] = {
-#ifdef CONFIG_X86_64
-       MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
-#endif
-       MSR_EFER, MSR_TSC_AUX, MSR_STAR,
-       MSR_IA32_TSX_CTRL,
-};
-
 #if IS_ENABLED(CONFIG_HYPERV)
 static bool __read_mostly enlightened_vmcs = true;
 module_param(enlightened_vmcs, bool, 0444);
@@ -697,21 +682,11 @@ static bool is_valid_passthrough_msr(u32 msr)
        return r;
 }
 
-static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
-{
-       int i;
-
-       for (i = 0; i < vmx->nr_uret_msrs; ++i)
-               if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
-                       return i;
-       return -1;
-}
-
 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
 {
        int i;
 
-       i = __vmx_find_uret_msr(vmx, msr);
+       i = kvm_find_user_return_msr(msr);
        if (i >= 0)
                return &vmx->guest_uret_msrs[i];
        return NULL;
@@ -720,13 +695,14 @@ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
                                  struct vmx_uret_msr *msr, u64 data)
 {
+       unsigned int slot = msr - vmx->guest_uret_msrs;
        int ret = 0;
 
        u64 old_msr_data = msr->data;
        msr->data = data;
-       if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
+       if (msr->load_into_hardware) {
                preempt_disable();
-               ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
+               ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
                preempt_enable();
                if (ret)
                        msr->data = old_msr_data;
@@ -1078,7 +1054,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
                return false;
        }
 
-       i = __vmx_find_uret_msr(vmx, MSR_EFER);
+       i = kvm_find_user_return_msr(MSR_EFER);
        if (i < 0)
                return false;
 
@@ -1240,11 +1216,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
         */
        if (!vmx->guest_uret_msrs_loaded) {
                vmx->guest_uret_msrs_loaded = true;
-               for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
-                       kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
+               for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+                       if (!vmx->guest_uret_msrs[i].load_into_hardware)
+                               continue;
+
+                       kvm_set_user_return_msr(i,
                                                vmx->guest_uret_msrs[i].data,
                                                vmx->guest_uret_msrs[i].mask);
-
+               }
        }
 
        if (vmx->nested.need_vmcs12_to_shadow_sync)
@@ -1751,19 +1730,16 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
        vmx_clear_hlt(vcpu);
 }
 
-static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
+                              bool load_into_hardware)
 {
-       struct vmx_uret_msr tmp;
-       int from, to;
+       struct vmx_uret_msr *uret_msr;
 
-       from = __vmx_find_uret_msr(vmx, msr);
-       if (from < 0)
+       uret_msr = vmx_find_uret_msr(vmx, msr);
+       if (!uret_msr)
                return;
-       to = vmx->nr_active_uret_msrs++;
 
-       tmp = vmx->guest_uret_msrs[to];
-       vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
-       vmx->guest_uret_msrs[from] = tmp;
+       uret_msr->load_into_hardware = load_into_hardware;
 }
 
 /*
@@ -1773,29 +1749,42 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
  */
 static void setup_msrs(struct vcpu_vmx *vmx)
 {
-       vmx->guest_uret_msrs_loaded = false;
-       vmx->nr_active_uret_msrs = 0;
 #ifdef CONFIG_X86_64
+       bool load_syscall_msrs;
+
        /*
         * The SYSCALL MSRs are only needed on long mode guests, and only
         * when EFER.SCE is set.
         */
-       if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
-               vmx_setup_uret_msr(vmx, MSR_STAR);
-               vmx_setup_uret_msr(vmx, MSR_LSTAR);
-               vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
-       }
+       load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
+                           (vmx->vcpu.arch.efer & EFER_SCE);
+
+       vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
+       vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
+       vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
 #endif
-       if (update_transition_efer(vmx))
-               vmx_setup_uret_msr(vmx, MSR_EFER);
+       vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
 
-       if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
-               vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
+       vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
+                          guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+                          guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
 
-       vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+       /*
+        * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
+        * kernel and old userspace.  If those guests run on a tsx=off host, do
+        * allow guests to use TSX_CTRL, but don't change the value in hardware
+        * so that TSX remains always disabled.
+        */
+       vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
 
        if (cpu_has_vmx_msr_bitmap())
                vmx_update_msr_bitmap(&vmx->vcpu);
+
+       /*
+        * The set of MSRs to load may have changed, reload MSRs before the
+        * next VM-Enter.
+        */
+       vmx->guest_uret_msrs_loaded = false;
 }
 
 static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -1993,11 +1982,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                else
                        msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
                break;
-       case MSR_TSC_AUX:
-               if (!msr_info->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
-                       return 1;
-               goto find_uret_msr;
        case MSR_IA32_DEBUGCTLMSR:
                msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
                break;
@@ -2031,6 +2015,9 @@ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
        if (!intel_pmu_lbr_is_enabled(vcpu))
                debugctl &= ~DEBUGCTLMSR_LBR_MASK;
 
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+               debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+
        return debugctl;
 }
 
@@ -2313,14 +2300,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                else
                        vmx->pt_desc.guest.addr_a[index / 2] = data;
                break;
-       case MSR_TSC_AUX:
-               if (!msr_info->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
-                       return 1;
-               /* Check reserved bit, higher 32 bits should be zero */
-               if ((data >> 32) != 0)
-                       return 1;
-               goto find_uret_msr;
        case MSR_IA32_PERF_CAPABILITIES:
                if (data && !vcpu_to_pmu(vcpu)->version)
                        return 1;
@@ -4369,7 +4348,23 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
                                                  xsaves_enabled, false);
        }
 
-       vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
+       /*
+        * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
+        * feature is exposed to the guest.  This creates a virtualization hole
+        * if both are supported in hardware but only one is exposed to the
+        * guest, but letting the guest execute RDTSCP or RDPID when either one
+        * is advertised is preferable to emulating the advertised instruction
+        * in KVM on #UD, and obviously better than incorrectly injecting #UD.
+        */
+       if (cpu_has_vmx_rdtscp()) {
+               bool rdpid_or_rdtscp_enabled =
+                       guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
+                       guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+
+               vmx_adjust_secondary_exec_control(vmx, &exec_control,
+                                                 SECONDARY_EXEC_ENABLE_RDTSCP,
+                                                 rdpid_or_rdtscp_enabled, false);
+       }
        vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
 
        vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@@ -6855,6 +6850,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 
 static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 {
+       struct vmx_uret_msr *tsx_ctrl;
        struct vcpu_vmx *vmx;
        int i, cpu, err;
 
@@ -6877,43 +6873,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
                        goto free_vpid;
        }
 
-       BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
-
-       for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
-               u32 index = vmx_uret_msrs_list[i];
-               u32 data_low, data_high;
-               int j = vmx->nr_uret_msrs;
-
-               if (rdmsr_safe(index, &data_low, &data_high) < 0)
-                       continue;
-               if (wrmsr_safe(index, data_low, data_high) < 0)
-                       continue;
-
-               vmx->guest_uret_msrs[j].slot = i;
-               vmx->guest_uret_msrs[j].data = 0;
-               switch (index) {
-               case MSR_IA32_TSX_CTRL:
-                       /*
-                        * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
-                        * interception.  Keep the host value unchanged to avoid
-                        * changing CPUID bits under the host kernel's feet.
-                        *
-                        * hle=0, rtm=0, tsx_ctrl=1 can be found with some
-                        * combinations of new kernel and old userspace.  If
-                        * those guests run on a tsx=off host, do allow guests
-                        * to use TSX_CTRL, but do not change the value on the
-                        * host so that TSX remains always disabled.
-                        */
-                       if (boot_cpu_has(X86_FEATURE_RTM))
-                               vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
-                       else
-                               vmx->guest_uret_msrs[j].mask = 0;
-                       break;
-               default:
-                       vmx->guest_uret_msrs[j].mask = -1ull;
-                       break;
-               }
-               ++vmx->nr_uret_msrs;
+       for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+               vmx->guest_uret_msrs[i].data = 0;
+               vmx->guest_uret_msrs[i].mask = -1ull;
+       }
+       if (boot_cpu_has(X86_FEATURE_RTM)) {
+               /*
+                * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
+                * Keep the host value unchanged to avoid changing CPUID bits
+                * under the host kernel's feet.
+                */
+               tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+               if (tsx_ctrl)
+                       tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
        }
 
        err = alloc_loaded_vmcs(&vmx->vmcs01);
@@ -7344,9 +7316,11 @@ static __init void vmx_set_cpu_caps(void)
        if (!cpu_has_vmx_xsaves())
                kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
 
-       /* CPUID 0x80000001 */
-       if (!cpu_has_vmx_rdtscp())
+       /* CPUID 0x80000001 and 0x7 (RDPID) */
+       if (!cpu_has_vmx_rdtscp()) {
                kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+               kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+       }
 
        if (cpu_has_vmx_waitpkg())
                kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
@@ -7402,8 +7376,9 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
        /*
         * RDPID causes #UD if disabled through secondary execution controls.
         * Because it is marked as EmulateOnUD, we need to intercept it here.
+        * Note, RDPID is hidden behind ENABLE_RDTSCP.
         */
-       case x86_intercept_rdtscp:
+       case x86_intercept_rdpid:
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
                        exception->vector = UD_VECTOR;
                        exception->error_code_valid = false;
@@ -7769,17 +7744,42 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
 };
 
+static __init void vmx_setup_user_return_msrs(void)
+{
+
+       /*
+        * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
+        * will emulate SYSCALL in legacy mode if the vendor string in guest
+        * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
+        * support this emulation, MSR_STAR is included in the list for i386,
+        * but is never loaded into hardware.  MSR_CSTAR is also never loaded
+        * into hardware and is here purely for emulation purposes.
+        */
+       const u32 vmx_uret_msrs_list[] = {
+       #ifdef CONFIG_X86_64
+               MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+       #endif
+               MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+               MSR_IA32_TSX_CTRL,
+       };
+       int i;
+
+       BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
+
+       for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+               kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
+}
+
 static __init int hardware_setup(void)
 {
        unsigned long host_bndcfgs;
        struct desc_ptr dt;
-       int r, i, ept_lpage_level;
+       int r, ept_lpage_level;
 
        store_idt(&dt);
        host_idt_base = dt.address;
 
-       for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
-               kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
+       vmx_setup_user_return_msrs();
 
        if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
                return -EIO;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 008cb87..16e4e45 100644
@@ -36,7 +36,7 @@ struct vmx_msrs {
 };
 
 struct vmx_uret_msr {
-       unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */
+       bool load_into_hardware;
        u64 data;
        u64 mask;
 };
@@ -245,8 +245,12 @@ struct vcpu_vmx {
        u32                   idt_vectoring_info;
        ulong                 rflags;
 
+       /*
+        * User return MSRs are always emulated when enabled in the guest, but
+        * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
+        * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to
+        * be loaded into hardware if those conditions aren't met.
+        */
        struct vmx_uret_msr   guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
-       int                   nr_uret_msrs;
-       int                   nr_active_uret_msrs;
        bool                  guest_uret_msrs_loaded;
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6eda283..9b6bca6 100644
@@ -184,11 +184,6 @@ module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
  */
 #define KVM_MAX_NR_USER_RETURN_MSRS 16
 
-struct kvm_user_return_msrs_global {
-       int nr;
-       u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
-};
-
 struct kvm_user_return_msrs {
        struct user_return_notifier urn;
        bool registered;
@@ -198,7 +193,9 @@ struct kvm_user_return_msrs {
        } values[KVM_MAX_NR_USER_RETURN_MSRS];
 };
 
-static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+u32 __read_mostly kvm_nr_uret_msrs;
+EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
+static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
 static struct kvm_user_return_msrs __percpu *user_return_msrs;
 
 #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
@@ -330,23 +327,53 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
                user_return_notifier_unregister(urn);
        }
        local_irq_restore(flags);
-       for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+       for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
                values = &msrs->values[slot];
                if (values->host != values->curr) {
-                       wrmsrl(user_return_msrs_global.msrs[slot], values->host);
+                       wrmsrl(kvm_uret_msrs_list[slot], values->host);
                        values->curr = values->host;
                }
        }
 }
 
-void kvm_define_user_return_msr(unsigned slot, u32 msr)
+static int kvm_probe_user_return_msr(u32 msr)
+{
+       u64 val;
+       int ret;
+
+       preempt_disable();
+       ret = rdmsrl_safe(msr, &val);
+       if (ret)
+               goto out;
+       ret = wrmsrl_safe(msr, val);
+out:
+       preempt_enable();
+       return ret;
+}
+
+int kvm_add_user_return_msr(u32 msr)
 {
-       BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
-       user_return_msrs_global.msrs[slot] = msr;
-       if (slot >= user_return_msrs_global.nr)
-               user_return_msrs_global.nr = slot + 1;
+       BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
+
+       if (kvm_probe_user_return_msr(msr))
+               return -1;
+
+       kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
+       return kvm_nr_uret_msrs++;
 }
-EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
+EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
+
+int kvm_find_user_return_msr(u32 msr)
+{
+       int i;
+
+       for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+               if (kvm_uret_msrs_list[i] == msr)
+                       return i;
+       }
+       return -1;
+}
+EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
 
 static void kvm_user_return_msr_cpu_online(void)
 {
@@ -355,8 +382,8 @@ static void kvm_user_return_msr_cpu_online(void)
        u64 value;
        int i;
 
-       for (i = 0; i < user_return_msrs_global.nr; ++i) {
-               rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+       for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+               rdmsrl_safe(kvm_uret_msrs_list[i], &value);
                msrs->values[i].host = value;
                msrs->values[i].curr = value;
        }
@@ -371,7 +398,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
        value = (value & mask) | (msrs->values[slot].host & ~mask);
        if (value == msrs->values[slot].curr)
                return 0;
-       err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
+       err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
        if (err)
                return 1;
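The new helpers turn the user-return MSR list into a common-x86 registry: a vendor module calls kvm_add_user_return_msr() once at init, which probes the MSR and returns its slot, and later resolves slots with kvm_find_user_return_msr() instead of caching indices. A self-contained sketch of those semantics, with the probe stubbed out and the main() usage purely illustrative:

#include <stdint.h>
#include <stdio.h>

#define KVM_MAX_NR_USER_RETURN_MSRS 16
#define MSR_TSC_AUX 0xc0000103u          /* architectural MSR number */

static uint32_t kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
static uint32_t kvm_nr_uret_msrs;

/* Append to the list; the real helper also RDMSR/WRMSR-probes the MSR first. */
static int kvm_add_user_return_msr(uint32_t msr)
{
        if (kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS)
                return -1;
        kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
        return kvm_nr_uret_msrs++;
}

/* Linear lookup, exactly as in the hunk above. */
static int kvm_find_user_return_msr(uint32_t msr)
{
        for (uint32_t i = 0; i < kvm_nr_uret_msrs; i++) {
                if (kvm_uret_msrs_list[i] == msr)
                        return (int)i;
        }
        return -1;
}

int main(void)
{
        /* A vendor module registers its MSRs once at module init... */
        int slot = kvm_add_user_return_msr(MSR_TSC_AUX);

        /* ...and looks the slot up later instead of caching a per-vCPU index. */
        printf("added slot %d, found slot %d\n", slot,
               kvm_find_user_return_msr(MSR_TSC_AUX));
        return 0;
}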
 
@@ -1149,6 +1176,9 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
 
        if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
                fixed |= DR6_RTM;
+
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+               fixed |= DR6_BUS_LOCK;
        return fixed;
 }
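Like DR6_RTM, DR6_BUS_LOCK reads as 1 unless bus-lock detection is exposed to the guest, so it joins the fixed-1 mask whenever the guest lacks X86_FEATURE_BUS_LOCK_DETECT. A standalone sketch of how the mask composes; the constants follow the architectural bit positions, and dr6_fixed() plus its boolean flags are illustrative stand-ins:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DR6_FIXED_1   0xfffe0ff0ULL       /* bits that always read as 1 */
#define DR6_RTM       (1ULL << 16)
#define DR6_BUS_LOCK  (1ULL << 11)

/* Illustrative stand-in for kvm_dr6_fixed(); the flags model guest CPUID. */
static uint64_t dr6_fixed(bool guest_has_rtm, bool guest_has_bus_lock_detect)
{
        uint64_t fixed = DR6_FIXED_1;

        if (!guest_has_rtm)
                fixed |= DR6_RTM;
        if (!guest_has_bus_lock_detect)
                fixed |= DR6_BUS_LOCK;
        return fixed;
}

int main(void)
{
        /* Guest without BUS_LOCK_DETECT: DR6[11] is forced to read as 1. */
        printf("fixed = %#llx\n", (unsigned long long)dr6_fixed(true, false));
        return 0;
}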
 
@@ -1615,6 +1645,30 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
                 * invokes 64-bit SYSENTER.
                 */
                data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
+               break;
+       case MSR_TSC_AUX:
+               if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+                       return 1;
+
+               if (!host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+                       return 1;
+
+               /*
+                * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
+                * incomplete and conflicting architectural behavior.  Current
+                * AMD CPUs completely ignore bits 63:32, i.e. they aren't
+                * reserved and always read as zeros.  Enforce Intel's reserved
+                * bits check if and only if the guest CPU is Intel, and clear
+                * the bits in all other cases.  This ensures cross-vendor
+                * migration will provide consistent behavior for the guest.
+                */
+               if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
+                       return 1;
+
+               data = (u32)data;
+               break;
        }
 
        msr.data = data;
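The write path enforces the cross-vendor split described in the comment: an Intel guest CPU model gets an error if bits 63:32 are set, anything else has the value truncated to 32 bits. A standalone sketch of just that check; tsc_aux_sanitize() and the intel_guest flag are illustrative names standing in for the code above and guest_cpuid_is_intel():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Mirrors the MSR_TSC_AUX case above: returns non-zero to reject the write,
 * otherwise *val holds the value that would be stored.
 */
static int tsc_aux_sanitize(bool intel_guest, uint64_t data, uint64_t *val)
{
        if (intel_guest && (data >> 32) != 0)
                return 1;                 /* bits 63:32 are reserved per Intel's SDM */

        *val = (uint32_t)data;            /* AMD ignores the upper bits; clear them */
        return 0;
}

int main(void)
{
        uint64_t val = 0;

        /* Rejected for an Intel guest CPU model, truncated for anything else. */
        printf("intel: %d\n", tsc_aux_sanitize(true, 0x100000001ULL, &val));
        printf("other: %d, val=%#llx\n",
               tsc_aux_sanitize(false, 0x100000001ULL, &val),
               (unsigned long long)val);
        return 0;
}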
@@ -1651,6 +1705,18 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
        if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
                return KVM_MSR_RET_FILTERED;
 
+       switch (index) {
+       case MSR_TSC_AUX:
+               if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+                       return 1;
+
+               if (!host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+                       return 1;
+               break;
+       }
+
        msr.index = index;
        msr.host_initiated = host_initiated;
 
@@ -5468,14 +5534,18 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
 static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
                              struct kvm_msr_filter_range *user_range)
 {
-       struct msr_bitmap_range range;
        unsigned long *bitmap = NULL;
        size_t bitmap_size;
-       int r;
 
        if (!user_range->nmsrs)
                return 0;
 
+       if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
+               return -EINVAL;
+
+       if (!user_range->flags)
+               return -EINVAL;
+
        bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
        if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
                return -EINVAL;
@@ -5484,31 +5554,15 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
        if (IS_ERR(bitmap))
                return PTR_ERR(bitmap);
 
-       range = (struct msr_bitmap_range) {
+       msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
                .flags = user_range->flags,
                .base = user_range->base,
                .nmsrs = user_range->nmsrs,
                .bitmap = bitmap,
        };
 
-       if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
-               r = -EINVAL;
-               goto err;
-       }
-
-       if (!range.flags) {
-               r = -EINVAL;
-               goto err;
-       }
-
-       /* Everything ok, add this range identifier. */
-       msr_filter->ranges[msr_filter->count] = range;
        msr_filter->count++;
-
        return 0;
-err:
-       kfree(bitmap);
-       return r;
 }
 
 static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
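With the checks reordered, a range is rejected before any allocation unless it has a non-zero nmsrs and names at least one of KVM_MSR_FILTER_READ/KVM_MSR_FILTER_WRITE. A hedged userspace sketch of one valid range; set_example_msr_filter(), the base MSR index, and the deny-writes policy are illustrative, while the structs and ioctl come from the x86 KVM uapi headers:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Deny writes to 64 MSRs starting at an illustrative base; everything else
 * falls back to the default-allow policy.  vm_fd comes from KVM_CREATE_VM.
 */
int set_example_msr_filter(int vm_fd)
{
        static __u8 bitmap[8];                    /* 64 bits, all zero: every write denied */
        struct kvm_msr_filter filter = {
                .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
        };

        filter.ranges[0] = (struct kvm_msr_filter_range) {
                .flags  = KVM_MSR_FILTER_WRITE,   /* must name READ and/or WRITE */
                .base   = 0xc0000080,             /* illustrative starting MSR index */
                .nmsrs  = 64,
                .bitmap = bitmap,
        };

        return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
}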
@@ -5937,7 +5991,8 @@ static void kvm_init_msr_list(void)
                                continue;
                        break;
                case MSR_TSC_AUX:
-                       if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+                       if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
+                           !kvm_cpu_cap_has(X86_FEATURE_RDPID))
                                continue;
                        break;
                case MSR_IA32_UMWAIT_CONTROL:
@@ -8039,6 +8094,18 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
 
 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
 
+/*
+ * Indirection to move queue_work() out of the tk_core.seq write held
+ * region to prevent possible deadlocks against time accessors which
+ * are invoked with work related locks held.
+ */
+static void pvclock_irq_work_fn(struct irq_work *w)
+{
+       queue_work(system_long_wq, &pvclock_gtod_work);
+}
+
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
+
 /*
  * Notification about pvclock gtod data update.
  */
@@ -8050,13 +8117,14 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
 
        update_pvclock_gtod(tk);
 
-       /* disable master clock if host does not trust, or does not
-        * use, TSC based clocksource.
+       /*
+        * Disable master clock if host does not trust, or does not use,
+        * TSC based clocksource. Delegate queue_work() to irq_work as
+        * this is invoked with tk_core.seq write held.
         */
        if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
            atomic_read(&kvm_guest_has_master_clock) != 0)
-               queue_work(system_long_wq, &pvclock_gtod_work);
-
+               irq_work_queue(&pvclock_irq_work);
        return 0;
 }
 
@@ -8118,6 +8186,7 @@ int kvm_arch_init(void *opaque)
                printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
                goto out_free_x86_emulator_cache;
        }
+       kvm_nr_uret_msrs = 0;
 
        r = kvm_mmu_module_init();
        if (r)
@@ -8168,6 +8237,8 @@ void kvm_arch_exit(void)
        cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
 #ifdef CONFIG_X86_64
        pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+       irq_work_sync(&pvclock_irq_work);
+       cancel_work_sync(&pvclock_gtod_work);
 #endif
        kvm_x86_ops.hardware_enable = NULL;
        kvm_mmu_module_exit();
index feaf464..3a9f203 100644 (file)
@@ -111,7 +111,7 @@ OPTIONS
 --tracepoints::
         retrieve statistics from tracepoints
 
-*z*::
+-z::
 --skip-zero-records::
         omit records with all zeros in logging mode
 
index aaf7bc7..7629819 100644 (file)
@@ -54,9 +54,9 @@ idt_handlers:
        .align 8
 
        /* Fetch current address and append it to idt_handlers. */
-       current_handler = .
+666 :
 .pushsection .rodata
-.quad current_handler
+       .quad 666b
 .popsection
 
        .if ! \has_error
index ca22ee6..63096ce 100644 (file)
 #include "vmx.h"
 
 #define VCPU_ID                5
+#define NMI_VECTOR     2
+
+static int ud_count;
+
+void enable_x2apic(void)
+{
+       uint32_t spiv_reg = APIC_BASE_MSR + (APIC_SPIV >> 4);
+
+       wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) |
+             MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD);
+       wrmsr(spiv_reg, rdmsr(spiv_reg) | APIC_SPIV_APIC_ENABLED);
+}
+
+static void guest_ud_handler(struct ex_regs *regs)
+{
+       ud_count++;
+       regs->rip += 3; /* VMLAUNCH */
+}
+
+static void guest_nmi_handler(struct ex_regs *regs)
+{
+}
 
 void l2_guest_code(void)
 {
@@ -25,15 +47,23 @@ void l2_guest_code(void)
 
        GUEST_SYNC(8);
 
+       /* Forced exit to L1 upon restore */
+       GUEST_SYNC(9);
+
        /* Done, exit to L1 and never come back.  */
        vmcall();
 }
 
-void l1_guest_code(struct vmx_pages *vmx_pages)
+void guest_code(struct vmx_pages *vmx_pages)
 {
 #define L2_GUEST_STACK_SIZE 64
        unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
 
+       enable_x2apic();
+
+       GUEST_SYNC(1);
+       GUEST_SYNC(2);
+
        enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist);
 
        GUEST_ASSERT(vmx_pages->vmcs_gpa);
@@ -55,27 +85,40 @@ void l1_guest_code(struct vmx_pages *vmx_pages)
        current_evmcs->revision_id = EVMCS_VERSION;
        GUEST_SYNC(6);
 
+       current_evmcs->pin_based_vm_exec_control |=
+               PIN_BASED_NMI_EXITING;
        GUEST_ASSERT(!vmlaunch());
        GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
-       GUEST_SYNC(9);
+
+       /*
+        * The NMI forces an L2->L1 exit; resuming L2 should find the eVMCS
+        * up-to-date, i.e. RIP points where it should and not at the
+        * beginning of l2_guest_code().  GUEST_SYNC(9) checks that.
+        */
        GUEST_ASSERT(!vmresume());
-       GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
        GUEST_SYNC(10);
+
+       GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+       GUEST_SYNC(11);
+
+       /* Try enlightened vmptrld with an incorrect GPA */
+       evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs);
+       GUEST_ASSERT(vmlaunch());
+       GUEST_ASSERT(ud_count == 1);
+       GUEST_DONE();
 }
 
-void guest_code(struct vmx_pages *vmx_pages)
+void inject_nmi(struct kvm_vm *vm)
 {
-       GUEST_SYNC(1);
-       GUEST_SYNC(2);
+       struct kvm_vcpu_events events;
 
-       if (vmx_pages)
-               l1_guest_code(vmx_pages);
+       vcpu_events_get(vm, VCPU_ID, &events);
 
-       GUEST_DONE();
+       events.nmi.pending = 1;
+       events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
 
-       /* Try enlightened vmptrld with an incorrect GPA */
-       evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs);
-       GUEST_ASSERT(vmlaunch());
+       vcpu_events_set(vm, VCPU_ID, &events);
 }
 
 int main(int argc, char *argv[])
@@ -109,6 +152,13 @@ int main(int argc, char *argv[])
        vcpu_alloc_vmx(vm, &vmx_pages_gva);
        vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
 
+       vm_init_descriptor_tables(vm);
+       vcpu_init_descriptor_tables(vm, VCPU_ID);
+       vm_handle_exception(vm, UD_VECTOR, guest_ud_handler);
+       vm_handle_exception(vm, NMI_VECTOR, guest_nmi_handler);
+
+       pr_info("Running L1 which uses EVMCS to run L2\n");
+
        for (stage = 1;; stage++) {
                _vcpu_run(vm, VCPU_ID);
                TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
@@ -124,7 +174,7 @@ int main(int argc, char *argv[])
                case UCALL_SYNC:
                        break;
                case UCALL_DONE:
-                       goto part1_done;
+                       goto done;
                default:
                        TEST_FAIL("Unknown ucall %lu", uc.cmd);
                }
@@ -154,12 +204,14 @@ int main(int argc, char *argv[])
                TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
                            "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
                            (ulong) regs2.rdi, (ulong) regs2.rsi);
-       }
 
-part1_done:
-       _vcpu_run(vm, VCPU_ID);
-       TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
-                   "Unexpected successful VMEnter with invalid eVMCS pointer!");
+               /* Force immediate L2->L1 exit before resuming */
+               if (stage == 8) {
+                       pr_info("Injecting NMI into L1 before L2 had a chance to run after restore\n");
+                       inject_nmi(vm);
+               }
+       }
 
+done:
        kvm_vm_free(vm);
 }
index 2799c66..6b4feb9 100644 (file)
@@ -2893,8 +2893,8 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
        if (val < grow_start)
                val = grow_start;
 
-       if (val > halt_poll_ns)
-               val = halt_poll_ns;
+       if (val > vcpu->kvm->max_halt_poll_ns)
+               val = vcpu->kvm->max_halt_poll_ns;
 
        vcpu->halt_poll_ns = val;
 out:
@@ -2973,7 +2973,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
                                goto out;
                        }
                        poll_end = cur = ktime_get();
-               } while (single_task_running() && ktime_before(cur, stop));
+               } while (single_task_running() && !need_resched() &&
+                        ktime_before(cur, stop));
        }
 
        prepare_to_rcuwait(&vcpu->wait);
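Growth of a vCPU's halt-polling window is now clamped to the per-VM kvm->max_halt_poll_ns instead of the module-wide halt_poll_ns, so a VM whose cap was lowered through KVM_CAP_HALT_POLL can no longer grow past it. A standalone sketch of the grow-and-clamp arithmetic; grow_halt_poll_ns() here is a simplified stand-in and the parameter values are only illustrative defaults:

#include <stdio.h>

/* Simplified stand-ins for the module parameters. */
static unsigned int halt_poll_ns_grow = 2;
static unsigned int halt_poll_ns_grow_start = 10000;

/* Grow the per-vCPU window, clamped to the per-VM cap rather than the module default. */
static unsigned int grow_halt_poll_ns(unsigned int cur, unsigned int vm_max_halt_poll_ns)
{
        unsigned int val = cur * halt_poll_ns_grow;

        if (val < halt_poll_ns_grow_start)
                val = halt_poll_ns_grow_start;
        if (val > vm_max_halt_poll_ns)
                val = vm_max_halt_poll_ns;
        return val;
}

int main(void)
{
        unsigned int ns = 0;

        /* With kvm->max_halt_poll_ns == 20000 the window stops growing at the cap. */
        for (int i = 0; i < 4; i++) {
                ns = grow_halt_poll_ns(ns, 20000);
                printf("halt_poll_ns = %u\n", ns);
        }
        return 0;
}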