Merge branch 'kvm-5.20-early'
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 158b2e1..e8177d8 100644
 
 #define MAX_IO_MSRS 256
 #define KVM_MAX_MCE_BANKS 32
-u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
-EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
+
+struct kvm_caps kvm_caps __read_mostly = {
+       .supported_mce_cap = MCG_CTL_P | MCG_SER_P,
+};
+EXPORT_SYMBOL_GPL(kvm_caps);
 
 #define  ERR_PTR_USR(e)  ((void __user *)ERR_PTR(e))
 
@@ -151,19 +154,6 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
 static bool __read_mostly kvmclock_periodic_sync = true;
 module_param(kvmclock_periodic_sync, bool, S_IRUGO);
 
-bool __read_mostly kvm_has_tsc_control;
-EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
-u32  __read_mostly kvm_max_guest_tsc_khz;
-EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
-u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
-EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
-u64  __read_mostly kvm_max_tsc_scaling_ratio;
-EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-u64 __read_mostly kvm_default_tsc_scaling_ratio;
-EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
-bool __read_mostly kvm_has_bus_lock_exit;
-EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit);
-
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
@@ -235,8 +225,6 @@ EXPORT_SYMBOL_GPL(enable_apicv);
 
 u64 __read_mostly host_xss;
 EXPORT_SYMBOL_GPL(host_xss);
-u64 __read_mostly supported_xss;
-EXPORT_SYMBOL_GPL(supported_xss);
 
 const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
        KVM_GENERIC_VM_STATS(),
@@ -298,7 +286,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, directed_yield_successful),
        STATS_DESC_COUNTER(VCPU, preemption_reported),
        STATS_DESC_COUNTER(VCPU, preemption_other),
-       STATS_DESC_ICOUNTER(VCPU, guest_mode)
+       STATS_DESC_ICOUNTER(VCPU, guest_mode),
+       STATS_DESC_COUNTER(VCPU, notify_window_exits),
 };
 
 const struct kvm_stats_header kvm_vcpu_stats_header = {
@@ -311,8 +300,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
 };
 
 u64 __read_mostly host_xcr0;
-u64 __read_mostly supported_xcr0;
-EXPORT_SYMBOL_GPL(supported_xcr0);
 
 static struct kmem_cache *x86_emulator_cache;
 
@@ -1450,6 +1437,7 @@ static const u32 msrs_to_save_all[] = {
        MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
        MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
        MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
+       MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
 
        MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
        MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
@@ -2346,12 +2334,12 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 
        /* Guest TSC same frequency as host TSC? */
        if (!scale) {
-               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
+               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
                return 0;
        }
 
        /* TSC scaling supported? */
-       if (!kvm_has_tsc_control) {
+       if (!kvm_caps.has_tsc_control) {
                if (user_tsc_khz > tsc_khz) {
                        vcpu->arch.tsc_catchup = 1;
                        vcpu->arch.tsc_always_catchup = 1;
@@ -2363,10 +2351,10 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
        }
 
        /* TSC scaling required  - calculate ratio */
-       ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
+       ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits,
                                user_tsc_khz, tsc_khz);
 
-       if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
+       if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) {
                pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
                                    user_tsc_khz);
                return -1;
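
A worked instance of the fixed-point division above (standalone sketch;
48 fractional bits assumed, the VMX value, whereas SVM uses 32):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const unsigned frac_bits = 48;         /* tsc_scaling_ratio_frac_bits */
        const uint64_t tsc_khz = 1000000;      /* host TSC: 1.0 GHz */
        const uint64_t user_tsc_khz = 1500000; /* guest wants 1.5 GHz */

        /* equivalent of mul_u64_u32_div(1ULL << frac_bits, user_tsc_khz, tsc_khz) */
        uint64_t ratio = (uint64_t)(((unsigned __int128)1 << frac_bits) *
                                    user_tsc_khz / tsc_khz);

        /* prints 0x1800000000000, i.e. 1.5 in 48-bit fixed point */
        printf("ratio = %#llx\n", (unsigned long long)ratio);
        return 0;
}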
@@ -2384,7 +2372,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
        /* tsc_khz can be zero if TSC calibration fails */
        if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
-               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
+               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
                return -1;
        }
 
@@ -2461,18 +2449,18 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
  * (frac) represent the fractional part, ie. ratio represents a fixed
  * point number (mult + frac * 2^(-N)).
  *
- * N equals to kvm_tsc_scaling_ratio_frac_bits.
+ * N equals kvm_caps.tsc_scaling_ratio_frac_bits.
  */
 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
 {
-       return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
+       return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits);
 }
 
 u64 kvm_scale_tsc(u64 tsc, u64 ratio)
 {
        u64 _tsc = tsc;
 
-       if (ratio != kvm_default_tsc_scaling_ratio)
+       if (ratio != kvm_caps.default_tsc_scaling_ratio)
                _tsc = __scale_tsc(ratio, tsc);
 
        return _tsc;
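
Concretely, with the 1.5 ratio from the example above, __scale_tsc(ratio, t)
computes (t * (3 << 47)) >> 48, so 1,000,000 host cycles scale to 1,500,000
guest cycles; kvm_scale_tsc() skips the 128-bit multiply entirely whenever
the ratio is the identity value 1 << 48.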
@@ -2499,11 +2487,11 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
 {
        u64 nested_offset;
 
-       if (l2_multiplier == kvm_default_tsc_scaling_ratio)
+       if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio)
                nested_offset = l1_offset;
        else
                nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
-                                               kvm_tsc_scaling_ratio_frac_bits);
+                                               kvm_caps.tsc_scaling_ratio_frac_bits);
 
        nested_offset += l2_offset;
        return nested_offset;
@@ -2512,9 +2500,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
 
 u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
 {
-       if (l2_multiplier != kvm_default_tsc_scaling_ratio)
+       if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio)
                return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
-                                      kvm_tsc_scaling_ratio_frac_bits);
+                                      kvm_caps.tsc_scaling_ratio_frac_bits);
 
        return l1_multiplier;
 }
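
These nested helpers compose two fixed-point numbers the same way: the
effective L2 multiplier is (l1_multiplier * l2_multiplier) >> frac_bits, so
an L1 scaled to 1.5x the host rate that grants its L2 a 2.0x multiplier
yields ((3 << 47) * (2 << 48)) >> 48 = 3 << 48, i.e. 3.0x, and the identity
multiplier again short-circuits the math.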
@@ -2556,7 +2544,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli
        else
                vcpu->arch.tsc_scaling_ratio = l1_multiplier;
 
-       if (kvm_has_tsc_control)
+       if (kvm_caps.has_tsc_control)
                static_call(kvm_x86_write_tsc_multiplier)(
                        vcpu, vcpu->arch.tsc_scaling_ratio);
 }
@@ -2692,7 +2680,7 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
 
 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
 {
-       if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+       if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
                WARN_ON(adjustment < 0);
        adjustment = kvm_scale_tsc((u64) adjustment,
                                   vcpu->arch.l1_tsc_scaling_ratio);
@@ -3105,7 +3093,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
        /* With all the info we got, fill in the values */
 
-       if (kvm_has_tsc_control)
+       if (kvm_caps.has_tsc_control)
                tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
                                            v->arch.l1_tsc_scaling_ratio);
 
@@ -3236,10 +3224,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        /* only 0 or all 1s can be written to IA32_MCi_CTL
                         * some Linux kernels though clear bit 10 in bank 4 to
                         * workaround a BIOS/GART TBL issue on AMD K8s, ignore
-                        * this to avoid an uncatched #GP in the guest
+                        * this to avoid an uncaught #GP in the guest.
+                        *
+                        * UnixWare clears bit 0 of MC1_CTL to ignore
+                        * correctable, single-bit ECC data errors.
                         */
                        if ((offset & 0x3) == 0 &&
-                           data != 0 && (data | (1 << 10)) != ~(u64)0)
+                           data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
                                return -1;
 
                        /* MCi_STATUS */
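
The widened check admits a write of 0, or of all ones once the two quirk
bits are forgiven; a standalone predicate making the rule explicit (helper
name illustrative):

#include <stdbool.h>
#include <stdint.h>

/*
 * Bit 10 is Linux's AMD K8 GART TLB workaround; bit 0 is UnixWare's
 * single-bit ECC workaround. Both are ORed in before testing for all 1s.
 */
static bool mci_ctl_write_valid(uint64_t data)
{
        return data == 0 || (data | (1 << 10) | 1) == ~(uint64_t)0;
}

So mci_ctl_write_valid(~0ull ^ 1) is true, while mci_ctl_write_valid(1)
is not.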
@@ -3557,9 +3548,25 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        vcpu->arch.ia32_tsc_adjust_msr = data;
                }
                break;
-       case MSR_IA32_MISC_ENABLE:
+       case MSR_IA32_MISC_ENABLE: {
+               u64 old_val = vcpu->arch.ia32_misc_enable_msr;
+               u64 pmu_mask = MSR_IA32_MISC_ENABLE_PMU_RO_MASK |
+                       MSR_IA32_MISC_ENABLE_EMON;
+
+               /* RO bits */
+               if (!msr_info->host_initiated &&
+                   ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK))
+                       return 1;
+
+               /*
+                * A naive userspace may set the vPMU capabilities and
+                * initialize MSR_IA32_MISC_ENABLE in an arbitrary order, so
+                * keep the vPMU bits unchanged here to avoid inconsistent
+                * behavior.
+                */
+               data &= ~pmu_mask;
+               data |= old_val & pmu_mask;
                if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
-                   ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
+                   ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
                        if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
                                return 1;
                        vcpu->arch.ia32_misc_enable_msr = data;
@@ -3568,6 +3575,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        vcpu->arch.ia32_misc_enable_msr = data;
                }
                break;
+       }
        case MSR_IA32_SMBASE:
                if (!msr_info->host_initiated)
                        return 1;
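
The mask juggling above is the standard preserve-bits idiom, isolated here
as a sketch (helper name illustrative):

#include <stdint.h>

/* Bits covered by mask keep their old value; all others take the new one. */
static uint64_t merge_masked_bits(uint64_t old_val, uint64_t data, uint64_t mask)
{
        return (data & ~mask) | (old_val & mask);
}

With mask = MSR_IA32_MISC_ENABLE_PMU_RO_MASK | MSR_IA32_MISC_ENABLE_EMON, a
write can never flip the vPMU bits, and guest attempts to change the RO
subset were already rejected by the check above.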
@@ -3594,7 +3602,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
                 * XSAVES/XRSTORS to save/restore PT MSRs.
                 */
-               if (data & ~supported_xss)
+               if (data & ~kvm_caps.supported_xss)
                        return 1;
                vcpu->arch.ia32_xss = data;
                kvm_update_cpuid_runtime(vcpu);
@@ -3700,7 +3708,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                fallthrough;
        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
        case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
-               if (kvm_pmu_is_valid_msr(vcpu, msr))
+               if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated))
                        return kvm_pmu_set_msr(vcpu, msr_info);
 
                if (pr || data != 0)
@@ -3783,7 +3791,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
 #endif
        default:
-               if (kvm_pmu_is_valid_msr(vcpu, msr))
+               if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated))
                        return kvm_pmu_set_msr(vcpu, msr_info);
                return KVM_MSR_RET_INVALID;
        }
@@ -3863,7 +3871,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = 0;
                break;
        case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
-               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated))
                        return kvm_pmu_get_msr(vcpu, msr_info);
                if (!msr_info->host_initiated)
                        return 1;
@@ -3873,7 +3881,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
        case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
-               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated))
                        return kvm_pmu_get_msr(vcpu, msr_info);
                msr_info->data = 0;
                break;
@@ -4119,7 +4127,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
 #endif
        default:
-               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated))
                        return kvm_pmu_get_msr(vcpu, msr_info);
                return KVM_MSR_RET_INVALID;
        }
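
Every kvm_pmu_is_valid_msr() call in this file now passes
msr_info->host_initiated, which implies the prototype below. Plausibly (the
PMU side is outside this diff) the flag lets host-initiated accesses reach
MSRs such as the PEBS/DS set added to msrs_to_save_all[] above even when the
guest's CPUID would hide them:

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated);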
@@ -4277,6 +4285,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_GET_MSR_FEATURES:
        case KVM_CAP_MSR_PLATFORM_INFO:
        case KVM_CAP_EXCEPTION_PAYLOAD:
+       case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
        case KVM_CAP_SET_GUEST_DEBUG:
        case KVM_CAP_LAST_CPU:
        case KVM_CAP_X86_USER_SPACE_MSR:
@@ -4354,7 +4363,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                break;
        case KVM_CAP_TSC_CONTROL:
        case KVM_CAP_VM_TSC_CONTROL:
-               r = kvm_has_tsc_control;
+               r = kvm_caps.has_tsc_control;
                break;
        case KVM_CAP_X2APIC_API:
                r = KVM_X2APIC_API_VALID_FLAGS;
@@ -4376,7 +4385,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = sched_info_on();
                break;
        case KVM_CAP_X86_BUS_LOCK_EXIT:
-               if (kvm_has_bus_lock_exit)
+               if (kvm_caps.has_bus_lock_exit)
                        r = KVM_BUS_LOCK_DETECTION_OFF |
                            KVM_BUS_LOCK_DETECTION_EXIT;
                else
@@ -4385,7 +4394,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_XSAVE2: {
                u64 guest_perm = xstate_get_guest_group_perm();
 
-               r = xstate_required_size(supported_xcr0 & guest_perm, false);
+               r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false);
                if (r < sizeof(struct kvm_xsave))
                        r = sizeof(struct kvm_xsave);
                break;
@@ -4396,6 +4405,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_DISABLE_QUIRKS2:
                r = KVM_X86_VALID_QUIRKS;
                break;
+       case KVM_CAP_X86_NOTIFY_VMEXIT:
+               r = kvm_caps.has_notify_vmexit;
+               break;
        default:
                break;
        }
@@ -4423,7 +4435,7 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
 
        switch (attr->attr) {
        case KVM_X86_XCOMP_GUEST_SUPP:
-               if (put_user(supported_xcr0, uaddr))
+               if (put_user(kvm_caps.supported_xcr0, uaddr))
                        return -EFAULT;
                return 0;
        default:
@@ -4500,8 +4512,8 @@ long kvm_arch_dev_ioctl(struct file *filp,
        }
        case KVM_X86_GET_MCE_CAP_SUPPORTED:
                r = -EFAULT;
-               if (copy_to_user(argp, &kvm_mce_cap_supported,
-                                sizeof(kvm_mce_cap_supported)))
+               if (copy_to_user(argp, &kvm_caps.supported_mce_cap,
+                                sizeof(kvm_caps.supported_mce_cap)))
                        goto out;
                r = 0;
                break;
@@ -4800,7 +4812,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
        r = -EINVAL;
        if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
                goto out;
-       if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
+       if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000))
                goto out;
        r = 0;
        vcpu->arch.mcg_cap = mcg_cap;
@@ -4938,6 +4950,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                         | KVM_VCPUEVENT_VALID_SMM);
        if (vcpu->kvm->arch.exception_payload_enabled)
                events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+       if (vcpu->kvm->arch.triple_fault_event) {
+               events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
+       }
 
        memset(&events->reserved, 0, sizeof(events->reserved));
 }
@@ -4951,7 +4967,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                              | KVM_VCPUEVENT_VALID_SIPI_VECTOR
                              | KVM_VCPUEVENT_VALID_SHADOW
                              | KVM_VCPUEVENT_VALID_SMM
-                             | KVM_VCPUEVENT_VALID_PAYLOAD))
+                             | KVM_VCPUEVENT_VALID_PAYLOAD
+                             | KVM_VCPUEVENT_VALID_TRIPLE_FAULT))
                return -EINVAL;
 
        if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
@@ -5024,6 +5041,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                }
        }
 
+       if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
+               if (!vcpu->kvm->arch.triple_fault_event)
+                       return -EINVAL;
+               if (events->triple_fault.pending)
+                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               else
+                       kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+       }
+
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 
        return 0;
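
A userspace sketch of the new plumbing (the helper name and the
vm_fd/vcpu_fd descriptors are illustrative; error handling omitted): enable
the cap once per VM, after which a pending triple fault survives a
save/restore cycle through KVM_GET/SET_VCPU_EVENTS:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static void migrate_triple_fault(int vm_fd, int src_vcpu, int dst_vcpu)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_X86_TRIPLE_FAULT_EVENT,
                .args = { 1 },
        };
        struct kvm_vcpu_events events;

        ioctl(vm_fd, KVM_ENABLE_CAP, &cap);

        /*
         * Source side: flags gains KVM_VCPUEVENT_VALID_TRIPLE_FAULT and
         * triple_fault.pending mirrors KVM_REQ_TRIPLE_FAULT.
         */
        ioctl(src_vcpu, KVM_GET_VCPU_EVENTS, &events);

        /* Destination side: re-queue the triple fault exactly as saved. */
        ioctl(dst_vcpu, KVM_SET_VCPU_EVENTS, &events);
}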
@@ -5092,7 +5118,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 
        return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
                                              guest_xsave->region,
-                                             supported_xcr0, &vcpu->arch.pkru);
+                                             kvm_caps.supported_xcr0,
+                                             &vcpu->arch.pkru);
 }
 
 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
@@ -5597,8 +5624,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = -EINVAL;
                user_tsc_khz = (u32)arg;
 
-               if (kvm_has_tsc_control &&
-                   user_tsc_khz >= kvm_max_guest_tsc_khz)
+               if (kvm_caps.has_tsc_control &&
+                   user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
                        goto out;
 
                if (user_tsc_khz == 0)
@@ -6025,6 +6052,10 @@ split_irqchip_unlock:
                kvm->arch.exception_payload_enabled = cap->args[0];
                r = 0;
                break;
+       case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
+               kvm->arch.triple_fault_event = cap->args[0];
+               r = 0;
+               break;
        case KVM_CAP_X86_USER_SPACE_MSR:
                kvm->arch.user_space_msr_mask = cap->args[0];
                r = 0;
@@ -6038,7 +6069,7 @@ split_irqchip_unlock:
                    (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
                        break;
 
-               if (kvm_has_bus_lock_exit &&
+               if (kvm_caps.has_bus_lock_exit &&
                    cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
                        kvm->arch.bus_lock_detection_enabled = true;
                r = 0;
@@ -6101,6 +6132,36 @@ split_irqchip_unlock:
                }
                mutex_unlock(&kvm->lock);
                break;
+       case KVM_CAP_MAX_VCPU_ID:
+               r = -EINVAL;
+               if (cap->args[0] > KVM_MAX_VCPU_IDS)
+                       break;
+
+               mutex_lock(&kvm->lock);
+               if (kvm->arch.max_vcpu_ids == cap->args[0]) {
+                       r = 0;
+               } else if (!kvm->arch.max_vcpu_ids) {
+                       kvm->arch.max_vcpu_ids = cap->args[0];
+                       r = 0;
+               }
+               mutex_unlock(&kvm->lock);
+               break;
+       case KVM_CAP_X86_NOTIFY_VMEXIT:
+               r = -EINVAL;
+               if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
+                       break;
+               if (!kvm_caps.has_notify_vmexit)
+                       break;
+               if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
+                       break;
+               mutex_lock(&kvm->lock);
+               if (!kvm->created_vcpus) {
+                       kvm->arch.notify_window = cap->args[0] >> 32;
+                       kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
+                       r = 0;
+               }
+               mutex_unlock(&kvm->lock);
+               break;
        default:
                r = -EINVAL;
                break;
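
A matching userspace sketch (helper name illustrative): KVM_CHECK_EXTENSION
reports kvm_caps.has_notify_vmexit, and per the code above the cap must be
enabled before the first vCPU exists, with the notify window in the high 32
bits of args[0] and the flag bits in the low 32:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_notify_vmexit(int vm_fd, uint32_t window_cycles)
{
        struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_NOTIFY_VMEXIT };

        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_NOTIFY_VMEXIT) <= 0)
                return -1;      /* host lacks the feature */

        cap.args[0] = ((uint64_t)window_cycles << 32) |
                      KVM_X86_NOTIFY_VMEXIT_ENABLED;
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}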
@@ -6573,8 +6634,8 @@ set_pit2_out:
                r = -EINVAL;
                user_tsc_khz = (u32)arg;
 
-               if (kvm_has_tsc_control &&
-                   user_tsc_khz >= kvm_max_guest_tsc_khz)
+               if (kvm_caps.has_tsc_control &&
+                   user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
                        goto out;
 
                if (user_tsc_khz == 0)
@@ -6649,15 +6710,12 @@ out:
 
 static void kvm_init_msr_list(void)
 {
-       struct x86_pmu_capability x86_pmu;
        u32 dummy[2];
        unsigned i;
 
        BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
                         "Please update the fixed PMCs in msrs_to_save_all[]");
 
-       perf_get_x86_pmu_capability(&x86_pmu);
-
        num_msrs_to_save = 0;
        num_emulated_msrs = 0;
        num_msr_based_features = 0;
@@ -6709,12 +6767,12 @@ static void kvm_init_msr_list(void)
                        break;
                case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
                        if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
-                           min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+                           min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
                                continue;
                        break;
                case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
                        if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
-                           min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+                           min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
                                continue;
                        break;
                case MSR_IA32_XFD:
@@ -8740,7 +8798,7 @@ static void kvm_hyperv_tsc_notifier(void)
        /* TSC frequency always matches when on Hyper-V */
        for_each_present_cpu(cpu)
                per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
-       kvm_max_guest_tsc_khz = tsc_khz;
+       kvm_caps.max_guest_tsc_khz = tsc_khz;
 
        list_for_each_entry(kvm, &vm_list, vm_list) {
                __kvm_start_pvclock_update(kvm);
@@ -9002,7 +9060,7 @@ int kvm_arch_init(void *opaque)
 
        if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-               supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+               kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
        }
 
        if (pi_inject_timer == -1)
@@ -9422,6 +9480,11 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)
 
 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
 {
+       trace_kvm_inj_exception(vcpu->arch.exception.nr,
+                               vcpu->arch.exception.has_error_code,
+                               vcpu->arch.exception.error_code,
+                               vcpu->arch.exception.injected);
+
        if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
                vcpu->arch.exception.error_code = false;
        static_call(kvm_x86_queue_exception)(vcpu);
@@ -9457,7 +9520,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
                        static_call(kvm_x86_inject_nmi)(vcpu);
                        can_inject = false;
                } else if (vcpu->arch.interrupt.injected) {
-                       static_call(kvm_x86_inject_irq)(vcpu);
+                       static_call(kvm_x86_inject_irq)(vcpu, true);
                        can_inject = false;
                }
        }
@@ -9479,13 +9542,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 
        /* try to inject new event if pending */
        if (vcpu->arch.exception.pending) {
-               trace_kvm_inj_exception(vcpu->arch.exception.nr,
-                                       vcpu->arch.exception.has_error_code,
-                                       vcpu->arch.exception.error_code);
-
-               vcpu->arch.exception.pending = false;
-               vcpu->arch.exception.injected = true;
-
                if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
                        __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
                                             X86_EFLAGS_RF);
@@ -9499,6 +9555,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
                }
 
                kvm_inject_exception(vcpu);
+
+               vcpu->arch.exception.pending = false;
+               vcpu->arch.exception.injected = true;
+
                can_inject = false;
        }
 
@@ -9551,7 +9611,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
                        goto out;
                if (r) {
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
-                       static_call(kvm_x86_inject_irq)(vcpu);
+                       static_call(kvm_x86_inject_irq)(vcpu, false);
                        WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
                }
                if (kvm_cpu_has_injectable_intr(vcpu))
@@ -11263,11 +11323,17 @@ static int sync_regs(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
 {
-       if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+       if (kvm_check_tsc_unstable() && kvm->created_vcpus)
                pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
                             "guest TSC will not be reliable\n");
 
-       return 0;
+       if (!kvm->arch.max_vcpu_ids)
+               kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
+
+       if (id >= kvm->arch.max_vcpu_ids)
+               return -EINVAL;
+
+       return static_call(kvm_x86_vcpu_precreate)(kvm);
 }
 
 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
@@ -11704,6 +11770,8 @@ int kvm_arch_hardware_setup(void *opaque)
        if (boot_cpu_has(X86_FEATURE_XSAVES))
                rdmsrl(MSR_IA32_XSS, host_xss);
 
+       kvm_init_pmu_capability();
+
        r = ops->hardware_setup();
        if (r != 0)
                return r;
@@ -11713,13 +11781,13 @@ int kvm_arch_hardware_setup(void *opaque)
        kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
 
        if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
-               supported_xss = 0;
+               kvm_caps.supported_xss = 0;
 
 #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
        cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
 #undef __kvm_cpu_cap_has
 
-       if (kvm_has_tsc_control) {
+       if (kvm_caps.has_tsc_control) {
                /*
                 * Make sure the user can only configure tsc_khz values that
                 * fit into a signed integer.
@@ -11727,10 +11795,10 @@ int kvm_arch_hardware_setup(void *opaque)
                 * be 1 on all machines.
                 */
                u64 max = min(0x7fffffffULL,
-                             __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
-               kvm_max_guest_tsc_khz = max;
+                             __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+               kvm_caps.max_guest_tsc_khz = max;
        }
-       kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
+       kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
        kvm_init_msr_list();
        return 0;
 }