Merge branch 'kvm-5.20-early'
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 158b2e1..e8177d8 100644
 
 #define MAX_IO_MSRS 256
 #define KVM_MAX_MCE_BANKS 32
-u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
-EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
+
+struct kvm_caps kvm_caps __read_mostly = {
+       .supported_mce_cap = MCG_CTL_P | MCG_SER_P,
+};
+EXPORT_SYMBOL_GPL(kvm_caps);
 
 #define  ERR_PTR_USR(e)  ((void __user *)ERR_PTR(e))
 
@@ -151,19 +154,6 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
 static bool __read_mostly kvmclock_periodic_sync = true;
 module_param(kvmclock_periodic_sync, bool, S_IRUGO);
 
-bool __read_mostly kvm_has_tsc_control;
-EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
-u32  __read_mostly kvm_max_guest_tsc_khz;
-EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
-u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
-EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
-u64  __read_mostly kvm_max_tsc_scaling_ratio;
-EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-u64 __read_mostly kvm_default_tsc_scaling_ratio;
-EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
-bool __read_mostly kvm_has_bus_lock_exit;
-EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit);
-
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
@@ -235,8 +225,6 @@ EXPORT_SYMBOL_GPL(enable_apicv);
 
 u64 __read_mostly host_xss;
 EXPORT_SYMBOL_GPL(host_xss);
-u64 __read_mostly supported_xss;
-EXPORT_SYMBOL_GPL(supported_xss);
 
 const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
        KVM_GENERIC_VM_STATS(),
@@ -298,7 +286,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, directed_yield_successful),
        STATS_DESC_COUNTER(VCPU, preemption_reported),
        STATS_DESC_COUNTER(VCPU, preemption_other),
-       STATS_DESC_ICOUNTER(VCPU, guest_mode)
+       STATS_DESC_ICOUNTER(VCPU, guest_mode),
+       STATS_DESC_COUNTER(VCPU, notify_window_exits),
 };
 
 const struct kvm_stats_header kvm_vcpu_stats_header = {
@@ -311,8 +300,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
 };
 
 u64 __read_mostly host_xcr0;
-u64 __read_mostly supported_xcr0;
-EXPORT_SYMBOL_GPL(supported_xcr0);
 
 static struct kmem_cache *x86_emulator_cache;
 
@@ -1450,6 +1437,7 @@ static const u32 msrs_to_save_all[] = {
        MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
        MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
        MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
+       MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
 
        MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
        MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
@@ -2346,12 +2334,12 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 
        /* Guest TSC same frequency as host TSC? */
        if (!scale) {
-               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
+               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
                return 0;
        }
 
        /* TSC scaling supported? */
-       if (!kvm_has_tsc_control) {
+       if (!kvm_caps.has_tsc_control) {
                if (user_tsc_khz > tsc_khz) {
                        vcpu->arch.tsc_catchup = 1;
                        vcpu->arch.tsc_always_catchup = 1;
@@ -2363,10 +2351,10 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
        }
 
        /* TSC scaling required  - calculate ratio */
-       ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
+       ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits,
                                user_tsc_khz, tsc_khz);
 
-       if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
+       if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) {
                pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
                                    user_tsc_khz);
                return -1;
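
A worked instance of the fixed-point division above (standalone sketch;
48 fractional bits assumed, the VMX value, whereas SVM uses 32):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const unsigned frac_bits = 48;         /* tsc_scaling_ratio_frac_bits */
        const uint64_t tsc_khz = 1000000;      /* host TSC: 1.0 GHz */
        const uint64_t user_tsc_khz = 1500000; /* guest wants 1.5 GHz */

        /* equivalent of mul_u64_u32_div(1ULL << frac_bits, user_tsc_khz, tsc_khz) */
        uint64_t ratio = (uint64_t)(((unsigned __int128)1 << frac_bits) *
                                    user_tsc_khz / tsc_khz);

        /* prints 0x1800000000000, i.e. 1.5 in 48-bit fixed point */
        printf("ratio = %#llx\n", (unsigned long long)ratio);
        return 0;
}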
@@ -2384,7 +2372,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
        /* tsc_khz can be zero if TSC calibration fails */
        if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
-               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
+               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
                return -1;
        }
 
@@ -2461,18 +2449,18 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
  * (frac) represent the fractional part, ie. ratio represents a fixed
  * point number (mult + frac * 2^(-N)).
  *
- * N equals to kvm_tsc_scaling_ratio_frac_bits.
+ * N equals kvm_caps.tsc_scaling_ratio_frac_bits.
  */
 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
 {
-       return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
+       return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits);
 }
 
 u64 kvm_scale_tsc(u64 tsc, u64 ratio)
 {
        u64 _tsc = tsc;
 
-       if (ratio != kvm_default_tsc_scaling_ratio)
+       if (ratio != kvm_caps.default_tsc_scaling_ratio)
                _tsc = __scale_tsc(ratio, tsc);
 
        return _tsc;
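
Concretely, with the 1.5 ratio from the example above, __scale_tsc(ratio, t)
computes (t * (3 << 47)) >> 48, so 1,000,000 host cycles scale to 1,500,000
guest cycles; kvm_scale_tsc() skips the 128-bit multiply entirely whenever
the ratio is the identity value 1 << 48.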
@@ -2499,11 +2487,11 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
 {
        u64 nested_offset;
 
-       if (l2_multiplier == kvm_default_tsc_scaling_ratio)
+       if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio)
                nested_offset = l1_offset;
        else
                nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
-                                               kvm_tsc_scaling_ratio_frac_bits);
+                                               kvm_caps.tsc_scaling_ratio_frac_bits);
 
        nested_offset += l2_offset;
        return nested_offset;
@@ -2512,9 +2500,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
 
 u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
 {
-       if (l2_multiplier != kvm_default_tsc_scaling_ratio)
+       if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio)
                return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
-                                      kvm_tsc_scaling_ratio_frac_bits);
+                                      kvm_caps.tsc_scaling_ratio_frac_bits);
 
        return l1_multiplier;
 }
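
These nested helpers compose two fixed-point numbers the same way: the
effective L2 multiplier is (l1_multiplier * l2_multiplier) >> frac_bits, so
an L1 scaled to 1.5x the host rate that grants its L2 a 2.0x multiplier
yields ((3 << 47) * (2 << 48)) >> 48 = 3 << 48, i.e. 3.0x, and the identity
multiplier again short-circuits the math.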
@@ -2556,7 +2544,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli
        else
                vcpu->arch.tsc_scaling_ratio = l1_multiplier;
 
-       if (kvm_has_tsc_control)
+       if (kvm_caps.has_tsc_control)
                static_call(kvm_x86_write_tsc_multiplier)(
                        vcpu, vcpu->arch.tsc_scaling_ratio);
 }
@@ -2692,7 +2680,7 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
 
 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
 {
-       if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+       if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
                WARN_ON(adjustment < 0);
        adjustment = kvm_scale_tsc((u64) adjustment,
                                   vcpu->arch.l1_tsc_scaling_ratio);
@@ -3105,7 +3093,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
        /* With all the info we got, fill in the values */
 
-       if (kvm_has_tsc_control)
+       if (kvm_caps.has_tsc_control)
                tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
                                            v->arch.l1_tsc_scaling_ratio);
 
@@ -3236,10 +3224,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        /* only 0 or all 1s can be written to IA32_MCi_CTL
                         * some Linux kernels though clear bit 10 in bank 4 to
                         * workaround a BIOS/GART TBL issue on AMD K8s, ignore
-                        * this to avoid an uncatched #GP in the guest
+                        * this to avoid an uncaught #GP in the guest.
+                        *
+                        * UnixWare clears bit 0 of MC1_CTL to ignore
+                        * correctable, single-bit ECC data errors.
                         */
                        if ((offset & 0x3) == 0 &&
-                           data != 0 && (data | (1 << 10)) != ~(u64)0)
+                           data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
                                return -1;
 
                        /* MCi_STATUS */
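
The widened check admits a write of 0, or of all ones once the two quirk
bits are forgiven; a standalone predicate making the rule explicit (helper
name illustrative):

#include <stdbool.h>
#include <stdint.h>

/*
 * Bit 10 is Linux's AMD K8 GART TLB workaround; bit 0 is UnixWare's
 * single-bit ECC workaround. Both are ORed in before testing for all 1s.
 */
static bool mci_ctl_write_valid(uint64_t data)
{
        return data == 0 || (data | (1 << 10) | 1) == ~(uint64_t)0;
}

So mci_ctl_write_valid(~0ull ^ 1) is true, while mci_ctl_write_valid(1)
is not.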
@@ -3557,9 +3548,25 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        vcpu->arch.ia32_tsc_adjust_msr = data;
                }
                break;
-       case MSR_IA32_MISC_ENABLE:
+       case MSR_IA32_MISC_ENABLE: {
+               u64 old_val = vcpu->arch.ia32_misc_enable_msr;
+               u64 pmu_mask = MSR_IA32_MISC_ENABLE_PMU_RO_MASK |
+                       MSR_IA32_MISC_ENABLE_EMON;
+
+               /* RO bits */
+               if (!msr_info->host_initiated &&
+                   ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK))
+                       return 1;
+
+               /*
+                * A naive userspace may set the vPMU capabilities and
+                * initialize MSR_IA32_MISC_ENABLE in an arbitrary order, so
+                * keep the vPMU bits unchanged here to avoid inconsistent
+                * behavior.
+                */
+               data &= ~pmu_mask;
+               data |= old_val & pmu_mask;
                if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
-                   ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
+                   ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
                        if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
                                return 1;
                        vcpu->arch.ia32_misc_enable_msr = data;
@@ -3568,6 +3575,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        vcpu->arch.ia32_misc_enable_msr = data;
                }
                break;
+       }
        case MSR_IA32_SMBASE:
                if (!msr_info->host_initiated)
                        return 1;
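
The mask juggling above is the standard preserve-bits idiom, isolated here
as a sketch (helper name illustrative):

#include <stdint.h>

/* Bits covered by mask keep their old value; all others take the new one. */
static uint64_t merge_masked_bits(uint64_t old_val, uint64_t data, uint64_t mask)
{
        return (data & ~mask) | (old_val & mask);
}

With mask = MSR_IA32_MISC_ENABLE_PMU_RO_MASK | MSR_IA32_MISC_ENABLE_EMON, a
write can never flip the vPMU bits, and guest attempts to change the RO
subset were already rejected by the check above.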
@@ -3594,7 +3602,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
                 * XSAVES/XRSTORS to save/restore PT MSRs.
                 */
-               if (data & ~supported_xss)
+               if (data & ~kvm_caps.supported_xss)
                        return 1;
                vcpu->arch.ia32_xss = data;
                kvm_update_cpuid_runtime(vcpu);
@@ -3700,7 +3708,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                fallthrough;
        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
        case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
-               if (kvm_pmu_is_valid_msr(vcpu, msr))
+               if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated))
                        return kvm_pmu_set_msr(vcpu, msr_info);
 
                if (pr || data != 0)
@@ -3783,7 +3791,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
 #endif
        default:
-               if (kvm_pmu_is_valid_msr(vcpu, msr))
+               if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated))
                        return kvm_pmu_set_msr(vcpu, msr_info);
                return KVM_MSR_RET_INVALID;
        }
@@ -3863,7 +3871,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = 0;
                break;
        case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
-               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated))
                        return kvm_pmu_get_msr(vcpu, msr_info);
                if (!msr_info->host_initiated)
                        return 1;
@@ -3873,7 +3881,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
        case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
-               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated))
                        return kvm_pmu_get_msr(vcpu, msr_info);
                msr_info->data = 0;
                break;
@@ -4119,7 +4127,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
 #endif
        default:
-               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated))
                        return kvm_pmu_get_msr(vcpu, msr_info);
                return KVM_MSR_RET_INVALID;
        }
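
Every kvm_pmu_is_valid_msr() call in this file now passes
msr_info->host_initiated, which implies the prototype below. Plausibly (the
PMU side is outside this diff) the flag lets host-initiated accesses reach
MSRs such as the PEBS/DS set added to msrs_to_save_all[] above even when the
guest's CPUID would hide them:

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated);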
@@ -4277,6 +4285,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_GET_MSR_FEATURES:
        case KVM_CAP_MSR_PLATFORM_INFO:
        case KVM_CAP_EXCEPTION_PAYLOAD:
+       case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
        case KVM_CAP_SET_GUEST_DEBUG:
        case KVM_CAP_LAST_CPU:
        case KVM_CAP_X86_USER_SPACE_MSR:
@@ -4354,7 +4363,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                break;
        case KVM_CAP_TSC_CONTROL:
        case KVM_CAP_VM_TSC_CONTROL:
-               r = kvm_has_tsc_control;
+               r = kvm_caps.has_tsc_control;
                break;
        case KVM_CAP_X2APIC_API:
                r = KVM_X2APIC_API_VALID_FLAGS;
@@ -4376,7 +4385,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = sched_info_on();
                break;
        case KVM_CAP_X86_BUS_LOCK_EXIT:
-               if (kvm_has_bus_lock_exit)
+               if (kvm_caps.has_bus_lock_exit)
                        r = KVM_BUS_LOCK_DETECTION_OFF |
                            KVM_BUS_LOCK_DETECTION_EXIT;
                else
@@ -4385,7 +4394,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_XSAVE2: {
                u64 guest_perm = xstate_get_guest_group_perm();
 
-               r = xstate_required_size(supported_xcr0 & guest_perm, false);
+               r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false);
                if (r < sizeof(struct kvm_xsave))
                        r = sizeof(struct kvm_xsave);
                break;
@@ -4396,6 +4405,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_DISABLE_QUIRKS2:
                r = KVM_X86_VALID_QUIRKS;
                break;
+       case KVM_CAP_X86_NOTIFY_VMEXIT:
+               r = kvm_caps.has_notify_vmexit;
+               break;
        default:
                break;
        }
@@ -4423,7 +4435,7 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
 
        switch (attr->attr) {
        case KVM_X86_XCOMP_GUEST_SUPP:
-               if (put_user(supported_xcr0, uaddr))
+               if (put_user(kvm_caps.supported_xcr0, uaddr))
                        return -EFAULT;
                return 0;
        default:
@@ -4500,8 +4512,8 @@ long kvm_arch_dev_ioctl(struct file *filp,
        }
        case KVM_X86_GET_MCE_CAP_SUPPORTED:
                r = -EFAULT;
-               if (copy_to_user(argp, &kvm_mce_cap_supported,
-                                sizeof(kvm_mce_cap_supported)))
+               if (copy_to_user(argp, &kvm_caps.supported_mce_cap,
+                                sizeof(kvm_caps.supported_mce_cap)))
                        goto out;
                r = 0;
                break;
@@ -4800,7 +4812,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
        r = -EINVAL;
        if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
                goto out;
-       if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
+       if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000))
                goto out;
        r = 0;
        vcpu->arch.mcg_cap = mcg_cap;
@@ -4938,6 +4950,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                         | KVM_VCPUEVENT_VALID_SMM);
        if (vcpu->kvm->arch.exception_payload_enabled)
                events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+       if (vcpu->kvm->arch.triple_fault_event) {
+               events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
+       }
 
        memset(&events->reserved, 0, sizeof(events->reserved));
 }
@@ -4951,7 +4967,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                              | KVM_VCPUEVENT_VALID_SIPI_VECTOR
                              | KVM_VCPUEVENT_VALID_SHADOW
                              | KVM_VCPUEVENT_VALID_SMM
-                             | KVM_VCPUEVENT_VALID_PAYLOAD))
+                             | KVM_VCPUEVENT_VALID_PAYLOAD
+                             | KVM_VCPUEVENT_VALID_TRIPLE_FAULT))
                return -EINVAL;
 
        if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
@@ -5024,6 +5041,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                }
        }
 
+       if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
+               if (!vcpu->kvm->arch.triple_fault_event)
+                       return -EINVAL;
+               if (events->triple_fault.pending)
+                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               else
+                       kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+       }
+
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 
        return 0;
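
A userspace sketch of the new plumbing (the helper name and the
vm_fd/vcpu_fd descriptors are illustrative; error handling omitted): enable
the cap once per VM, after which a pending triple fault survives a
save/restore cycle through KVM_GET/SET_VCPU_EVENTS:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static void migrate_triple_fault(int vm_fd, int src_vcpu, int dst_vcpu)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_X86_TRIPLE_FAULT_EVENT,
                .args = { 1 },
        };
        struct kvm_vcpu_events events;

        ioctl(vm_fd, KVM_ENABLE_CAP, &cap);

        /*
         * Source side: flags gains KVM_VCPUEVENT_VALID_TRIPLE_FAULT and
         * triple_fault.pending mirrors KVM_REQ_TRIPLE_FAULT.
         */
        ioctl(src_vcpu, KVM_GET_VCPU_EVENTS, &events);

        /* Destination side: re-queue the triple fault exactly as saved. */
        ioctl(dst_vcpu, KVM_SET_VCPU_EVENTS, &events);
}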
@@ -5092,7 +5118,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 
        return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
                                              guest_xsave->region,
-                                             supported_xcr0, &vcpu->arch.pkru);
+                                             kvm_caps.supported_xcr0,
+                                             &vcpu->arch.pkru);
 }
 
 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
@@ -5597,8 +5624,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = -EINVAL;
                user_tsc_khz = (u32)arg;
 
-               if (kvm_has_tsc_control &&
-                   user_tsc_khz >= kvm_max_guest_tsc_khz)
+               if (kvm_caps.has_tsc_control &&
+                   user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
                        goto out;
 
                if (user_tsc_khz == 0)
@@ -6025,6 +6052,10 @@ split_irqchip_unlock:
                kvm->arch.exception_payload_enabled = cap->args[0];
                r = 0;
                break;
+       case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
+               kvm->arch.triple_fault_event = cap->args[0];
+               r = 0;
+               break;
        case KVM_CAP_X86_USER_SPACE_MSR:
                kvm->arch.user_space_msr_mask = cap->args[0];
                r = 0;
@@ -6038,7 +6069,7 @@ split_irqchip_unlock:
                    (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
                        break;
 
-               if (kvm_has_bus_lock_exit &&
+               if (kvm_caps.has_bus_lock_exit &&
                    cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
                        kvm->arch.bus_lock_detection_enabled = true;
                r = 0;
@@ -6101,6 +6132,36 @@ split_irqchip_unlock:
                }
                mutex_unlock(&kvm->lock);
                break;
+       case KVM_CAP_MAX_VCPU_ID:
+               r = -EINVAL;
+               if (cap->args[0] > KVM_MAX_VCPU_IDS)
+                       break;
+
+               mutex_lock(&kvm->lock);
+               if (kvm->arch.max_vcpu_ids == cap->args[0]) {
+                       r = 0;
+               } else if (!kvm->arch.max_vcpu_ids) {
+                       kvm->arch.max_vcpu_ids = cap->args[0];
+                       r = 0;
+               }
+               mutex_unlock(&kvm->lock);
+               break;
+       case KVM_CAP_X86_NOTIFY_VMEXIT:
+               r = -EINVAL;
+               if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
+                       break;
+               if (!kvm_caps.has_notify_vmexit)
+                       break;
+               if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
+                       break;
+               mutex_lock(&kvm->lock);
+               if (!kvm->created_vcpus) {
+                       kvm->arch.notify_window = cap->args[0] >> 32;
+                       kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
+                       r = 0;
+               }
+               mutex_unlock(&kvm->lock);
+               break;
        default:
                r = -EINVAL;
                break;
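
A matching userspace sketch (helper name illustrative): KVM_CHECK_EXTENSION
reports kvm_caps.has_notify_vmexit, and per the code above the cap must be
enabled before the first vCPU exists, with the notify window in the high 32
bits of args[0] and the flag bits in the low 32:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_notify_vmexit(int vm_fd, uint32_t window_cycles)
{
        struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_NOTIFY_VMEXIT };

        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_NOTIFY_VMEXIT) <= 0)
                return -1;      /* host lacks the feature */

        cap.args[0] = ((uint64_t)window_cycles << 32) |
                      KVM_X86_NOTIFY_VMEXIT_ENABLED;
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}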
@@ -6573,8 +6634,8 @@ set_pit2_out:
                r = -EINVAL;
                user_tsc_khz = (u32)arg;
 
-               if (kvm_has_tsc_control &&
-                   user_tsc_khz >= kvm_max_guest_tsc_khz)
+               if (kvm_caps.has_tsc_control &&
+                   user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
                        goto out;
 
                if (user_tsc_khz == 0)
@@ -6649,15 +6710,12 @@ out:
 
 static void kvm_init_msr_list(void)
 {
-       struct x86_pmu_capability x86_pmu;
        u32 dummy[2];
        unsigned i;
 
        BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
                         "Please update the fixed PMCs in msrs_to_save_all[]");
 
-       perf_get_x86_pmu_capability(&x86_pmu);
-
        num_msrs_to_save = 0;
        num_emulated_msrs = 0;
        num_msr_based_features = 0;
@@ -6709,12 +6767,12 @@ static void kvm_init_msr_list(void)
                        break;
                case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
                        if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
-                           min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+                           min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
                                continue;
                        break;
                case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
                        if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
-                           min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+                           min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
                                continue;
                        break;
                case MSR_IA32_XFD:
@@ -8740,7 +8798,7 @@ static void kvm_hyperv_tsc_notifier(void)
        /* TSC frequency always matches when on Hyper-V */
        for_each_present_cpu(cpu)
                per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
-       kvm_max_guest_tsc_khz = tsc_khz;
+       kvm_caps.max_guest_tsc_khz = tsc_khz;
 
        list_for_each_entry(kvm, &vm_list, vm_list) {
                __kvm_start_pvclock_update(kvm);
@@ -9002,7 +9060,7 @@ int kvm_arch_init(void *opaque)
 
        if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-               supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+               kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
        }
 
        if (pi_inject_timer == -1)
@@ -9422,6 +9480,11 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)
 
 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
 {
+       trace_kvm_inj_exception(vcpu->arch.exception.nr,
+                               vcpu->arch.exception.has_error_code,
+                               vcpu->arch.exception.error_code,
+                               vcpu->arch.exception.injected);
+
        if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
                vcpu->arch.exception.error_code = false;
        static_call(kvm_x86_queue_exception)(vcpu);
@@ -9457,7 +9520,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
                        static_call(kvm_x86_inject_nmi)(vcpu);
                        can_inject = false;
                } else if (vcpu->arch.interrupt.injected) {
-                       static_call(kvm_x86_inject_irq)(vcpu);
+                       static_call(kvm_x86_inject_irq)(vcpu, true);
                        can_inject = false;
                }
        }
@@ -9479,13 +9542,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 
        /* try to inject new event if pending */
        if (vcpu->arch.exception.pending) {
-               trace_kvm_inj_exception(vcpu->arch.exception.nr,
-                                       vcpu->arch.exception.has_error_code,
-                                       vcpu->arch.exception.error_code);
-
-               vcpu->arch.exception.pending = false;
-               vcpu->arch.exception.injected = true;
-
                if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
                        __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
                                             X86_EFLAGS_RF);
@@ -9499,6 +9555,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
                }
 
                kvm_inject_exception(vcpu);
+
+               vcpu->arch.exception.pending = false;
+               vcpu->arch.exception.injected = true;
+
                can_inject = false;
        }
 
@@ -9551,7 +9611,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
                        goto out;
                if (r) {
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
-                       static_call(kvm_x86_inject_irq)(vcpu);
+                       static_call(kvm_x86_inject_irq)(vcpu, false);
                        WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
                }
                if (kvm_cpu_has_injectable_intr(vcpu))
@@ -11263,11 +11323,17 @@ static int sync_regs(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
 {
-       if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+       if (kvm_check_tsc_unstable() && kvm->created_vcpus)
                pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
                             "guest TSC will not be reliable\n");
 
-       return 0;
+       if (!kvm->arch.max_vcpu_ids)
+               kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
+
+       if (id >= kvm->arch.max_vcpu_ids)
+               return -EINVAL;
+
+       return static_call(kvm_x86_vcpu_precreate)(kvm);
 }
 
 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
@@ -11704,6 +11770,8 @@ int kvm_arch_hardware_setup(void *opaque)
        if (boot_cpu_has(X86_FEATURE_XSAVES))
                rdmsrl(MSR_IA32_XSS, host_xss);
 
+       kvm_init_pmu_capability();
+
        r = ops->hardware_setup();
        if (r != 0)
                return r;
@@ -11713,13 +11781,13 @@ int kvm_arch_hardware_setup(void *opaque)
        kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
 
        if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
-               supported_xss = 0;
+               kvm_caps.supported_xss = 0;
 
 #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
        cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
 #undef __kvm_cpu_cap_has
 
-       if (kvm_has_tsc_control) {
+       if (kvm_caps.has_tsc_control) {
                /*
                 * Make sure the user can only configure tsc_khz values that
                 * fit into a signed integer.
@@ -11727,10 +11795,10 @@ int kvm_arch_hardware_setup(void *opaque)
                 * be 1 on all machines.
                 */
                u64 max = min(0x7fffffffULL,
-                             __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
-               kvm_max_guest_tsc_khz = max;
+                             __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+               kvm_caps.max_guest_tsc_khz = max;
        }
-       kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
+       kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
        kvm_init_msr_list();
        return 0;
 }