Merge tag 'iommu-updates-v5.20-or-v6.0' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1bd8f65..79a8a74 100644
 
 #define MAX_IO_MSRS 256
 #define KVM_MAX_MCE_BANKS 32
-u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
-EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
+
+struct kvm_caps kvm_caps __read_mostly = {
+       .supported_mce_cap = MCG_CTL_P | MCG_SER_P,
+};
+EXPORT_SYMBOL_GPL(kvm_caps);
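
(For reference, not part of the patch: judging from the kvm_caps.* accesses later in this diff, the structure that replaces the previously separate exported globals presumably looks roughly like the sketch below, declared in x86.h and filled in by the vendor module during hardware setup.)

struct kvm_caps {
        bool has_tsc_control;                   /* guest TSC rate control supported? */
        u32  max_guest_tsc_khz;
        u8   tsc_scaling_ratio_frac_bits;
        u64  max_tsc_scaling_ratio;
        u64  default_tsc_scaling_ratio;         /* 1ull << tsc_scaling_ratio_frac_bits */
        bool has_bus_lock_exit;
        bool has_notify_vmexit;
        u64  supported_mce_cap;
        u64  supported_xcr0;
        u64  supported_xss;
};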
 
 #define  ERR_PTR_USR(e)  ((void __user *)ERR_PTR(e))
 
@@ -150,19 +153,6 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
 static bool __read_mostly kvmclock_periodic_sync = true;
 module_param(kvmclock_periodic_sync, bool, S_IRUGO);
 
-bool __read_mostly kvm_has_tsc_control;
-EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
-u32  __read_mostly kvm_max_guest_tsc_khz;
-EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
-u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
-EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
-u64  __read_mostly kvm_max_tsc_scaling_ratio;
-EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-u64 __read_mostly kvm_default_tsc_scaling_ratio;
-EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
-bool __read_mostly kvm_has_bus_lock_exit;
-EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit);
-
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
@@ -234,8 +224,6 @@ EXPORT_SYMBOL_GPL(enable_apicv);
 
 u64 __read_mostly host_xss;
 EXPORT_SYMBOL_GPL(host_xss);
-u64 __read_mostly supported_xss;
-EXPORT_SYMBOL_GPL(supported_xss);
 
 const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
        KVM_GENERIC_VM_STATS(),
@@ -297,7 +285,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, directed_yield_successful),
        STATS_DESC_COUNTER(VCPU, preemption_reported),
        STATS_DESC_COUNTER(VCPU, preemption_other),
-       STATS_DESC_IBOOLEAN(VCPU, guest_mode)
+       STATS_DESC_IBOOLEAN(VCPU, guest_mode),
+       STATS_DESC_COUNTER(VCPU, notify_window_exits),
 };
 
 const struct kvm_stats_header kvm_vcpu_stats_header = {
@@ -310,8 +299,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
 };
 
 u64 __read_mostly host_xcr0;
-u64 __read_mostly supported_xcr0;
-EXPORT_SYMBOL_GPL(supported_xcr0);
 
 static struct kmem_cache *x86_emulator_cache;
 
@@ -861,7 +848,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
         */
        real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
                                     PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
-       if (real_gpa == UNMAPPED_GVA)
+       if (real_gpa == INVALID_GPA)
                return 0;
 
        /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
@@ -1093,7 +1080,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
 
-bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        if (cr4 & cr4_reserved_bits)
                return false;
@@ -1101,9 +1088,15 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
                return false;
 
-       return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
+       return true;
+}
+EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4);
+
+static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+       return __kvm_is_valid_cr4(vcpu, cr4) &&
+              static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
 }
-EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
 
 void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
 {
@@ -1449,6 +1442,7 @@ static const u32 msrs_to_save_all[] = {
        MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
        MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
        MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
+       MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
 
        MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
        MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
@@ -2050,13 +2044,6 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_invd);
 
-int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
-{
-       pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n");
-       return kvm_emulate_as_nop(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
-
 int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
 {
        kvm_queue_exception(vcpu, UD_VECTOR);
@@ -2064,11 +2051,26 @@ int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
 
-int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+
+static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
 {
-       pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n");
+       if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) &&
+           !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
+               return kvm_handle_invalid_op(vcpu);
+
+       pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn);
        return kvm_emulate_as_nop(vcpu);
 }
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+       return kvm_emulate_monitor_mwait(vcpu, "MWAIT");
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+       return kvm_emulate_monitor_mwait(vcpu, "MONITOR");
+}
 EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
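
(Illustrative aside, not part of the patch: with this change MONITOR/MWAIT only degrade to NOPs when the KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS quirk is left enabled or the guest CPUID advertises MWAIT; otherwise they #UD. A hedged user-space sketch of opting out of the quirk through the existing KVM_CAP_DISABLE_QUIRKS2 interface, with vm_fd assumed to be an open VM file descriptor:)

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Sketch: disable the quirk so MONITOR/MWAIT #UD when CPUID lacks MWAIT. */
static int disable_mwait_nop_quirk(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap  = KVM_CAP_DISABLE_QUIRKS2,
                .args = { KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS },
        };

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}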
 
 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
@@ -2348,12 +2350,12 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 
        /* Guest TSC same frequency as host TSC? */
        if (!scale) {
-               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
+               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
                return 0;
        }
 
        /* TSC scaling supported? */
-       if (!kvm_has_tsc_control) {
+       if (!kvm_caps.has_tsc_control) {
                if (user_tsc_khz > tsc_khz) {
                        vcpu->arch.tsc_catchup = 1;
                        vcpu->arch.tsc_always_catchup = 1;
@@ -2365,10 +2367,10 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
        }
 
        /* TSC scaling required  - calculate ratio */
-       ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
+       ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits,
                                user_tsc_khz, tsc_khz);
 
-       if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
+       if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) {
                pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
                                    user_tsc_khz);
                return -1;
@@ -2386,7 +2388,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
        /* tsc_khz can be zero if TSC calibration fails */
        if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
-               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
+               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
                return -1;
        }
 
@@ -2463,18 +2465,18 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
  * (frac) represent the fractional part, ie. ratio represents a fixed
  * point number (mult + frac * 2^(-N)).
  *
- * N equals to kvm_tsc_scaling_ratio_frac_bits.
+ * N equals to kvm_caps.tsc_scaling_ratio_frac_bits.
  */
 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
 {
-       return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
+       return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits);
 }
 
 u64 kvm_scale_tsc(u64 tsc, u64 ratio)
 {
        u64 _tsc = tsc;
 
-       if (ratio != kvm_default_tsc_scaling_ratio)
+       if (ratio != kvm_caps.default_tsc_scaling_ratio)
                _tsc = __scale_tsc(ratio, tsc);
 
        return _tsc;
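
(Illustrative aside, not part of the patch: the fixed-point math above can be mimicked in user space. The sketch assumes 48 fractional bits, the value VMX uses for kvm_caps.tsc_scaling_ratio_frac_bits, and emulates mul_u64_u64_shr() with a 128-bit intermediate.)

#include <stdint.h>
#include <stdio.h>

/* Sketch of __scale_tsc(): ratio is a 16.48 fixed-point multiplier. */
static uint64_t scale_tsc(uint64_t tsc, uint64_t ratio, unsigned frac_bits)
{
        return (uint64_t)(((unsigned __int128)tsc * ratio) >> frac_bits);
}

int main(void)
{
        unsigned frac_bits = 48;                        /* VMX value */
        uint64_t host_khz = 3000000, guest_khz = 2100000;
        /* cf. set_tsc_khz(): ratio = guest_khz / host_khz in fixed point */
        uint64_t ratio = (uint64_t)(((unsigned __int128)guest_khz << frac_bits) / host_khz);

        /* 3e9 host cycles scale to ~2.1e9 guest cycles. */
        printf("%llu\n", (unsigned long long)scale_tsc(3000000000ULL, ratio, frac_bits));
        return 0;
}
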
@@ -2501,11 +2503,11 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
 {
        u64 nested_offset;
 
-       if (l2_multiplier == kvm_default_tsc_scaling_ratio)
+       if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio)
                nested_offset = l1_offset;
        else
                nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
-                                               kvm_tsc_scaling_ratio_frac_bits);
+                                               kvm_caps.tsc_scaling_ratio_frac_bits);
 
        nested_offset += l2_offset;
        return nested_offset;
@@ -2514,9 +2516,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
 
 u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
 {
-       if (l2_multiplier != kvm_default_tsc_scaling_ratio)
+       if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio)
                return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
-                                      kvm_tsc_scaling_ratio_frac_bits);
+                                      kvm_caps.tsc_scaling_ratio_frac_bits);
 
        return l1_multiplier;
 }
@@ -2558,7 +2560,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli
        else
                vcpu->arch.tsc_scaling_ratio = l1_multiplier;
 
-       if (kvm_has_tsc_control)
+       if (kvm_caps.has_tsc_control)
                static_call(kvm_x86_write_tsc_multiplier)(
                        vcpu, vcpu->arch.tsc_scaling_ratio);
 }
@@ -2694,7 +2696,7 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
 
 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
 {
-       if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+       if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
                WARN_ON(adjustment < 0);
        adjustment = kvm_scale_tsc((u64) adjustment,
                                   vcpu->arch.l1_tsc_scaling_ratio);
@@ -3107,7 +3109,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
        /* With all the info we got, fill in the values */
 
-       if (kvm_has_tsc_control)
+       if (kvm_caps.has_tsc_control)
                tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
                                            v->arch.l1_tsc_scaling_ratio);
 
@@ -3197,6 +3199,16 @@ static void kvmclock_sync_fn(struct work_struct *work)
                                        KVMCLOCK_SYNC_PERIOD);
 }
 
+/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
+static bool is_mci_control_msr(u32 msr)
+{
+       return (msr & 3) == 0;
+}
+static bool is_mci_status_msr(u32 msr)
+{
+       return (msr & 3) == 1;
+}
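
(Aside, not part of the patch: the "(msr & 3)" trick works because each MCE bank owns four consecutive MSRs, CTL/STATUS/ADDR/MISC, starting at MSR_IA32_MC0_CTL (0x400), which is itself 4-aligned; that is the "safe iff" caveat above. A small decomposition sketch, assuming @msr is already known to be in the MC bank range:)

#define MSR_IA32_MC0_CTL        0x400   /* CTL, STATUS, ADDR, MISC repeat per bank */

static void decode_mci_bank_msr(unsigned int msr, unsigned int *bank, unsigned int *reg)
{
        *bank = (msr - MSR_IA32_MC0_CTL) / 4;
        *reg  = msr & 3;                /* 0=CTL, 1=STATUS, 2=ADDR, 3=MISC */
}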
+
 /*
  * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
  */
@@ -3215,6 +3227,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        unsigned bank_num = mcg_cap & 0xff;
        u32 msr = msr_info->index;
        u64 data = msr_info->data;
+       u32 offset, last_msr;
 
        switch (msr) {
        case MSR_IA32_MCG_STATUS:
@@ -3228,32 +3241,53 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                vcpu->arch.mcg_ctl = data;
                break;
-       default:
-               if (msr >= MSR_IA32_MC0_CTL &&
-                   msr < MSR_IA32_MCx_CTL(bank_num)) {
-                       u32 offset = array_index_nospec(
-                               msr - MSR_IA32_MC0_CTL,
-                               MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
-
-                       /* only 0 or all 1s can be written to IA32_MCi_CTL
-                        * some Linux kernels though clear bit 10 in bank 4 to
-                        * workaround a BIOS/GART TBL issue on AMD K8s, ignore
-                        * this to avoid an uncatched #GP in the guest
-                        */
-                       if ((offset & 0x3) == 0 &&
-                           data != 0 && (data | (1 << 10)) != ~(u64)0)
-                               return -1;
-
-                       /* MCi_STATUS */
-                       if (!msr_info->host_initiated &&
-                           (offset & 0x3) == 1 && data != 0) {
-                               if (!can_set_mci_status(vcpu))
-                                       return -1;
-                       }
+       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+               last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
+               if (msr > last_msr)
+                       return 1;
 
-                       vcpu->arch.mce_banks[offset] = data;
-                       break;
-               }
+               if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
+                       return 1;
+               /* An attempt to write a 1 to a reserved bit raises #GP */
+               if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
+                       return 1;
+               offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
+                                           last_msr + 1 - MSR_IA32_MC0_CTL2);
+               vcpu->arch.mci_ctl2_banks[offset] = data;
+               break;
+       case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+               last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
+               if (msr > last_msr)
+                       return 1;
+
+               /*
+                * Only 0 or all 1s can be written to IA32_MCi_CTL, all other
+                * values are architecturally undefined.  But, some Linux
+                * kernels clear bit 10 in bank 4 to work around a BIOS/GART TLB
+                * issue on AMD K8s, allow bit 10 to be clear when setting all
+                * other bits in order to avoid an uncaught #GP in the guest.
+                *
+                * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable,
+                * single-bit ECC data errors.
+                */
+               if (is_mci_control_msr(msr) &&
+                   data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
+                       return 1;
+
+               /*
+                * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR.
+                * AMD-based CPUs allow non-zero values, but if and only if
+                * HWCR[McStatusWrEn] is set.
+                */
+               if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
+                   data != 0 && !can_set_mci_status(vcpu))
+                       return 1;
+
+               offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+                                           last_msr + 1 - MSR_IA32_MC0_CTL);
+               vcpu->arch.mce_banks[offset] = data;
+               break;
+       default:
                return 1;
        }
        return 0;
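
(Aside, not part of the patch: concretely, the IA32_MCi_CTL check above accepts exactly 0, all 1s, and all 1s with bit 10 (the AMD K8 GART quirk) and/or bit 0 (the UNIXWARE MC1_CTL quirk) cleared; anything else now faults. A small sketch of the accepted set:)

/* Sketch: returns nonzero for IA32_MCi_CTL values the check above accepts. */
static int mci_ctl_write_allowed(unsigned long long data)
{
        /* 0, ~0, and ~0 with bit 10 and/or bit 0 cleared. */
        return data == 0 || (data | (1ULL << 10) | 1) == ~0ULL;
}
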
@@ -3537,7 +3571,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                }
                break;
-       case 0x200 ... 0x2ff:
+       case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
+       case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
                return kvm_mtrr_set_msr(vcpu, msr, data);
        case MSR_IA32_APICBASE:
                return kvm_set_apic_base(vcpu, msr_info);
@@ -3559,9 +3594,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        vcpu->arch.ia32_tsc_adjust_msr = data;
                }
                break;
-       case MSR_IA32_MISC_ENABLE:
+       case MSR_IA32_MISC_ENABLE: {
+               u64 old_val = vcpu->arch.ia32_misc_enable_msr;
+
+               if (!msr_info->host_initiated) {
+                       /* RO bits */
+                       if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)
+                               return 1;
+
+                       /* R bits, i.e. writes are ignored, but don't fault. */
+                       data = data & ~MSR_IA32_MISC_ENABLE_EMON;
+                       data |= old_val & MSR_IA32_MISC_ENABLE_EMON;
+               }
+
                if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
-                   ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
+                   ((old_val ^ data)  & MSR_IA32_MISC_ENABLE_MWAIT)) {
                        if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
                                return 1;
                        vcpu->arch.ia32_misc_enable_msr = data;
@@ -3570,6 +3617,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        vcpu->arch.ia32_misc_enable_msr = data;
                }
                break;
+       }
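
(Aside, not part of the patch: for non-host-initiated writes, toggling the read-only PMU bits behind MSR_IA32_MISC_ENABLE_PMU_RO_MASK now faults, while writes to EMON are silently ignored. A hedged sketch of the effective masking, with the EMON bit position (bit 7) taken from msr-index.h:)

#define MISC_ENABLE_EMON        (1ULL << 7)     /* mirrors MSR_IA32_MISC_ENABLE_EMON */

/* Sketch: EMON is writable-but-ignored; the stored value keeps the old bit. */
static unsigned long long misc_enable_effective_write(unsigned long long old_val,
                                                      unsigned long long data)
{
        return (data & ~MISC_ENABLE_EMON) | (old_val & MISC_ENABLE_EMON);
}
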
        case MSR_IA32_SMBASE:
                if (!msr_info->host_initiated)
                        return 1;
@@ -3596,7 +3644,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
                 * XSAVES/XRSTORS to save/restore PT MSRs.
                 */
-               if (data & ~supported_xss)
+               if (data & ~kvm_caps.supported_xss)
                        return 1;
                vcpu->arch.ia32_xss = data;
                kvm_update_cpuid_runtime(vcpu);
@@ -3694,6 +3742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_MCG_CTL:
        case MSR_IA32_MCG_STATUS:
        case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
                return set_msr_mce(vcpu, msr_info);
 
        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
@@ -3784,6 +3833,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vcpu->arch.guest_fpu.xfd_err = data;
                break;
 #endif
+       case MSR_IA32_PEBS_ENABLE:
+       case MSR_IA32_DS_AREA:
+       case MSR_PEBS_DATA_CFG:
+       case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+               if (kvm_pmu_is_valid_msr(vcpu, msr))
+                       return kvm_pmu_set_msr(vcpu, msr_info);
+               /*
+                * Userspace is allowed to write '0' to MSRs that KVM reports
+                * as to-be-saved, even if an MSR isn't fully supported.
+                */
+               return !msr_info->host_initiated || data;
        default:
                if (kvm_pmu_is_valid_msr(vcpu, msr))
                        return kvm_pmu_set_msr(vcpu, msr_info);
@@ -3798,6 +3858,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
        u64 data;
        u64 mcg_cap = vcpu->arch.mcg_cap;
        unsigned bank_num = mcg_cap & 0xff;
+       u32 offset, last_msr;
 
        switch (msr) {
        case MSR_IA32_P5_MC_ADDR:
@@ -3815,16 +3876,27 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
        case MSR_IA32_MCG_STATUS:
                data = vcpu->arch.mcg_status;
                break;
-       default:
-               if (msr >= MSR_IA32_MC0_CTL &&
-                   msr < MSR_IA32_MCx_CTL(bank_num)) {
-                       u32 offset = array_index_nospec(
-                               msr - MSR_IA32_MC0_CTL,
-                               MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+               last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
+               if (msr > last_msr)
+                       return 1;
 
-                       data = vcpu->arch.mce_banks[offset];
-                       break;
-               }
+               if (!(mcg_cap & MCG_CMCI_P) && !host)
+                       return 1;
+               offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
+                                           last_msr + 1 - MSR_IA32_MC0_CTL2);
+               data = vcpu->arch.mci_ctl2_banks[offset];
+               break;
+       case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+               last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
+               if (msr > last_msr)
+                       return 1;
+
+               offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+                                           last_msr + 1 - MSR_IA32_MC0_CTL);
+               data = vcpu->arch.mce_banks[offset];
+               break;
+       default:
                return 1;
        }
        *pdata = data;
@@ -3864,9 +3936,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_DRAM_ENERGY_STATUS:    /* DRAM controller */
                msr_info->data = 0;
                break;
+       case MSR_IA32_PEBS_ENABLE:
+       case MSR_IA32_DS_AREA:
+       case MSR_PEBS_DATA_CFG:
        case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
                if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
                        return kvm_pmu_get_msr(vcpu, msr_info);
+               /*
+                * Userspace is allowed to read MSRs that KVM reports as
+                * to-be-saved, even if an MSR isn't fully supported.
+                */
                if (!msr_info->host_initiated)
                        return 1;
                msr_info->data = 0;
@@ -3921,7 +4000,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        }
        case MSR_MTRRcap:
-       case 0x200 ... 0x2ff:
+       case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
+       case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
                return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
        case 0xcd: /* fsb frequency */
                msr_info->data = 3;
@@ -4037,6 +4117,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_MCG_CTL:
        case MSR_IA32_MCG_STATUS:
        case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
                return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
                                   msr_info->host_initiated);
        case MSR_IA32_XSS:
@@ -4279,6 +4360,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_GET_MSR_FEATURES:
        case KVM_CAP_MSR_PLATFORM_INFO:
        case KVM_CAP_EXCEPTION_PAYLOAD:
+       case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
        case KVM_CAP_SET_GUEST_DEBUG:
        case KVM_CAP_LAST_CPU:
        case KVM_CAP_X86_USER_SPACE_MSR:
@@ -4295,6 +4377,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_SYS_ATTRIBUTES:
        case KVM_CAP_VAPIC:
        case KVM_CAP_ENABLE_CAP:
+       case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
                r = 1;
                break;
        case KVM_CAP_EXIT_HYPERCALL:
@@ -4356,7 +4439,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                break;
        case KVM_CAP_TSC_CONTROL:
        case KVM_CAP_VM_TSC_CONTROL:
-               r = kvm_has_tsc_control;
+               r = kvm_caps.has_tsc_control;
                break;
        case KVM_CAP_X2APIC_API:
                r = KVM_X2APIC_API_VALID_FLAGS;
@@ -4378,7 +4461,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = sched_info_on();
                break;
        case KVM_CAP_X86_BUS_LOCK_EXIT:
-               if (kvm_has_bus_lock_exit)
+               if (kvm_caps.has_bus_lock_exit)
                        r = KVM_BUS_LOCK_DETECTION_OFF |
                            KVM_BUS_LOCK_DETECTION_EXIT;
                else
@@ -4387,17 +4470,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_XSAVE2: {
                u64 guest_perm = xstate_get_guest_group_perm();
 
-               r = xstate_required_size(supported_xcr0 & guest_perm, false);
+               r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false);
                if (r < sizeof(struct kvm_xsave))
                        r = sizeof(struct kvm_xsave);
                break;
+       }
        case KVM_CAP_PMU_CAPABILITY:
                r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
                break;
-       }
        case KVM_CAP_DISABLE_QUIRKS2:
                r = KVM_X86_VALID_QUIRKS;
                break;
+       case KVM_CAP_X86_NOTIFY_VMEXIT:
+               r = kvm_caps.has_notify_vmexit;
+               break;
        default:
                break;
        }
@@ -4425,7 +4511,7 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
 
        switch (attr->attr) {
        case KVM_X86_XCOMP_GUEST_SUPP:
-               if (put_user(supported_xcr0, uaddr))
+               if (put_user(kvm_caps.supported_xcr0, uaddr))
                        return -EFAULT;
                return 0;
        default:
@@ -4502,8 +4588,8 @@ long kvm_arch_dev_ioctl(struct file *filp,
        }
        case KVM_X86_GET_MCE_CAP_SUPPORTED:
                r = -EFAULT;
-               if (copy_to_user(argp, &kvm_mce_cap_supported,
-                                sizeof(kvm_mce_cap_supported)))
+               if (copy_to_user(argp, &kvm_caps.supported_mce_cap,
+                                sizeof(kvm_caps.supported_mce_cap)))
                        goto out;
                r = 0;
                break;
@@ -4802,22 +4888,63 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
        r = -EINVAL;
        if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
                goto out;
-       if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
+       if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000))
                goto out;
        r = 0;
        vcpu->arch.mcg_cap = mcg_cap;
        /* Init IA32_MCG_CTL to all 1s */
        if (mcg_cap & MCG_CTL_P)
                vcpu->arch.mcg_ctl = ~(u64)0;
-       /* Init IA32_MCi_CTL to all 1s */
-       for (bank = 0; bank < bank_num; bank++)
+       /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
+       for (bank = 0; bank < bank_num; bank++) {
                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+               if (mcg_cap & MCG_CMCI_P)
+                       vcpu->arch.mci_ctl2_banks[bank] = 0;
+       }
+
+       kvm_apic_after_set_mcg_cap(vcpu);
 
        static_call(kvm_x86_setup_mce)(vcpu);
 out:
        return r;
 }
 
+/*
+ * Validate this is an UCNA (uncorrectable no action) error by checking the
+ * MCG_STATUS and MCi_STATUS registers:
+ * - none of the bits for Machine Check Exceptions are set
+ * - both the VAL (valid) and UC (uncorrectable) bits are set
+ * MCI_STATUS_PCC - Processor Context Corrupted
+ * MCI_STATUS_S - Signaled as a Machine Check Exception
+ * MCI_STATUS_AR - Software recoverable Action Required
+ */
+static bool is_ucna(struct kvm_x86_mce *mce)
+{
+       return  !mce->mcg_status &&
+               !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) &&
+               (mce->status & MCI_STATUS_VAL) &&
+               (mce->status & MCI_STATUS_UC);
+}
+
+static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks)
+{
+       u64 mcg_cap = vcpu->arch.mcg_cap;
+
+       banks[1] = mce->status;
+       banks[2] = mce->addr;
+       banks[3] = mce->misc;
+       vcpu->arch.mcg_status = mce->mcg_status;
+
+       if (!(mcg_cap & MCG_CMCI_P) ||
+           !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
+               return 0;
+
+       if (lapic_in_kernel(vcpu))
+               kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
+
+       return 0;
+}
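
(Illustrative aside, not part of the patch: a UCNA injection through the existing KVM_X86_SET_MCE ioctl sets only VAL and UC in MCi_STATUS (bit positions per the SDM) and leaves MCG_STATUS zero; assuming the vCPU was configured with MCG_CMCI_P and the bank's MCI_CTL2_CMCI_EN, the new path above delivers it as a CMCI instead of an #MC. A hedged user-space sketch:)

#include <linux/kvm.h>
#include <sys/ioctl.h>

#define MCI_STATUS_VAL  (1ULL << 63)
#define MCI_STATUS_UC   (1ULL << 61)

/* Sketch: inject a UCNA into bank 1 of an already-configured vCPU. */
static int inject_ucna(int vcpu_fd)
{
        struct kvm_x86_mce mce = {
                .status     = MCI_STATUS_VAL | MCI_STATUS_UC,   /* no PCC/S/AR */
                .mcg_status = 0,
                .bank       = 1,
        };

        return ioctl(vcpu_fd, KVM_X86_SET_MCE, &mce);
}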
+
 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
                                      struct kvm_x86_mce *mce)
 {
@@ -4827,6 +4954,12 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 
        if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
                return -EINVAL;
+
+       banks += array_index_nospec(4 * mce->bank, 4 * bank_num);
+
+       if (is_ucna(mce))
+               return kvm_vcpu_x86_set_ucna(vcpu, mce, banks);
+
        /*
         * if IA32_MCG_CTL is not all 1s, the uncorrected error
         * reporting is disabled
@@ -4834,7 +4967,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
        if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
            vcpu->arch.mcg_ctl != ~(u64)0)
                return 0;
-       banks += 4 * mce->bank;
        /*
         * if IA32_MCi_CTL is not all 1s, the uncorrected error
         * reporting is disabled for the bank
@@ -4940,6 +5072,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                         | KVM_VCPUEVENT_VALID_SMM);
        if (vcpu->kvm->arch.exception_payload_enabled)
                events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+       if (vcpu->kvm->arch.triple_fault_event) {
+               events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
+       }
 
        memset(&events->reserved, 0, sizeof(events->reserved));
 }
@@ -4953,7 +5089,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                              | KVM_VCPUEVENT_VALID_SIPI_VECTOR
                              | KVM_VCPUEVENT_VALID_SHADOW
                              | KVM_VCPUEVENT_VALID_SMM
-                             | KVM_VCPUEVENT_VALID_PAYLOAD))
+                             | KVM_VCPUEVENT_VALID_PAYLOAD
+                             | KVM_VCPUEVENT_VALID_TRIPLE_FAULT))
                return -EINVAL;
 
        if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
@@ -5026,6 +5163,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                }
        }
 
+       if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
+               if (!vcpu->kvm->arch.triple_fault_event)
+                       return -EINVAL;
+               if (events->triple_fault.pending)
+                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               else
+                       kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+       }
+
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 
        return 0;
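
(Aside, not part of the patch: assuming KVM_CAP_X86_TRIPLE_FAULT_EVENT has been enabled on the VM, a pending triple fault now round-trips through the usual vCPU events ioctls, which is what lets userspace migrate it. A hedged sketch:)

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Sketch: save/restore a pending triple fault across vCPU fds. */
static int copy_triple_fault_state(int src_vcpu_fd, int dst_vcpu_fd)
{
        struct kvm_vcpu_events ev;

        if (ioctl(src_vcpu_fd, KVM_GET_VCPU_EVENTS, &ev))
                return -1;

        /* ev.triple_fault.pending and KVM_VCPUEVENT_VALID_TRIPLE_FAULT round-trip. */
        return ioctl(dst_vcpu_fd, KVM_SET_VCPU_EVENTS, &ev);
}
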
@@ -5094,7 +5240,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 
        return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
                                              guest_xsave->region,
-                                             supported_xcr0, &vcpu->arch.pkru);
+                                             kvm_caps.supported_xcr0,
+                                             &vcpu->arch.pkru);
 }
 
 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
@@ -5599,8 +5746,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = -EINVAL;
                user_tsc_khz = (u32)arg;
 
-               if (kvm_has_tsc_control &&
-                   user_tsc_khz >= kvm_max_guest_tsc_khz)
+               if (kvm_caps.has_tsc_control &&
+                   user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
                        goto out;
 
                if (user_tsc_khz == 0)
@@ -6027,6 +6174,10 @@ split_irqchip_unlock:
                kvm->arch.exception_payload_enabled = cap->args[0];
                r = 0;
                break;
+       case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
+               kvm->arch.triple_fault_event = cap->args[0];
+               r = 0;
+               break;
        case KVM_CAP_X86_USER_SPACE_MSR:
                r = -EINVAL;
                if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
@@ -6045,7 +6196,7 @@ split_irqchip_unlock:
                    (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
                        break;
 
-               if (kvm_has_bus_lock_exit &&
+               if (kvm_caps.has_bus_lock_exit &&
                    cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
                        kvm->arch.bus_lock_detection_enabled = true;
                r = 0;
@@ -6108,6 +6259,65 @@ split_irqchip_unlock:
                }
                mutex_unlock(&kvm->lock);
                break;
+       case KVM_CAP_MAX_VCPU_ID:
+               r = -EINVAL;
+               if (cap->args[0] > KVM_MAX_VCPU_IDS)
+                       break;
+
+               mutex_lock(&kvm->lock);
+               if (kvm->arch.max_vcpu_ids == cap->args[0]) {
+                       r = 0;
+               } else if (!kvm->arch.max_vcpu_ids) {
+                       kvm->arch.max_vcpu_ids = cap->args[0];
+                       r = 0;
+               }
+               mutex_unlock(&kvm->lock);
+               break;
+       case KVM_CAP_X86_NOTIFY_VMEXIT:
+               r = -EINVAL;
+               if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
+                       break;
+               if (!kvm_caps.has_notify_vmexit)
+                       break;
+               if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
+                       break;
+               mutex_lock(&kvm->lock);
+               if (!kvm->created_vcpus) {
+                       kvm->arch.notify_window = cap->args[0] >> 32;
+                       kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
+                       r = 0;
+               }
+               mutex_unlock(&kvm->lock);
+               break;
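
(Aside, not part of the patch: cap->args[0] packs the notify window into bits 63:32 and the flag bits into bits 31:0. A hedged user-space sketch of enabling the cap; the window value is purely illustrative and its unit is defined by hardware:)

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Sketch: enable notify VM exits with an arbitrary example window. */
static int enable_notify_vmexit(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap  = KVM_CAP_X86_NOTIFY_VMEXIT,
                .args = { ((__u64)131072 << 32) | KVM_X86_NOTIFY_VMEXIT_ENABLED },
        };

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
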
+       case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
+               r = -EINVAL;
+
+               /*
+                * Since the risk of disabling NX hugepages is a guest crashing
+                * the system, ensure the userspace process has permission to
+                * reboot the system.
+                *
+                * Note that unlike the reboot() syscall, the process must have
+                * this capability in the root namespace because exposing
+                * /dev/kvm into a container does not limit the scope of the
+                * iTLB multihit bug to that container. In other words,
+                * this must use capable(), not ns_capable().
+                */
+               if (!capable(CAP_SYS_BOOT)) {
+                       r = -EPERM;
+                       break;
+               }
+
+               if (cap->args[0])
+                       break;
+
+               mutex_lock(&kvm->lock);
+               if (!kvm->created_vcpus) {
+                       kvm->arch.disable_nx_huge_pages = true;
+                       r = 0;
+               }
+               mutex_unlock(&kvm->lock);
+               break;
        default:
                r = -EINVAL;
                break;
@@ -6583,8 +6793,8 @@ set_pit2_out:
                r = -EINVAL;
                user_tsc_khz = (u32)arg;
 
-               if (kvm_has_tsc_control &&
-                   user_tsc_khz >= kvm_max_guest_tsc_khz)
+               if (kvm_caps.has_tsc_control &&
+                   user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
                        goto out;
 
                if (user_tsc_khz == 0)
@@ -6659,15 +6869,12 @@ out:
 
 static void kvm_init_msr_list(void)
 {
-       struct x86_pmu_capability x86_pmu;
        u32 dummy[2];
        unsigned i;
 
        BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
                         "Please update the fixed PMCs in msrs_to_saved_all[]");
 
-       perf_get_x86_pmu_capability(&x86_pmu);
-
        num_msrs_to_save = 0;
        num_emulated_msrs = 0;
        num_msr_based_features = 0;
@@ -6719,12 +6926,12 @@ static void kvm_init_msr_list(void)
                        break;
                case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
                        if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
-                           min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+                           min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
                                continue;
                        break;
                case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
                        if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
-                           min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+                           min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
                                continue;
                        break;
                case MSR_IA32_XFD:
@@ -6881,7 +7088,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
                unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
                int ret;
 
-               if (gpa == UNMAPPED_GVA)
+               if (gpa == INVALID_GPA)
                        return X86EMUL_PROPAGATE_FAULT;
                ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
                                               offset, toread);
@@ -6912,7 +7119,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
        /* Inline kvm_read_guest_virt_helper for speed.  */
        gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
                                    exception);
-       if (unlikely(gpa == UNMAPPED_GVA))
+       if (unlikely(gpa == INVALID_GPA))
                return X86EMUL_PROPAGATE_FAULT;
 
        offset = addr & (PAGE_SIZE-1);
@@ -6982,7 +7189,7 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes
                unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
                int ret;
 
-               if (gpa == UNMAPPED_GVA)
+               if (gpa == INVALID_GPA)
                        return X86EMUL_PROPAGATE_FAULT;
                ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
                if (ret < 0) {
@@ -7093,7 +7300,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 
        *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
 
-       if (*gpa == UNMAPPED_GVA)
+       if (*gpa == INVALID_GPA)
                return -1;
 
        return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
@@ -7330,7 +7537,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
 
        gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
 
-       if (gpa == UNMAPPED_GVA ||
+       if (gpa == INVALID_GPA ||
            (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
                goto emul_write;
 
@@ -7384,36 +7591,47 @@ emul_write:
        return emulator_write_emulated(ctxt, addr, new, bytes, exception);
 }
 
-static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
+static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
+                              unsigned short port, void *data,
+                              unsigned int count, bool in)
 {
-       int r = 0, i;
+       unsigned i;
+       int r;
 
-       for (i = 0; i < vcpu->arch.pio.count; i++) {
-               if (vcpu->arch.pio.in)
-                       r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
-                                           vcpu->arch.pio.size, pd);
+       WARN_ON_ONCE(vcpu->arch.pio.count);
+       for (i = 0; i < count; i++) {
+               if (in)
+                       r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data);
                else
-                       r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
-                                            vcpu->arch.pio.port, vcpu->arch.pio.size,
-                                            pd);
-               if (r)
+                       r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data);
+
+               if (r) {
+                       if (i == 0)
+                               goto userspace_io;
+
+                       /*
+                        * Userspace must have unregistered the device while PIO
+                        * was running.  Drop writes / read as 0.
+                        */
+                       if (in)
+                               memset(data, 0, size * (count - i));
                        break;
-               pd += vcpu->arch.pio.size;
+               }
+
+               data += size;
        }
-       return r;
-}
+       return 1;
 
-static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
-                              unsigned short port,
-                              unsigned int count, bool in)
-{
+userspace_io:
        vcpu->arch.pio.port = port;
        vcpu->arch.pio.in = in;
-       vcpu->arch.pio.count  = count;
+       vcpu->arch.pio.count = count;
        vcpu->arch.pio.size = size;
 
-       if (!kernel_pio(vcpu, vcpu->arch.pio_data))
-               return 1;
+       if (in)
+               memset(vcpu->arch.pio_data, 0, size * count);
+       else
+               memcpy(vcpu->arch.pio_data, data, size * count);
 
        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -7421,30 +7639,33 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
        vcpu->run->io.count = count;
        vcpu->run->io.port = port;
-
        return 0;
 }
 
-static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size,
-                            unsigned short port, unsigned int count)
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+                          unsigned short port, void *val, unsigned int count)
 {
-       WARN_ON(vcpu->arch.pio.count);
-       memset(vcpu->arch.pio_data, 0, size * count);
-       return emulator_pio_in_out(vcpu, size, port, count, true);
+       int r = emulator_pio_in_out(vcpu, size, port, val, count, true);
+       if (r)
+               trace_kvm_pio(KVM_PIO_IN, port, size, count, val);
+
+       return r;
 }
 
 static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val)
 {
        int size = vcpu->arch.pio.size;
-       unsigned count = vcpu->arch.pio.count;
+       unsigned int count = vcpu->arch.pio.count;
        memcpy(val, vcpu->arch.pio_data, size * count);
        trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
        vcpu->arch.pio.count = 0;
 }
 
-static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
-                          unsigned short port, void *val, unsigned int count)
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+                                   int size, unsigned short port, void *val,
+                                   unsigned int count)
 {
+       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        if (vcpu->arch.pio.count) {
                /*
                 * Complete a previous iteration that required userspace I/O.
@@ -7453,39 +7674,19 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
                 * shenanigans as KVM doesn't support modifying the rep count,
                 * and the emulator ensures @count doesn't overflow the buffer.
                 */
-       } else {
-               int r = __emulator_pio_in(vcpu, size, port, count);
-               if (!r)
-                       return r;
-
-               /* Results already available, fall through.  */
+               complete_emulator_pio_in(vcpu, val);
+               return 1;
        }
 
-       complete_emulator_pio_in(vcpu, val);
-       return 1;
-}
-
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
-                                   int size, unsigned short port, void *val,
-                                   unsigned int count)
-{
-       return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
-
+       return emulator_pio_in(vcpu, size, port, val, count);
 }
 
 static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
                            unsigned short port, const void *val,
                            unsigned int count)
 {
-       int ret;
-
-       memcpy(vcpu->arch.pio_data, val, size * count);
-       trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
-       ret = emulator_pio_in_out(vcpu, size, port, count, false);
-       if (ret)
-                vcpu->arch.pio.count = 0;
-
-        return ret;
+       trace_kvm_pio(KVM_PIO_OUT, port, size, count, val);
+       return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
 }
 
 static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
@@ -7867,7 +8068,16 @@ static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
        return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
 }
 
+static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt)
+{
+       struct kvm *kvm = emul_to_vcpu(ctxt)->kvm;
+
+       if (!kvm->vm_bugged)
+               kvm_vm_bugged(kvm);
+}
+
 static const struct x86_emulate_ops emulate_ops = {
+       .vm_bugged           = emulator_vm_bugged,
        .read_gpr            = emulator_read_gpr,
        .write_gpr           = emulator_write_gpr,
        .read_std            = emulator_read_std,
@@ -8144,7 +8354,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                 * If the mapping is invalid in guest, let cpu retry
                 * it to generate fault.
                 */
-               if (gpa == UNMAPPED_GVA)
+               if (gpa == INVALID_GPA)
                        return true;
        }
 
@@ -8671,11 +8881,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
        /* For size less than 4 we merge, else we zero extend */
        val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
 
-       /*
-        * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform
-        * the copy and tracing
-        */
-       emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
+       complete_emulator_pio_in(vcpu, &val);
        kvm_rax_write(vcpu, val);
 
        return kvm_skip_emulated_instruction(vcpu);
@@ -8750,7 +8956,7 @@ static void kvm_hyperv_tsc_notifier(void)
        /* TSC frequency always matches when on Hyper-V */
        for_each_present_cpu(cpu)
                per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
-       kvm_max_guest_tsc_khz = tsc_khz;
+       kvm_caps.max_guest_tsc_khz = tsc_khz;
 
        list_for_each_entry(kvm, &vm_list, vm_list) {
                __kvm_start_pvclock_update(kvm);
@@ -8951,25 +9157,23 @@ static struct notifier_block pvclock_gtod_notifier = {
 int kvm_arch_init(void *opaque)
 {
        struct kvm_x86_init_ops *ops = opaque;
+       u64 host_pat;
        int r;
 
        if (kvm_x86_ops.hardware_enable) {
                pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
-               r = -EEXIST;
-               goto out;
+               return -EEXIST;
        }
 
        if (!ops->cpu_has_kvm_support()) {
                pr_err_ratelimited("kvm: no hardware support for '%s'\n",
                                   ops->runtime_ops->name);
-               r = -EOPNOTSUPP;
-               goto out;
+               return -EOPNOTSUPP;
        }
        if (ops->disabled_by_bios()) {
                pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
                                   ops->runtime_ops->name);
-               r = -EOPNOTSUPP;
-               goto out;
+               return -EOPNOTSUPP;
        }
 
        /*
@@ -8979,27 +9183,37 @@ int kvm_arch_init(void *opaque)
         */
        if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
                printk(KERN_ERR "kvm: inadequate fpu\n");
-               r = -EOPNOTSUPP;
-               goto out;
+               return -EOPNOTSUPP;
        }
 
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
                pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
-               r = -EOPNOTSUPP;
-               goto out;
+               return -EOPNOTSUPP;
        }
 
-       r = -ENOMEM;
+       /*
+        * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes
+        * the PAT bits in SPTEs.  Bail if PAT[0] is programmed to something
+        * other than WB.  Note, EPT doesn't utilize the PAT, but don't bother
+        * with an exception.  PAT[0] is set to WB on RESET and also by the
+        * kernel, i.e. failure indicates a kernel bug or broken firmware.
+        */
+       if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
+           (host_pat & GENMASK(2, 0)) != 6) {
+               pr_err("kvm: host PAT[0] is not WB\n");
+               return -EIO;
+       }
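
(Aside, not part of the patch: the low three bits of IA32_PAT select the memory type of PAT entry 0, and 6 is WB; the architectural reset value 0x0007040600070406 decodes as WB/WT/UC-/UC repeated, so the check above only trips on a broken kernel or firmware. A small user-space decode sketch:)

#include <stdio.h>

int main(void)
{
        unsigned long long pat = 0x0007040600070406ULL; /* architectural reset value */
        unsigned pa0 = pat & 0x7;                       /* memtype of PAT entry 0 */

        printf("PAT[0] = %u (%s)\n", pa0, pa0 == 6 ? "WB" : "not WB");
        return 0;
}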
 
        x86_emulator_cache = kvm_alloc_emulator_cache();
        if (!x86_emulator_cache) {
                pr_err("kvm: failed to allocate cache for x86 emulator\n");
-               goto out;
+               return -ENOMEM;
        }
 
        user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
        if (!user_return_msrs) {
                printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
+               r = -ENOMEM;
                goto out_free_x86_emulator_cache;
        }
        kvm_nr_uret_msrs = 0;
@@ -9012,7 +9226,7 @@ int kvm_arch_init(void *opaque)
 
        if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-               supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+               kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
        }
 
        if (pi_inject_timer == -1)
@@ -9030,7 +9244,6 @@ out_free_percpu:
        free_percpu(user_return_msrs);
 out_free_x86_emulator_cache:
        kmem_cache_destroy(x86_emulator_cache);
-out:
        return r;
 }
 
@@ -9405,7 +9618,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        if (!lapic_in_kernel(vcpu))
                return;
 
-       if (vcpu->arch.apicv_active)
+       if (vcpu->arch.apic->apicv_active)
                return;
 
        if (!vcpu->arch.apic->vapic_addr)
@@ -9434,6 +9647,11 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)
 
 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
 {
+       trace_kvm_inj_exception(vcpu->arch.exception.nr,
+                               vcpu->arch.exception.has_error_code,
+                               vcpu->arch.exception.error_code,
+                               vcpu->arch.exception.injected);
+
        if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
                vcpu->arch.exception.error_code = false;
        static_call(kvm_x86_queue_exception)(vcpu);
@@ -9469,7 +9687,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
                        static_call(kvm_x86_inject_nmi)(vcpu);
                        can_inject = false;
                } else if (vcpu->arch.interrupt.injected) {
-                       static_call(kvm_x86_inject_irq)(vcpu);
+                       static_call(kvm_x86_inject_irq)(vcpu, true);
                        can_inject = false;
                }
        }
@@ -9491,13 +9709,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 
        /* try to inject new event if pending */
        if (vcpu->arch.exception.pending) {
-               trace_kvm_inj_exception(vcpu->arch.exception.nr,
-                                       vcpu->arch.exception.has_error_code,
-                                       vcpu->arch.exception.error_code);
-
-               vcpu->arch.exception.pending = false;
-               vcpu->arch.exception.injected = true;
-
                if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
                        __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
                                             X86_EFLAGS_RF);
@@ -9511,6 +9722,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
                }
 
                kvm_inject_exception(vcpu);
+
+               vcpu->arch.exception.pending = false;
+               vcpu->arch.exception.injected = true;
+
                can_inject = false;
        }
 
@@ -9563,7 +9778,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
                        goto out;
                if (r) {
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
-                       static_call(kvm_x86_inject_irq)(vcpu);
+                       static_call(kvm_x86_inject_irq)(vcpu, false);
                        WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
                }
                if (kvm_cpu_has_injectable_intr(vcpu))
@@ -9856,6 +10071,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
 
 void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 {
+       struct kvm_lapic *apic = vcpu->arch.apic;
        bool activate;
 
        if (!lapic_in_kernel(vcpu))
@@ -9864,12 +10080,14 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
        down_read(&vcpu->kvm->arch.apicv_update_lock);
        preempt_disable();
 
-       activate = kvm_vcpu_apicv_activated(vcpu);
+       /* Do not activate APICV when APIC is disabled */
+       activate = kvm_vcpu_apicv_activated(vcpu) &&
+                  (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED);
 
-       if (vcpu->arch.apicv_active == activate)
+       if (apic->apicv_active == activate)
                goto out;
 
-       vcpu->arch.apicv_active = activate;
+       apic->apicv_active = activate;
        kvm_apic_update_apicv(vcpu);
        static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
 
@@ -9879,7 +10097,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
         * still active when the interrupt got accepted. Make sure
         * inject_pending_event() is called to check for that.
         */
-       if (!vcpu->arch.apicv_active)
+       if (!apic->apicv_active)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 out:
@@ -10274,7 +10492,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                 * per-VM state, and responding vCPUs must wait for the update
                 * to complete before servicing KVM_REQ_APICV_UPDATE.
                 */
-               WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu));
+               WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+                            (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
 
                exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
                if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
@@ -10653,8 +10872,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                r = cui(vcpu);
                if (r <= 0)
                        goto out;
-       } else
-               WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
+       } else {
+               WARN_ON_ONCE(vcpu->arch.pio.count);
+               WARN_ON_ONCE(vcpu->mmio_needed);
+       }
 
        if (kvm_run->immediate_exit) {
                r = -EINTR;
@@ -11182,7 +11403,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
        gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        tr->physical_address = gpa;
-       tr->valid = gpa != UNMAPPED_GVA;
+       tr->valid = gpa != INVALID_GPA;
        tr->writeable = 1;
        tr->usermode = 0;
 
@@ -11275,11 +11496,17 @@ static int sync_regs(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
 {
-       if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+       if (kvm_check_tsc_unstable() && kvm->created_vcpus)
                pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
                             "guest TSC will not be reliable\n");
 
-       return 0;
+       if (!kvm->arch.max_vcpu_ids)
+               kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
+
+       if (id >= kvm->arch.max_vcpu_ids)
+               return -EINVAL;
+
+       return static_call(kvm_x86_vcpu_precreate)(kvm);
 }
 
 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
@@ -11316,7 +11543,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
                 * will ensure the vCPU gets the correct state before VM-Entry.
                 */
                if (enable_apicv) {
-                       vcpu->arch.apicv_active = true;
+                       vcpu->arch.apic->apicv_active = true;
                        kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
                }
        } else
@@ -11329,9 +11556,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
                goto fail_free_lapic;
        vcpu->arch.pio_data = page_address(page);
 
-       vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+       vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
                                       GFP_KERNEL_ACCOUNT);
-       if (!vcpu->arch.mce_banks)
+       vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
+                                           GFP_KERNEL_ACCOUNT);
+       if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
                goto fail_free_pio_data;
        vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
@@ -11385,6 +11614,7 @@ free_wbinvd_dirty_mask:
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 fail_free_mce_banks:
        kfree(vcpu->arch.mce_banks);
+       kfree(vcpu->arch.mci_ctl2_banks);
 fail_free_pio_data:
        free_page((unsigned long)vcpu->arch.pio_data);
 fail_free_lapic:
@@ -11430,6 +11660,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_hv_vcpu_uninit(vcpu);
        kvm_pmu_destroy(vcpu);
        kfree(vcpu->arch.mce_banks);
+       kfree(vcpu->arch.mci_ctl2_banks);
        kvm_free_lapic(vcpu);
        idx = srcu_read_lock(&vcpu->kvm->srcu);
        kvm_mmu_destroy(vcpu);
@@ -11509,6 +11740,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                vcpu->arch.smbase = 0x30000;
 
                vcpu->arch.msr_misc_features_enables = 0;
+               vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+                                                 MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
 
                __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
                __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
@@ -11525,7 +11758,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
         * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
         * on RESET.  But, go through the motions in case that's ever remedied.
         */
-       cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
+       cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1);
        kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
 
        static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
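The reset path above now starts IA32_MISC_ENABLE with the "PEBS unavailable" and "BTS unavailable" bits set, so a guest sees both features as absent unless the vPMU later exposes them. A guest-side probe sketch, assuming the architectural bit positions (BTS unavailable = bit 11, PEBS unavailable = bit 12):

	#include <stdint.h>

	#define MISC_ENABLE_BTS_UNAVAIL  (1ull << 11)
	#define MISC_ENABLE_PEBS_UNAVAIL (1ull << 12)

	/* 'misc_enable' is the value read from the IA32_MISC_ENABLE MSR. */
	int pebs_available(uint64_t misc_enable)
	{
		return !(misc_enable & MISC_ENABLE_PEBS_UNAVAIL);
	}

	int bts_available(uint64_t misc_enable)
	{
		return !(misc_enable & MISC_ENABLE_BTS_UNAVAIL);
	}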
@@ -11716,6 +11949,8 @@ int kvm_arch_hardware_setup(void *opaque)
        if (boot_cpu_has(X86_FEATURE_XSAVES))
                rdmsrl(MSR_IA32_XSS, host_xss);
 
+       kvm_init_pmu_capability();
+
        r = ops->hardware_setup();
        if (r != 0)
                return r;
@@ -11725,13 +11960,13 @@ int kvm_arch_hardware_setup(void *opaque)
        kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
 
        if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
-               supported_xss = 0;
+               kvm_caps.supported_xss = 0;
 
 #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
        cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
 #undef __kvm_cpu_cap_has
 
-       if (kvm_has_tsc_control) {
+       if (kvm_caps.has_tsc_control) {
                /*
                 * Make sure the user can only configure tsc_khz values that
                 * fit into a signed integer.
@@ -11739,10 +11974,10 @@ int kvm_arch_hardware_setup(void *opaque)
                 * be 1 on all machines.
                 */
                u64 max = min(0x7fffffffULL,
-                             __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
-               kvm_max_guest_tsc_khz = max;
+                             __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+               kvm_caps.max_guest_tsc_khz = max;
        }
-       kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
+       kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
        kvm_init_msr_list();
        return 0;
 }
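The hardware-setup hunk treats the scaling ratio as a fixed-point number: 1ULL << tsc_scaling_ratio_frac_bits is a 1:1 ratio, and the advertised maximum guest frequency is the host frequency scaled by the maximum ratio, clamped so it still fits a signed integer (hence 0x7fffffffULL). A self-contained sketch of that arithmetic, assuming 48 fractional bits as on VMX and using the compiler's 128-bit integer extension for the wide multiply:

	#include <stdint.h>
	#include <stdio.h>

	/* 'ratio' has 'frac_bits' fractional bits, so 1ull << frac_bits
	 * means "run the guest TSC at host speed". */
	static uint64_t scale_tsc(uint64_t value, uint64_t ratio, unsigned frac_bits)
	{
		return (uint64_t)(((unsigned __int128)value * ratio) >> frac_bits);
	}

	int main(void)
	{
		unsigned frac_bits = 48;          /* assumption: VMX-style ratio */
		uint64_t host_khz  = 2995200;     /* hypothetical host TSC, kHz */
		uint64_t guest_khz = 2000000;     /* requested guest TSC, kHz */

		/* Ratio that slows the guest TSC from host_khz to guest_khz. */
		uint64_t ratio = (uint64_t)(((unsigned __int128)guest_khz << frac_bits) / host_khz);

		printf("ratio = %#llx\n", (unsigned long long)ratio);
		printf("guest sees ~%llu kHz\n",
		       (unsigned long long)scale_tsc(host_khz, ratio, frac_bits));
		return 0;
	}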
@@ -12330,7 +12565,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 
 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 {
-       if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
+       if (kvm_vcpu_apicv_active(vcpu) &&
+           static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
                return true;
 
        return false;
@@ -12772,7 +13008,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c
                (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
 
        if (!(error_code & PFERR_PRESENT_MASK) ||
-           mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) {
+           mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
                /*
                 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
                 * tables probably do not match the TLB.  Just proceed
@@ -12997,6 +13233,12 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
 }
 EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
 
+static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
+{
+       vcpu->arch.sev_pio_count -= count;
+       vcpu->arch.sev_pio_data += count * size;
+}
+
 static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
                           unsigned int port);
 
@@ -13020,8 +13262,7 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
                int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
 
                /* memcpy done already by emulator_pio_out.  */
-               vcpu->arch.sev_pio_count -= count;
-               vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+               advance_sev_es_emulated_pio(vcpu, count, size);
                if (!ret)
                        break;
 
@@ -13037,20 +13278,14 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
 static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
                          unsigned int port);
 
-static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
-{
-       unsigned count = vcpu->arch.pio.count;
-       complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
-       vcpu->arch.sev_pio_count -= count;
-       vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
-}
-
 static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
 {
+       unsigned count = vcpu->arch.pio.count;
        int size = vcpu->arch.pio.size;
        int port = vcpu->arch.pio.port;
 
-       advance_sev_es_emulated_ins(vcpu);
+       complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
+       advance_sev_es_emulated_pio(vcpu, count, size);
        if (vcpu->arch.sev_pio_count)
                return kvm_sev_es_ins(vcpu, size, port);
        return 1;
@@ -13062,11 +13297,11 @@ static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
        for (;;) {
                unsigned int count =
                        min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
-               if (!__emulator_pio_in(vcpu, size, port, count))
+               if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
                        break;
 
                /* Emulation done by the kernel.  */
-               advance_sev_es_emulated_ins(vcpu);
+               advance_sev_es_emulated_pio(vcpu, count, size);
                if (!vcpu->arch.sev_pio_count)
                        return 1;
        }
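Both the OUTS and INS paths now share advance_sev_es_emulated_pio(), which consumes 'count' items of 'size' bytes per pass; each pass is capped at PAGE_SIZE / size items because the kernel-side bounce buffer (pio_data, allocated earlier as a single page) holds at most one page. A minimal sketch of the cursor arithmetic, with a hypothetical struct standing in for the sev_pio_data / sev_pio_count fields:

	#include <stddef.h>
	#include <stdint.h>

	#define PAGE_SIZE 4096u

	/* Hypothetical stand-in for the per-vCPU SEV-ES PIO cursor. */
	struct pio_cursor {
		uint8_t      *data;   /* next byte to transfer */
		unsigned int  count;  /* items still outstanding */
	};

	/* How many items the next pass may move: at most one page's worth. */
	static unsigned int next_chunk(const struct pio_cursor *c, unsigned int size)
	{
		unsigned int max = PAGE_SIZE / size;

		return c->count < max ? c->count : max;
	}

	/* Mirrors advance_sev_es_emulated_pio(): consume 'count' items of 'size' bytes. */
	static void advance(struct pio_cursor *c, unsigned int count, unsigned int size)
	{
		c->count -= count;
		c->data  += (size_t)count * size;
	}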
@@ -13109,6 +13344,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);