KVM: stats: Separate generic stats from architecture specific ones
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9b6bca6..7120233 100644
@@ -58,6 +58,7 @@
 #include <linux/sched/isolation.h>
 #include <linux/mem_encrypt.h>
 #include <linux/entry-kvm.h>
+#include <linux/suspend.h>
 
 #include <trace/events/kvm.h>
 
@@ -102,6 +103,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 
 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
 
+#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
+
 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
 
@@ -113,6 +116,9 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 static void store_regs(struct kvm_vcpu *vcpu);
 static int sync_regs(struct kvm_vcpu *vcpu);
 
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+
 struct kvm_x86_ops kvm_x86_ops __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
@@ -209,6 +215,9 @@ EXPORT_SYMBOL_GPL(host_efer);
 bool __read_mostly allow_smaller_maxphyaddr = 0;
 EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
 
+bool __read_mostly enable_apicv = true;
+EXPORT_SYMBOL_GPL(enable_apicv);
+
 u64 __read_mostly host_xss;
 EXPORT_SYMBOL_GPL(host_xss);
 u64 __read_mostly supported_xss;
@@ -226,10 +235,10 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("irq_window", irq_window_exits),
        VCPU_STAT("nmi_window", nmi_window_exits),
        VCPU_STAT("halt_exits", halt_exits),
-       VCPU_STAT("halt_successful_poll", halt_successful_poll),
-       VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
-       VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
-       VCPU_STAT("halt_wakeup", halt_wakeup),
+       VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
+       VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
+       VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
+       VCPU_STAT("halt_wakeup", generic.halt_wakeup),
        VCPU_STAT("hypercalls", hypercalls),
        VCPU_STAT("request_irq", request_irq_exits),
        VCPU_STAT("irq_exits", irq_exits),
@@ -241,11 +250,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("nmi_injections", nmi_injections),
        VCPU_STAT("req_event", req_event),
        VCPU_STAT("l1d_flush", l1d_flush),
-       VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
-       VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+       VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
+       VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
        VCPU_STAT("nested_run", nested_run),
        VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
        VCPU_STAT("directed_yield_successful", directed_yield_successful),
+       VCPU_STAT("guest_mode", guest_mode),
        VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
        VM_STAT("mmu_pte_write", mmu_pte_write),
        VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
@@ -253,7 +263,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VM_STAT("mmu_recycled", mmu_recycled),
        VM_STAT("mmu_cache_miss", mmu_cache_miss),
        VM_STAT("mmu_unsync", mmu_unsync),
-       VM_STAT("remote_tlb_flush", remote_tlb_flush),
+       VM_STAT("remote_tlb_flush", generic.remote_tlb_flush),
        VM_STAT("largepages", lpages, .mode = 0444),
        VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
        VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
@@ -778,13 +788,6 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
 
-static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
-                              void *data, int offset, int len, u32 access)
-{
-       return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
-                                      data, offset, len, access);
-}
-
 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@ -819,6 +822,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 
        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
        kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+       vcpu->arch.pdptrs_from_userspace = false;
 
 out:
 
@@ -826,30 +830,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(load_pdptrs);
 
-bool pdptrs_changed(struct kvm_vcpu *vcpu)
-{
-       u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
-       int offset;
-       gfn_t gfn;
-       int r;
-
-       if (!is_pae_paging(vcpu))
-               return false;
-
-       if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
-               return true;
-
-       gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
-       offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
-       r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
-                                      PFERR_USER_MASK | PFERR_WRITE_MASK);
-       if (r < 0)
-               return true;
-
-       return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
-}
-EXPORT_SYMBOL_GPL(pdptrs_changed);
-
 void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
 {
        unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
@@ -1084,25 +1064,46 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
+static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
+{
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       unsigned long roots_to_free = 0;
+       int i;
+
+       /*
+        * If neither the current CR3 nor any of the prev_roots use the given
+        * PCID, then nothing needs to be done here because a resync will
+        * happen anyway before switching to any other CR3.
+        */
+       if (kvm_get_active_pcid(vcpu) == pcid) {
+               kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+       }
+
+       for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+               if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
+                       roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+       kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+}
+
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        bool skip_tlb_flush = false;
+       unsigned long pcid = 0;
 #ifdef CONFIG_X86_64
        bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
 
        if (pcid_enabled) {
                skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
                cr3 &= ~X86_CR3_PCID_NOFLUSH;
+               pcid = cr3 & X86_CR3_PCID_MASK;
        }
 #endif
 
-       if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
-               if (!skip_tlb_flush) {
-                       kvm_mmu_sync_roots(vcpu);
-                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-               }
-               return 0;
-       }
+       /* PDPTRs are always reloaded for PAE paging. */
+       if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
+               goto handle_tlb_flush;
 
        /*
         * Do not condition the GPA check on long mode, this helper is used to
@@ -1115,10 +1116,23 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
                return 1;
 
-       kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
+       if (cr3 != kvm_read_cr3(vcpu))
+               kvm_mmu_new_pgd(vcpu, cr3);
+
        vcpu->arch.cr3 = cr3;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
+handle_tlb_flush:
+       /*
+        * A load of CR3 that flushes the TLB flushes only the current PCID,
+        * even if PCID is disabled, in which case PCID=0 is flushed.  It's a
+        * moot point in the end because _disabling_ PCID will flush all PCIDs,
+        * and it's impossible to use a non-zero PCID when PCID is disabled,
+        * i.e. only PCID=0 can be relevant.
+        */
+       if (!skip_tlb_flush)
+               kvm_invalidate_pcid(vcpu, pcid);
+
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
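
For reference, the new PCID handling in kvm_set_cr3() and kvm_invalidate_pcid() relies on the architectural CR3 layout when CR4.PCIDE=1: bits 11:0 hold the PCID and bit 63 is the "no flush" hint. A small user-space sketch of that split (constants written out here instead of taken from kernel headers):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CR3_PCID_MASK      0xfffull        /* bits 11:0: the PCID */
#define CR3_PCID_NOFLUSH   (1ull << 63)    /* bit 63: skip the TLB flush */

/* Mirrors the split performed in kvm_set_cr3() when CR4.PCIDE is set. */
static void split_cr3(uint64_t cr3, uint64_t *base, unsigned *pcid, bool *noflush)
{
	*noflush = cr3 & CR3_PCID_NOFLUSH;
	cr3 &= ~CR3_PCID_NOFLUSH;
	*pcid = cr3 & CR3_PCID_MASK;
	*base = cr3 & ~CR3_PCID_MASK;
}

int main(void)
{
	uint64_t base;
	unsigned pcid;
	bool noflush;

	split_cr3((1ull << 63) | 0x12345000ull | 0x00f, &base, &pcid, &noflush);
	printf("base=%#llx pcid=%u noflush=%d\n",
	       (unsigned long long)base, pcid, noflush);
	return 0;
}

When the no-flush bit is clear, only the named PCID has to be invalidated, which is exactly what kvm_invalidate_pcid() does for the active root and any cached prev_roots.
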
@@ -2179,13 +2193,15 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
        return v;
 }
 
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
+
 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
        u64 ratio;
 
        /* Guest TSC same frequency as host TSC? */
        if (!scale) {
-               vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
                return 0;
        }
 
@@ -2211,7 +2227,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
                return -1;
        }
 
-       vcpu->arch.tsc_scaling_ratio = ratio;
+       kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
        return 0;
 }
 
@@ -2223,7 +2239,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
        /* tsc_khz can be zero if TSC calibration fails */
        if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
-               vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
                return -1;
        }
 
@@ -2305,10 +2321,9 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc)
        return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
 }
 
-u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio)
 {
        u64 _tsc = tsc;
-       u64 ratio = vcpu->arch.tsc_scaling_ratio;
 
        if (ratio != kvm_default_tsc_scaling_ratio)
                _tsc = __scale_tsc(ratio, tsc);
@@ -2317,25 +2332,86 @@ u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
 }
 EXPORT_SYMBOL_GPL(kvm_scale_tsc);
 
-static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 {
        u64 tsc;
 
-       tsc = kvm_scale_tsc(vcpu, rdtsc());
+       tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
 
        return target_tsc - tsc;
 }
 
 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-       return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+       return vcpu->arch.l1_tsc_offset +
+               kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
-static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
 {
-       vcpu->arch.l1_tsc_offset = offset;
-       vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
+       u64 nested_offset;
+
+       if (l2_multiplier == kvm_default_tsc_scaling_ratio)
+               nested_offset = l1_offset;
+       else
+               nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
+                                               kvm_tsc_scaling_ratio_frac_bits);
+
+       nested_offset += l2_offset;
+       return nested_offset;
+}
+EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
+
+u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
+{
+       if (l2_multiplier != kvm_default_tsc_scaling_ratio)
+               return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
+                                      kvm_tsc_scaling_ratio_frac_bits);
+
+       return l1_multiplier;
+}
+EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
+
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
+{
+       trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+                                  vcpu->arch.l1_tsc_offset,
+                                  l1_offset);
+
+       vcpu->arch.l1_tsc_offset = l1_offset;
+
+       /*
+        * If we are here because L1 chose not to trap WRMSR to TSC then
+        * according to the spec this should set L1's TSC (as opposed to
+        * setting L1's offset for L2).
+        */
+       if (is_guest_mode(vcpu))
+               vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+                       l1_offset,
+                       static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
+                       static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+       else
+               vcpu->arch.tsc_offset = l1_offset;
+
+       static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
+}
+
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
+{
+       vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
+
+       /* Userspace is changing the multiplier while L2 is active */
+       if (is_guest_mode(vcpu))
+               vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
+                       l1_multiplier,
+                       static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+       else
+               vcpu->arch.tsc_scaling_ratio = l1_multiplier;
+
+       if (kvm_has_tsc_control)
+               static_call(kvm_x86_write_tsc_multiplier)(
+                       vcpu, vcpu->arch.tsc_scaling_ratio);
 }
 
 static inline bool kvm_check_tsc_unstable(void)
@@ -2361,7 +2437,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        bool synchronizing = false;
 
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
-       offset = kvm_compute_tsc_offset(vcpu, data);
+       offset = kvm_compute_l1_tsc_offset(vcpu, data);
        ns = get_kvmclock_base_ns();
        elapsed = ns - kvm->arch.last_tsc_nsec;
 
@@ -2400,7 +2476,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
                } else {
                        u64 delta = nsec_to_cycles(vcpu, elapsed);
                        data += delta;
-                       offset = kvm_compute_tsc_offset(vcpu, data);
+                       offset = kvm_compute_l1_tsc_offset(vcpu, data);
                }
                matched = true;
                already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
@@ -2459,9 +2535,10 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
 
 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
 {
-       if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+       if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
                WARN_ON(adjustment < 0);
-       adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
+       adjustment = kvm_scale_tsc(vcpu, (u64) adjustment,
+                                  vcpu->arch.l1_tsc_scaling_ratio);
        adjust_tsc_offset_guest(vcpu, adjustment);
 }
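
The nested-TSC helpers added above combine L1's and L2's offsets and multipliers with fixed-point arithmetic: the multiplier is a binary fraction with kvm_tsc_scaling_ratio_frac_bits fractional bits, so the combined values are offset02 = (offset01 * mult12 >> frac) + offset12 and mult02 = mult01 * mult12 >> frac. A compact user-space sketch of that math (48 fractional bits chosen purely for illustration, __int128 standing in for the kernel's mul_u64_u64_shr()/mul_s64_u64_shr() helpers):

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS 48	/* illustrative only; KVM takes this from the vendor module */

static uint64_t mul_u64_u64_shr(uint64_t a, uint64_t b, unsigned int shift)
{
	return (uint64_t)(((unsigned __int128)a * b) >> shift);
}

static int64_t mul_s64_u64_shr(int64_t a, uint64_t b, unsigned int shift)
{
	/* assumes arithmetic right shift of signed __int128 (GCC/Clang) */
	return (int64_t)(((__int128)a * (__int128)b) >> shift);
}

/* Mirrors kvm_calc_nested_tsc_offset(): rescale L1's offset by L2's
 * multiplier, then add L2's own offset on top. */
static uint64_t nested_tsc_offset(uint64_t l1_offset, uint64_t l2_offset,
				  uint64_t l2_multiplier)
{
	return (uint64_t)mul_s64_u64_shr((int64_t)l1_offset, l2_multiplier,
					 FRAC_BITS) + l2_offset;
}

/* Mirrors kvm_calc_nested_tsc_multiplier(): a fixed-point product. */
static uint64_t nested_tsc_multiplier(uint64_t l1_mult, uint64_t l2_mult)
{
	return mul_u64_u64_shr(l1_mult, l2_mult, FRAC_BITS);
}

int main(void)
{
	uint64_t one = 1ull << FRAC_BITS;	/* ratio 1.0 */
	uint64_t half = one / 2;		/* L1 scales L2 to half speed */

	printf("combined multiplier = %#llx (i.e. 0.5)\n",
	       (unsigned long long)nested_tsc_multiplier(one, half));
	printf("combined offset     = %lld\n",
	       (long long)nested_tsc_offset((uint64_t)-1000, 500, half));
	return 0;
}
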
 
@@ -2844,7 +2921,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        /* With all the info we got, fill in the values */
 
        if (kvm_has_tsc_control)
-               tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+               tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz,
+                                           v->arch.l1_tsc_scaling_ratio);
 
        if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
                kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
@@ -3072,6 +3150,19 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
 static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.tlb_flush;
+
+       if (!tdp_enabled) {
+               /*
+                * A TLB flush on behalf of the guest is equivalent to
+                * INVPCID(all), toggling CR4.PGE, etc., which requires
+                * a forced sync of the shadow page tables.  Unload the
+                * entire MMU here and the subsequent load will sync the
+                * shadow page tables, and also flush the TLB.
+                */
+               kvm_mmu_unload(vcpu);
+               return;
+       }
+
        static_call(kvm_x86_tlb_flush_guest)(vcpu);
 }
 
@@ -3101,10 +3192,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
         * expensive IPIs.
         */
        if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+               u8 st_preempted = xchg(&st->preempted, 0);
+
                trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
-                                      st->preempted & KVM_VCPU_FLUSH_TLB);
-               if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+                                      st_preempted & KVM_VCPU_FLUSH_TLB);
+               if (st_preempted & KVM_VCPU_FLUSH_TLB)
                        kvm_vcpu_flush_tlb_guest(vcpu);
+       } else {
+               st->preempted = 0;
        }
 
        vcpu->arch.st.preempted = 0;
@@ -3233,7 +3328,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (msr_info->host_initiated) {
                        kvm_synchronize_tsc(vcpu, data);
                } else {
-                       u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+                       u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
                        adjust_tsc_offset_guest(vcpu, adj);
                        vcpu->arch.ia32_tsc_adjust_msr += adj;
                }
@@ -3468,7 +3563,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_LASTBRANCHTOIP:
        case MSR_IA32_LASTINTFROMIP:
        case MSR_IA32_LASTINTTOIP:
-       case MSR_K8_SYSCFG:
+       case MSR_AMD64_SYSCFG:
        case MSR_K8_TSEG_ADDR:
        case MSR_K8_TSEG_MASK:
        case MSR_VM_HSAVE_PA:
@@ -3535,10 +3630,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 * return L1's TSC value to ensure backwards-compatible
                 * behavior for migration.
                 */
-               u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
-                                                           vcpu->arch.tsc_offset;
+               u64 offset, ratio;
 
-               msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
+               if (msr_info->host_initiated) {
+                       offset = vcpu->arch.l1_tsc_offset;
+                       ratio = vcpu->arch.l1_tsc_scaling_ratio;
+               } else {
+                       offset = vcpu->arch.tsc_offset;
+                       ratio = vcpu->arch.tsc_scaling_ratio;
+               }
+
+               msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset;
                break;
        }
        case MSR_MTRRcap:
@@ -3862,6 +3964,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_HYPERV_TLBFLUSH:
        case KVM_CAP_HYPERV_SEND_IPI:
        case KVM_CAP_HYPERV_CPUID:
+       case KVM_CAP_HYPERV_ENFORCE_CPUID:
        case KVM_CAP_SYS_HYPERV_CPUID:
        case KVM_CAP_PCI_SEGMENT:
        case KVM_CAP_DEBUGREGS:
@@ -3892,8 +3995,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_SGX_ATTRIBUTE:
 #endif
        case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+       case KVM_CAP_SREGS2:
                r = 1;
                break;
+       case KVM_CAP_EXIT_HYPERCALL:
+               r = KVM_EXIT_HYPERCALL_VALID_MASK;
+               break;
        case KVM_CAP_SET_GUEST_DEBUG2:
                return KVM_GUESTDBG_VALID_MASK;
 #ifdef CONFIG_KVM_XEN
@@ -4121,7 +4228,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                        mark_tsc_unstable("KVM discovered backwards TSC");
 
                if (kvm_check_tsc_unstable()) {
-                       u64 offset = kvm_compute_tsc_offset(vcpu,
+                       u64 offset = kvm_compute_l1_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);
                        kvm_vcpu_write_tsc_offset(vcpu, offset);
                        vcpu->arch.tsc_catchup = 1;
@@ -4440,7 +4547,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
        memset(&events->reserved, 0, sizeof(events->reserved));
 }
 
-static void kvm_smm_changed(struct kvm_vcpu *vcpu);
+static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
 
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                                              struct kvm_vcpu_events *events)
@@ -4500,13 +4607,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
-               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
-                       if (events->smi.smm)
-                               vcpu->arch.hflags |= HF_SMM_MASK;
-                       else
-                               vcpu->arch.hflags &= ~HF_SMM_MASK;
-                       kvm_smm_changed(vcpu);
-               }
+               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
+                       kvm_smm_changed(vcpu, events->smi.smm);
 
                vcpu->arch.smi_pending = events->smi.pending;
 
@@ -4790,6 +4892,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 
                return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
 
+       case KVM_CAP_HYPERV_ENFORCE_CPUID:
+               return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
+
        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
                vcpu->arch.pv_cpuid.enforce = cap->args[0];
                if (vcpu->arch.pv_cpuid.enforce)
@@ -4808,6 +4913,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        void __user *argp = (void __user *)arg;
        int r;
        union {
+               struct kvm_sregs2 *sregs2;
                struct kvm_lapic_state *lapic;
                struct kvm_xsave *xsave;
                struct kvm_xcrs *xcrs;
@@ -5180,6 +5286,28 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                break;
        }
 #endif
+       case KVM_GET_SREGS2: {
+               u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!u.sregs2)
+                       goto out;
+               __get_sregs2(vcpu, u.sregs2);
+               r = -EFAULT;
+               if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_SREGS2: {
+               u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
+               if (IS_ERR(u.sregs2)) {
+                       r = PTR_ERR(u.sregs2);
+                       u.sregs2 = NULL;
+                       goto out;
+               }
+               r = __set_sregs2(vcpu, u.sregs2);
+               break;
+       }
        default:
                r = -EINVAL;
        }
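
The two new vCPU ioctls handled above let userspace read and write the cached PDPTEs directly instead of having KVM re-derive them from guest memory. A hedged sketch of how a VMM might use them during migration (a code fragment only; it assumes a <linux/kvm.h> new enough to define struct kvm_sregs2 and the KVM_GET_SREGS2/KVM_SET_SREGS2 ioctls, and error handling is trimmed):

#include <linux/kvm.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

/* Round-trip the extended special registers of one vCPU. */
static int migrate_sregs2(int src_vcpu_fd, int dst_vcpu_fd)
{
	struct kvm_sregs2 sregs2;

	memset(&sregs2, 0, sizeof(sregs2));
	if (ioctl(src_vcpu_fd, KVM_GET_SREGS2, &sregs2) < 0)
		return -1;

	/* With PAE paging active, KVM reports the cached PDPTEs... */
	if (sregs2.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID)
		printf("pdptr0=%#llx\n", (unsigned long long)sregs2.pdptrs[0]);

	/* ...and setting them back marks them as coming from userspace,
	 * so the destination does not reload them from guest memory. */
	return ioctl(dst_vcpu_fd, KVM_SET_SREGS2, &sregs2);
}
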
@@ -5499,6 +5627,14 @@ split_irqchip_unlock:
                if (kvm_x86_ops.vm_copy_enc_context_from)
                        r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
                return r;
+       case KVM_CAP_EXIT_HYPERCALL:
+               if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
+                       r = -EINVAL;
+                       break;
+               }
+               kvm->arch.hypercall_exit_enabled = cap->args[0];
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
@@ -5613,6 +5749,41 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
        return 0;
 }
 
+#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+static int kvm_arch_suspend_notifier(struct kvm *kvm)
+{
+       struct kvm_vcpu *vcpu;
+       int i, ret = 0;
+
+       mutex_lock(&kvm->lock);
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (!vcpu->arch.pv_time_enabled)
+                       continue;
+
+               ret = kvm_set_guest_paused(vcpu);
+               if (ret) {
+                       kvm_err("Failed to pause guest VCPU%d: %d\n",
+                               vcpu->vcpu_id, ret);
+                       break;
+               }
+       }
+       mutex_unlock(&kvm->lock);
+
+       return ret ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
+{
+       switch (state) {
+       case PM_HIBERNATION_PREPARE:
+       case PM_SUSPEND_PREPARE:
+               return kvm_arch_suspend_notifier(kvm);
+       }
+
+       return NOTIFY_DONE;
+}
+#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
@@ -7087,20 +7258,22 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
        return emul_to_vcpu(ctxt)->arch.hflags;
 }
 
-static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
+static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
 {
-       emul_to_vcpu(ctxt)->arch.hflags = emul_flags;
+       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+       kvm_smm_changed(vcpu, false);
 }
 
-static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
+static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
                                  const char *smstate)
 {
-       return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate);
+       return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate);
 }
 
-static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
+static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
 {
-       kvm_smm_changed(emul_to_vcpu(ctxt));
+       kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
 }
 
 static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
@@ -7149,9 +7322,9 @@ static const struct x86_emulate_ops emulate_ops = {
        .guest_has_fxsr      = emulator_guest_has_fxsr,
        .set_nmi_mask        = emulator_set_nmi_mask,
        .get_hflags          = emulator_get_hflags,
-       .set_hflags          = emulator_set_hflags,
-       .pre_leave_smm       = emulator_pre_leave_smm,
-       .post_leave_smm      = emulator_post_leave_smm,
+       .exiting_smm         = emulator_exiting_smm,
+       .leave_smm           = emulator_leave_smm,
+       .triple_fault        = emulator_triple_fault,
        .set_xcr             = emulator_set_xcr,
 };
 
@@ -7226,6 +7399,11 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
        BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
        BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
 
+       ctxt->interruptibility = 0;
+       ctxt->have_exception = false;
+       ctxt->exception.vector = -1;
+       ctxt->perm_ok = false;
+
        init_decode_cache(ctxt);
        vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
 }
@@ -7407,11 +7585,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
 static int complete_emulated_pio(struct kvm_vcpu *vcpu);
 
-static void kvm_smm_changed(struct kvm_vcpu *vcpu)
+static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
 {
-       if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
-               /* This is a good place to trace that we are exiting SMM.  */
-               trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
+       trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
+
+       if (entering_smm) {
+               vcpu->arch.hflags |= HF_SMM_MASK;
+       } else {
+               vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
 
                /* Process a latched INIT or SMI, if any.  */
                kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -7561,14 +7742,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
            kvm_vcpu_check_breakpoint(vcpu, &r))
                return r;
 
-       ctxt->interruptibility = 0;
-       ctxt->have_exception = false;
-       ctxt->exception.vector = -1;
-       ctxt->perm_ok = false;
-
-       ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
-
-       r = x86_decode_insn(ctxt, insn, insn_len);
+       r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
 
        trace_kvm_emulate_insn_start(vcpu);
        ++vcpu->stat.insn_emulation;
@@ -8243,6 +8417,7 @@ void kvm_arch_exit(void)
        kvm_x86_ops.hardware_enable = NULL;
        kvm_mmu_module_exit();
        free_percpu(user_return_msrs);
+       kmem_cache_destroy(x86_emulator_cache);
        kmem_cache_destroy(x86_fpu_cache);
 #ifdef CONFIG_KVM_XEN
        static_key_deferred_flush(&kvm_xen_enabled);
@@ -8342,16 +8517,15 @@ bool kvm_apicv_activated(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_apicv_activated);
 
-void kvm_apicv_init(struct kvm *kvm, bool enable)
+static void kvm_apicv_init(struct kvm *kvm)
 {
-       if (enable)
+       if (enable_apicv)
                clear_bit(APICV_INHIBIT_REASON_DISABLE,
                          &kvm->arch.apicv_inhibit_reasons);
        else
                set_bit(APICV_INHIBIT_REASON_DISABLE,
                        &kvm->arch.apicv_inhibit_reasons);
 }
-EXPORT_SYMBOL_GPL(kvm_apicv_init);
 
 static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
 {
@@ -8360,6 +8534,9 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
 
        vcpu->stat.directed_yield_attempted++;
 
+       if (single_task_running())
+               goto no_yield;
+
        rcu_read_lock();
        map = rcu_dereference(vcpu->kvm->arch.apic_map);
 
@@ -8384,6 +8561,17 @@ no_yield:
        return;
 }
 
+static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
+{
+       u64 ret = vcpu->run->hypercall.ret;
+
+       if (!is_64_bit_mode(vcpu))
+               ret = (u32)ret;
+       kvm_rax_write(vcpu, ret);
+       ++vcpu->stat.hypercalls;
+       return kvm_skip_emulated_instruction(vcpu);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
        unsigned long nr, a0, a1, a2, a3, ret;
@@ -8449,6 +8637,28 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                kvm_sched_yield(vcpu, a0);
                ret = 0;
                break;
+       case KVM_HC_MAP_GPA_RANGE: {
+               u64 gpa = a0, npages = a1, attrs = a2;
+
+               ret = -KVM_ENOSYS;
+               if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
+                       break;
+
+               if (!PAGE_ALIGNED(gpa) || !npages ||
+                   gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
+                       ret = -KVM_EINVAL;
+                       break;
+               }
+
+               vcpu->run->exit_reason        = KVM_EXIT_HYPERCALL;
+               vcpu->run->hypercall.nr       = KVM_HC_MAP_GPA_RANGE;
+               vcpu->run->hypercall.args[0]  = gpa;
+               vcpu->run->hypercall.args[1]  = npages;
+               vcpu->run->hypercall.args[2]  = attrs;
+               vcpu->run->hypercall.longmode = op_64_bit;
+               vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+               return 0;
+       }
        default:
                ret = -KVM_ENOSYS;
                break;
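
KVM_HC_MAP_GPA_RANGE is the first hypercall that can be punted to userspace this way: the VMM opts in via KVM_CAP_EXIT_HYPERCALL with a bitmask of hypercall numbers, then completes each forwarded call by filling in run->hypercall.ret before the next KVM_RUN, at which point complete_hypercall_exit() copies it into the guest's RAX. A rough user-space sketch (VM and vCPU setup, capability checks and error handling omitted):

#include <linux/kvm.h>
#include <linux/kvm_para.h>
#include <string.h>
#include <sys/ioctl.h>

/* Ask KVM to exit to userspace for KVM_HC_MAP_GPA_RANGE (a VM-wide setting). */
static int enable_map_gpa_range_exit(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_EXIT_HYPERCALL;
	cap.args[0] = 1ull << KVM_HC_MAP_GPA_RANGE;
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

/* Called from the vCPU run loop when run->exit_reason == KVM_EXIT_HYPERCALL. */
static void handle_map_gpa_range(struct kvm_run *run)
{
	__u64 gpa = run->hypercall.args[0];
	__u64 npages = run->hypercall.args[1];
	__u64 attrs = run->hypercall.args[2];

	/* ... adjust the private/shared state of [gpa, gpa + npages * 4K) ... */
	(void)gpa; (void)npages; (void)attrs;

	/* The return value lands in the guest's RAX on the next KVM_RUN. */
	run->hypercall.ret = 0;
}
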
@@ -8532,9 +8742,6 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 
 int kvm_check_nested_events(struct kvm_vcpu *vcpu)
 {
-       if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
-               return -EIO;
-
        if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
                kvm_x86_ops.nested_ops->triple_fault(vcpu);
                return 1;
@@ -8550,7 +8757,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
        static_call(kvm_x86_queue_exception)(vcpu);
 }
 
-static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
+static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 {
        int r;
        bool can_inject = true;
@@ -8597,7 +8804,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        if (is_guest_mode(vcpu)) {
                r = kvm_check_nested_events(vcpu);
                if (r < 0)
-                       goto busy;
+                       goto out;
        }
 
        /* try to inject new event if pending */
@@ -8639,7 +8846,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        if (vcpu->arch.smi_pending) {
                r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
-                       goto busy;
+                       goto out;
                if (r) {
                        vcpu->arch.smi_pending = false;
                        ++vcpu->arch.smi_count;
@@ -8652,7 +8859,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        if (vcpu->arch.nmi_pending) {
                r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
-                       goto busy;
+                       goto out;
                if (r) {
                        --vcpu->arch.nmi_pending;
                        vcpu->arch.nmi_injected = true;
@@ -8667,7 +8874,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        if (kvm_cpu_has_injectable_intr(vcpu)) {
                r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
-                       goto busy;
+                       goto out;
                if (r) {
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
                        static_call(kvm_x86_set_irq)(vcpu);
@@ -8683,11 +8890,14 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
                *req_immediate_exit = true;
 
        WARN_ON(vcpu->arch.exception.pending);
-       return;
+       return 0;
 
-busy:
-       *req_immediate_exit = true;
-       return;
+out:
+       if (r == -EBUSY) {
+               *req_immediate_exit = true;
+               r = 0;
+       }
+       return r;
 }
 
 static void process_nmi(struct kvm_vcpu *vcpu)
@@ -8869,7 +9079,6 @@ static void enter_smm(struct kvm_vcpu *vcpu)
        char buf[512];
        u32 cr0;
 
-       trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
        memset(buf, 0, 512);
 #ifdef CONFIG_X86_64
        if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
@@ -8879,13 +9088,13 @@ static void enter_smm(struct kvm_vcpu *vcpu)
                enter_smm_save_state_32(vcpu, buf);
 
        /*
-        * Give pre_enter_smm() a chance to make ISA-specific changes to the
-        * vCPU state (e.g. leave guest mode) after we've saved the state into
-        * the SMM state-save area.
+        * Give enter_smm() a chance to make ISA-specific changes to the vCPU
+        * state (e.g. leave guest mode) after we've saved the state into the
+        * SMM state-save area.
         */
-       static_call(kvm_x86_pre_enter_smm)(vcpu, buf);
+       static_call(kvm_x86_enter_smm)(vcpu, buf);
 
-       vcpu->arch.hflags |= HF_SMM_MASK;
+       kvm_smm_changed(vcpu, true);
        kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
 
        if (static_call(kvm_x86_get_nmi_mask)(vcpu))
@@ -8974,6 +9183,15 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
        vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
        kvm_apic_update_apicv(vcpu);
        static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
+
+       /*
+        * When APICv gets disabled, we may still have injected interrupts
+        * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
+        * still active when the interrupt got accepted. Make sure
+        * inject_pending_event() is called to check for that.
+        */
+       if (!vcpu->arch.apicv_active)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
 
@@ -9149,7 +9367,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                }
                if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
                        kvm_vcpu_flush_tlb_current(vcpu);
-               if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+               if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
                        kvm_vcpu_flush_tlb_guest(vcpu);
 
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
@@ -9242,13 +9460,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
            kvm_xen_has_interrupt(vcpu)) {
                ++vcpu->stat.req_event;
-               kvm_apic_accept_events(vcpu);
+               r = kvm_apic_accept_events(vcpu);
+               if (r < 0) {
+                       r = 0;
+                       goto out;
+               }
                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
                        r = 1;
                        goto out;
                }
 
-               inject_pending_event(vcpu, &req_immediate_exit);
+               r = inject_pending_event(vcpu, &req_immediate_exit);
+               if (r < 0) {
+                       r = 0;
+                       goto out;
+               }
                if (req_int_win)
                        static_call(kvm_x86_enable_irq_window)(vcpu);
 
@@ -9450,7 +9676,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
                        return 1;
        }
 
-       kvm_apic_accept_events(vcpu);
+       if (kvm_apic_accept_events(vcpu) < 0)
+               return 0;
        switch(vcpu->arch.mp_state) {
        case KVM_MP_STATE_HALTED:
        case KVM_MP_STATE_AP_RESET_HOLD:
@@ -9496,7 +9723,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
                if (r <= 0)
                        break;
 
-               kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
+               kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
                if (kvm_cpu_has_pending_timer(vcpu))
                        kvm_inject_pending_timer_irqs(vcpu);
 
@@ -9674,7 +9901,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                        goto out;
                }
                kvm_vcpu_block(vcpu);
-               kvm_apic_accept_events(vcpu);
+               if (kvm_apic_accept_events(vcpu) < 0) {
+                       r = 0;
+                       goto out;
+               }
                kvm_clear_request(KVM_REQ_UNHALT, vcpu);
                r = -EAGAIN;
                if (signal_pending(current)) {
@@ -9823,7 +10053,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
 
-static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
        struct desc_ptr dt;
 
@@ -9856,14 +10086,36 @@ skip_protected_regs:
        sregs->cr8 = kvm_get_cr8(vcpu);
        sregs->efer = vcpu->arch.efer;
        sregs->apic_base = kvm_get_apic_base(vcpu);
+}
 
-       memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+       __get_sregs_common(vcpu, sregs);
+
+       if (vcpu->arch.guest_state_protected)
+               return;
 
        if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
                set_bit(vcpu->arch.interrupt.nr,
                        (unsigned long *)sregs->interrupt_bitmap);
 }
 
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+       int i;
+
+       __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
+
+       if (vcpu->arch.guest_state_protected)
+               return;
+
+       if (is_pae_paging(vcpu)) {
+               for (i = 0 ; i < 4 ; i++)
+                       sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
+               sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
+       }
+}
+
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
 {
@@ -9876,11 +10128,17 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
+       int r;
+
        vcpu_load(vcpu);
        if (kvm_mpx_supported())
                kvm_load_guest_fpu(vcpu);
 
-       kvm_apic_accept_events(vcpu);
+       r = kvm_apic_accept_events(vcpu);
+       if (r < 0)
+               goto out;
+       r = 0;
+
        if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
             vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
            vcpu->arch.pv.pv_unhalted)
@@ -9888,10 +10146,11 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
        else
                mp_state->mp_state = vcpu->arch.mp_state;
 
+out:
        if (kvm_mpx_supported())
                kvm_put_guest_fpu(vcpu);
        vcpu_put(vcpu);
-       return 0;
+       return r;
 }
 
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
@@ -9975,24 +10234,23 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        return kvm_is_valid_cr4(vcpu, sregs->cr4);
 }
 
-static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
+               int *mmu_reset_needed, bool update_pdptrs)
 {
        struct msr_data apic_base_msr;
-       int mmu_reset_needed = 0;
-       int pending_vec, max_bits, idx;
+       int idx;
        struct desc_ptr dt;
-       int ret = -EINVAL;
 
        if (!kvm_is_valid_sregs(vcpu, sregs))
-               goto out;
+               return -EINVAL;
 
        apic_base_msr.data = sregs->apic_base;
        apic_base_msr.host_initiated = true;
        if (kvm_set_apic_base(vcpu, &apic_base_msr))
-               goto out;
+               return -EINVAL;
 
        if (vcpu->arch.guest_state_protected)
-               goto skip_protected_regs;
+               return 0;
 
        dt.size = sregs->idt.limit;
        dt.address = sregs->idt.base;
@@ -10002,31 +10260,30 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        static_call(kvm_x86_set_gdt)(vcpu, &dt);
 
        vcpu->arch.cr2 = sregs->cr2;
-       mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+       *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
        vcpu->arch.cr3 = sregs->cr3;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
        kvm_set_cr8(vcpu, sregs->cr8);
 
-       mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+       *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
        static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
 
-       mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+       *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
        static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
        vcpu->arch.cr0 = sregs->cr0;
 
-       mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+       *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
        static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
 
-       idx = srcu_read_lock(&vcpu->kvm->srcu);
-       if (is_pae_paging(vcpu)) {
-               load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
-               mmu_reset_needed = 1;
+       if (update_pdptrs) {
+               idx = srcu_read_lock(&vcpu->kvm->srcu);
+               if (is_pae_paging(vcpu)) {
+                       load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+                       *mmu_reset_needed = 1;
+               }
+               srcu_read_unlock(&vcpu->kvm->srcu, idx);
        }
-       srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
-       if (mmu_reset_needed)
-               kvm_mmu_reset_context(vcpu);
 
        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
        kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -10046,20 +10303,63 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
            !is_protmode(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
-skip_protected_regs:
+       return 0;
+}
+
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+       int pending_vec, max_bits;
+       int mmu_reset_needed = 0;
+       int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
+
+       if (ret)
+               return ret;
+
+       if (mmu_reset_needed)
+               kvm_mmu_reset_context(vcpu);
+
        max_bits = KVM_NR_INTERRUPTS;
        pending_vec = find_first_bit(
                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+
        if (pending_vec < max_bits) {
                kvm_queue_interrupt(vcpu, pending_vec, false);
                pr_debug("Set back pending irq %d\n", pending_vec);
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
+       return 0;
+}
 
-       kvm_make_request(KVM_REQ_EVENT, vcpu);
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+       int mmu_reset_needed = 0;
+       bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
+       bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
+               !(sregs2->efer & EFER_LMA);
+       int i, ret;
 
-       ret = 0;
-out:
-       return ret;
+       if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
+               return -EINVAL;
+
+       if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
+               return -EINVAL;
+
+       ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
+                                &mmu_reset_needed, !valid_pdptrs);
+       if (ret)
+               return ret;
+
+       if (valid_pdptrs) {
+               for (i = 0; i < 4 ; i++)
+                       kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
+
+               kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+               mmu_reset_needed = 1;
+               vcpu->arch.pdptrs_from_userspace = true;
+       }
+       if (mmu_reset_needed)
+               kvm_mmu_reset_context(vcpu);
+       return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
@@ -10115,8 +10415,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
        kvm_update_dr7(vcpu);
 
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-               vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
-                       get_segment_base(vcpu, VCPU_SREG_CS);
+               vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
 
        /*
         * Trigger an rflags update that will inject or remove the trace
@@ -10289,8 +10588,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        else
                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
 
-       kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
        r = kvm_mmu_create(vcpu);
        if (r < 0)
                return r;
@@ -10350,6 +10647,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        vcpu->arch.pending_external_vector = -1;
        vcpu->arch.preempted_in_kernel = false;
 
+#if IS_ENABLED(CONFIG_HYPERV)
+       vcpu->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
        r = static_call(kvm_x86_vcpu_create)(vcpu);
        if (r)
                goto free_guest_fpu;
@@ -10358,8 +10659,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
        kvm_vcpu_mtrr_init(vcpu);
        vcpu_load(vcpu);
+       kvm_set_tsc_khz(vcpu, max_tsc_khz);
        kvm_vcpu_reset(vcpu, false);
-       kvm_init_mmu(vcpu, false);
+       kvm_init_mmu(vcpu);
        vcpu_put(vcpu);
        return 0;
 
@@ -10618,6 +10920,9 @@ int kvm_arch_hardware_setup(void *opaque)
        int r;
 
        rdmsrl_safe(MSR_EFER, &host_efer);
+       if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) &&
+                        !(host_efer & EFER_NX)))
+               return -EIO;
 
        if (boot_cpu_has(X86_FEATURE_XSAVES))
                rdmsrl(MSR_IA32_XSS, host_xss);
@@ -10733,9 +11038,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        kvm->arch.guest_can_read_msr_platform_info = true;
 
+#if IS_ENABLED(CONFIG_HYPERV)
+       spin_lock_init(&kvm->arch.hv_root_tdp_lock);
+       kvm->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
 
+       kvm_apicv_init(kvm);
        kvm_hv_init_vm(kvm);
        kvm_page_track_init(kvm);
        kvm_mmu_init_vm(kvm);
@@ -10896,17 +11207,23 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kvm_hv_destroy_vm(kvm);
 }
 
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+static void memslot_rmap_free(struct kvm_memory_slot *slot)
 {
        int i;
 
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                kvfree(slot->arch.rmap[i]);
                slot->arch.rmap[i] = NULL;
+       }
+}
 
-               if (i == 0)
-                       continue;
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+       int i;
+
+       memslot_rmap_free(slot);
 
+       for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                kvfree(slot->arch.lpage_info[i - 1]);
                slot->arch.lpage_info[i - 1] = NULL;
        }
@@ -10914,11 +11231,79 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
        kvm_page_track_free_memslot(slot);
 }
 
-static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
-                                     unsigned long npages)
+static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
+                             unsigned long npages)
 {
+       const int sz = sizeof(*slot->arch.rmap[0]);
        int i;
 
+       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               int level = i + 1;
+               int lpages = gfn_to_index(slot->base_gfn + npages - 1,
+                                         slot->base_gfn, level) + 1;
+
+               WARN_ON(slot->arch.rmap[i]);
+
+               slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
+               if (!slot->arch.rmap[i]) {
+                       memslot_rmap_free(slot);
+                       return -ENOMEM;
+               }
+       }
+
+       return 0;
+}
+
+int alloc_all_memslots_rmaps(struct kvm *kvm)
+{
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *slot;
+       int r, i;
+
+       /*
+        * Check if memslots already have rmaps early before acquiring
+        * the slots_arch_lock below.
+        */
+       if (kvm_memslots_have_rmaps(kvm))
+               return 0;
+
+       mutex_lock(&kvm->slots_arch_lock);
+
+       /*
+        * Read memslots_have_rmaps again, under the slots arch lock,
+        * before allocating the rmaps
+        */
+       if (kvm_memslots_have_rmaps(kvm)) {
+               mutex_unlock(&kvm->slots_arch_lock);
+               return 0;
+       }
+
+       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+               slots = __kvm_memslots(kvm, i);
+               kvm_for_each_memslot(slot, slots) {
+                       r = memslot_rmap_alloc(slot, slot->npages);
+                       if (r) {
+                               mutex_unlock(&kvm->slots_arch_lock);
+                               return r;
+                       }
+               }
+       }
+
+       /*
+        * Ensure that memslots_have_rmaps becomes true strictly after
+        * all the rmap pointers are set.
+        */
+       smp_store_release(&kvm->arch.memslots_have_rmaps, true);
+       mutex_unlock(&kvm->slots_arch_lock);
+       return 0;
+}
+
+static int kvm_alloc_memslot_metadata(struct kvm *kvm,
+                                     struct kvm_memory_slot *slot,
+                                     unsigned long npages)
+{
+       int i, r;
+
        /*
         * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
         * old arrays will be freed by __kvm_set_memory_region() if installing
@@ -10926,7 +11311,13 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
         */
        memset(&slot->arch, 0, sizeof(slot->arch));
 
-       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+       if (kvm_memslots_have_rmaps(kvm)) {
+               r = memslot_rmap_alloc(slot, npages);
+               if (r)
+                       return r;
+       }
+
+       for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                struct kvm_lpage_info *linfo;
                unsigned long ugfn;
                int lpages;
@@ -10935,14 +11326,6 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
                lpages = gfn_to_index(slot->base_gfn + npages - 1,
                                      slot->base_gfn, level) + 1;
 
-               slot->arch.rmap[i] =
-                       kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
-                                GFP_KERNEL_ACCOUNT);
-               if (!slot->arch.rmap[i])
-                       goto out_free;
-               if (i == 0)
-                       continue;
-
                linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
                if (!linfo)
                        goto out_free;
@@ -10972,12 +11355,9 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
        return 0;
 
 out_free:
-       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-               kvfree(slot->arch.rmap[i]);
-               slot->arch.rmap[i] = NULL;
-               if (i == 0)
-                       continue;
+       memslot_rmap_free(slot);
 
+       for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                kvfree(slot->arch.lpage_info[i - 1]);
                slot->arch.lpage_info[i - 1] = NULL;
        }
@@ -11006,7 +11386,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                enum kvm_mr_change change)
 {
        if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
-               return kvm_alloc_memslot_metadata(memslot,
+               return kvm_alloc_memslot_metadata(kvm, memslot,
                                                  mem->memory_size >> PAGE_SHIFT);
        return 0;
 }
@@ -11082,36 +11462,19 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                 */
                kvm_mmu_zap_collapsible_sptes(kvm, new);
        } else {
-               /* By default, write-protect everything to log writes. */
-               int level = PG_LEVEL_4K;
+               /*
+                * Initially-all-set does not require write protecting any page,
+                * because they're all assumed to be dirty.
+                */
+               if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+                       return;
 
                if (kvm_x86_ops.cpu_dirty_log_size) {
-                       /*
-                        * Clear all dirty bits, unless pages are treated as
-                        * dirty from the get-go.
-                        */
-                       if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
-                               kvm_mmu_slot_leaf_clear_dirty(kvm, new);
-
-                       /*
-                        * Write-protect large pages on write so that dirty
-                        * logging happens at 4k granularity.  No need to
-                        * write-protect small SPTEs since write accesses are
-                        * logged by the CPU via dirty bits.
-                        */
-                       level = PG_LEVEL_2M;
-               } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
-                       /*
-                        * If we're with initial-all-set, we don't need
-                        * to write protect any small page because
-                        * they're reported as dirty already.  However
-                        * we still need to write-protect huge pages
-                        * so that the page split can happen lazily on
-                        * the first write to the huge page.
-                        */
-                       level = PG_LEVEL_2M;
+                       kvm_mmu_slot_leaf_clear_dirty(kvm, new);
+                       kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
+               } else {
+                       kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
                }
-               kvm_mmu_slot_remove_write_access(kvm, new, level);
        }
 }
 
@@ -11499,7 +11862,8 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
 
 void kvm_arch_start_assignment(struct kvm *kvm)
 {
-       atomic_inc(&kvm->arch.assigned_device_count);
+       if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
+               static_call_cond(kvm_x86_start_assignment)(kvm);
 }
 EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
 
@@ -11679,8 +12043,6 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 {
        bool pcid_enabled;
        struct x86_exception e;
-       unsigned i;
-       unsigned long roots_to_free = 0;
        struct {
                u64 pcid;
                u64 gla;
@@ -11714,23 +12076,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
                        return 1;
                }
 
-               if (kvm_get_active_pcid(vcpu) == operand.pcid) {
-                       kvm_mmu_sync_roots(vcpu);
-                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-               }
-
-               for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
-                           == operand.pcid)
-                               roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
-               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
-               /*
-                * If neither the current cr3 nor any of the prev_roots use the
-                * given PCID, then nothing needs to be done here because a
-                * resync will happen anyway before switching to any other CR3.
-                */
-
+               kvm_invalidate_pcid(vcpu, operand.pcid);
                return kvm_skip_emulated_instruction(vcpu);
 
        case INVPCID_TYPE_ALL_NON_GLOBAL:
@@ -11743,7 +12089,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 
                fallthrough;
        case INVPCID_TYPE_ALL_INCL_GLOBAL:
-               kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
                return kvm_skip_emulated_instruction(vcpu);
 
        default: