KVM: SVM: avoid refreshing avic if its state didn't change
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e0f4a46..bf8cb10 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -58,6 +58,7 @@
 #include <linux/sched/isolation.h>
 #include <linux/mem_encrypt.h>
 #include <linux/entry-kvm.h>
+#include <linux/suspend.h>
 
 #include <trace/events/kvm.h>
 
@@ -65,6 +66,7 @@
 #include <asm/msr.h>
 #include <asm/desc.h>
 #include <asm/mce.h>
+#include <asm/pkru.h>
 #include <linux/kernel_stat.h>
 #include <asm/fpu/internal.h> /* Ugh! */
 #include <asm/pvclock.h>
@@ -102,6 +104,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 
 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
 
+#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
+
 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
 
@@ -113,6 +117,9 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 static void store_regs(struct kvm_vcpu *vcpu);
 static int sync_regs(struct kvm_vcpu *vcpu);
 
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+
 struct kvm_x86_ops kvm_x86_ops __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
@@ -209,55 +216,79 @@ EXPORT_SYMBOL_GPL(host_efer);
 bool __read_mostly allow_smaller_maxphyaddr = 0;
 EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
 
+bool __read_mostly enable_apicv = true;
+EXPORT_SYMBOL_GPL(enable_apicv);
+
 u64 __read_mostly host_xss;
 EXPORT_SYMBOL_GPL(host_xss);
 u64 __read_mostly supported_xss;
 EXPORT_SYMBOL_GPL(supported_xss);
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
-       VCPU_STAT("pf_fixed", pf_fixed),
-       VCPU_STAT("pf_guest", pf_guest),
-       VCPU_STAT("tlb_flush", tlb_flush),
-       VCPU_STAT("invlpg", invlpg),
-       VCPU_STAT("exits", exits),
-       VCPU_STAT("io_exits", io_exits),
-       VCPU_STAT("mmio_exits", mmio_exits),
-       VCPU_STAT("signal_exits", signal_exits),
-       VCPU_STAT("irq_window", irq_window_exits),
-       VCPU_STAT("nmi_window", nmi_window_exits),
-       VCPU_STAT("halt_exits", halt_exits),
-       VCPU_STAT("halt_successful_poll", halt_successful_poll),
-       VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
-       VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
-       VCPU_STAT("halt_wakeup", halt_wakeup),
-       VCPU_STAT("hypercalls", hypercalls),
-       VCPU_STAT("request_irq", request_irq_exits),
-       VCPU_STAT("irq_exits", irq_exits),
-       VCPU_STAT("host_state_reload", host_state_reload),
-       VCPU_STAT("fpu_reload", fpu_reload),
-       VCPU_STAT("insn_emulation", insn_emulation),
-       VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
-       VCPU_STAT("irq_injections", irq_injections),
-       VCPU_STAT("nmi_injections", nmi_injections),
-       VCPU_STAT("req_event", req_event),
-       VCPU_STAT("l1d_flush", l1d_flush),
-       VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
-       VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
-       VCPU_STAT("nested_run", nested_run),
-       VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
-       VCPU_STAT("directed_yield_successful", directed_yield_successful),
-       VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
-       VM_STAT("mmu_pte_write", mmu_pte_write),
-       VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
-       VM_STAT("mmu_flooded", mmu_flooded),
-       VM_STAT("mmu_recycled", mmu_recycled),
-       VM_STAT("mmu_cache_miss", mmu_cache_miss),
-       VM_STAT("mmu_unsync", mmu_unsync),
-       VM_STAT("remote_tlb_flush", remote_tlb_flush),
-       VM_STAT("largepages", lpages, .mode = 0444),
-       VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
-       VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
-       { NULL }
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+       KVM_GENERIC_VM_STATS(),
+       STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
+       STATS_DESC_COUNTER(VM, mmu_pte_write),
+       STATS_DESC_COUNTER(VM, mmu_pde_zapped),
+       STATS_DESC_COUNTER(VM, mmu_flooded),
+       STATS_DESC_COUNTER(VM, mmu_recycled),
+       STATS_DESC_COUNTER(VM, mmu_cache_miss),
+       STATS_DESC_ICOUNTER(VM, mmu_unsync),
+       STATS_DESC_ICOUNTER(VM, lpages),
+       STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
+       STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
+       STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+               sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+       .name_size = KVM_STATS_NAME_SIZE,
+       .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+       .id_offset = sizeof(struct kvm_stats_header),
+       .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+       .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+                      sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+       KVM_GENERIC_VCPU_STATS(),
+       STATS_DESC_COUNTER(VCPU, pf_fixed),
+       STATS_DESC_COUNTER(VCPU, pf_guest),
+       STATS_DESC_COUNTER(VCPU, tlb_flush),
+       STATS_DESC_COUNTER(VCPU, invlpg),
+       STATS_DESC_COUNTER(VCPU, exits),
+       STATS_DESC_COUNTER(VCPU, io_exits),
+       STATS_DESC_COUNTER(VCPU, mmio_exits),
+       STATS_DESC_COUNTER(VCPU, signal_exits),
+       STATS_DESC_COUNTER(VCPU, irq_window_exits),
+       STATS_DESC_COUNTER(VCPU, nmi_window_exits),
+       STATS_DESC_COUNTER(VCPU, l1d_flush),
+       STATS_DESC_COUNTER(VCPU, halt_exits),
+       STATS_DESC_COUNTER(VCPU, request_irq_exits),
+       STATS_DESC_COUNTER(VCPU, irq_exits),
+       STATS_DESC_COUNTER(VCPU, host_state_reload),
+       STATS_DESC_COUNTER(VCPU, fpu_reload),
+       STATS_DESC_COUNTER(VCPU, insn_emulation),
+       STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
+       STATS_DESC_COUNTER(VCPU, hypercalls),
+       STATS_DESC_COUNTER(VCPU, irq_injections),
+       STATS_DESC_COUNTER(VCPU, nmi_injections),
+       STATS_DESC_COUNTER(VCPU, req_event),
+       STATS_DESC_COUNTER(VCPU, nested_run),
+       STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
+       STATS_DESC_COUNTER(VCPU, directed_yield_successful),
+       STATS_DESC_ICOUNTER(VCPU, guest_mode)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+               sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+       .name_size = KVM_STATS_NAME_SIZE,
+       .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+       .id_offset = sizeof(struct kvm_stats_header),
+       .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+       .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+                      sizeof(kvm_vcpu_stats_desc),
 };
 
 u64 __read_mostly host_xcr0;
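For reference, a minimal userspace sketch of how the offsets published in the two stats headers above are meant to be consumed. This is not part of the patch; it assumes a binary-stats file descriptor obtained with the KVM_GET_STATS_FD ioctl and the uapi definitions of struct kvm_stats_header and struct kvm_stats_desc from <linux/kvm.h>:

	#include <linux/kvm.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	/* Dump every statistic exposed by a VM or vCPU binary-stats fd. */
	static void dump_kvm_stats(int stats_fd)
	{
		struct kvm_stats_header hdr;
		struct kvm_stats_desc *desc;
		size_t desc_sz;
		__u64 val;
		__u32 i;

		if (pread(stats_fd, &hdr, sizeof(hdr), 0) != sizeof(hdr))
			return;

		/* Each descriptor is followed by a name of hdr.name_size bytes. */
		desc_sz = sizeof(*desc) + hdr.name_size;
		desc = malloc(desc_sz);
		if (!desc)
			return;

		for (i = 0; i < hdr.num_desc; i++) {
			pread(stats_fd, desc, desc_sz, hdr.desc_offset + i * desc_sz);
			/* Read only the first element of (possibly multi-element) stats. */
			pread(stats_fd, &val, sizeof(val), hdr.data_offset + desc->offset);
			printf("%s = %llu\n", desc->name, (unsigned long long)val);
		}
		free(desc);
	}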
@@ -455,7 +486,14 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 
-asmlinkage __visible noinstr void kvm_spurious_fault(void)
+/*
+ * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
+ *
+ * Hardware virtualization extension instructions may fault if a reboot turns
+ * off virtualization while processes are running.  Usually after catching the
+ * fault we just panic; during a reboot the instruction is instead ignored.
+ */
+noinstr void kvm_spurious_fault(void)
 {
        /* Fault while not rebooting.  We want the trace. */
        BUG_ON(!kvm_rebooting);
@@ -778,13 +816,6 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
 
-static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
-                              void *data, int offset, int len, u32 access)
-{
-       return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
-                                      data, offset, len, access);
-}
-
 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@ -819,6 +850,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 
        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
        kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+       vcpu->arch.pdptrs_from_userspace = false;
 
 out:
 
@@ -826,40 +858,14 @@ out:
 }
 EXPORT_SYMBOL_GPL(load_pdptrs);
 
-bool pdptrs_changed(struct kvm_vcpu *vcpu)
-{
-       u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
-       int offset;
-       gfn_t gfn;
-       int r;
-
-       if (!is_pae_paging(vcpu))
-               return false;
-
-       if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
-               return true;
-
-       gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
-       offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
-       r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
-                                      PFERR_USER_MASK | PFERR_WRITE_MASK);
-       if (r < 0)
-               return true;
-
-       return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
-}
-EXPORT_SYMBOL_GPL(pdptrs_changed);
-
 void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
 {
-       unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
-
        if ((cr0 ^ old_cr0) & X86_CR0_PG) {
                kvm_clear_async_pf_completion_queue(vcpu);
                kvm_async_pf_hash_reset(vcpu);
        }
 
-       if ((cr0 ^ old_cr0) & update_bits)
+       if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
                kvm_mmu_reset_context(vcpu);
 
        if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
@@ -942,7 +948,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
            (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
             (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
            vcpu->arch.pkru != vcpu->arch.host_pkru)
-               __write_pkru(vcpu->arch.pkru);
+               write_pkru(vcpu->arch.pkru);
 }
 EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
 
@@ -956,7 +962,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
             (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
                vcpu->arch.pkru = rdpkru();
                if (vcpu->arch.pkru != vcpu->arch.host_pkru)
-                       __write_pkru(vcpu->arch.host_pkru);
+                       write_pkru(vcpu->arch.host_pkru);
        }
 
        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
@@ -1038,10 +1044,7 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
 
 void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
 {
-       unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
-                                     X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
-
-       if (((cr4 ^ old_cr4) & mmu_role_bits) ||
+       if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) ||
            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
                kvm_mmu_reset_context(vcpu);
 }
@@ -1084,25 +1087,46 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
+static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
+{
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       unsigned long roots_to_free = 0;
+       int i;
+
+       /*
+        * If neither the current CR3 nor any of the prev_roots use the given
+        * PCID, then nothing needs to be done here because a resync will
+        * happen anyway before switching to any other CR3.
+        */
+       if (kvm_get_active_pcid(vcpu) == pcid) {
+               kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+       }
+
+       for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+               if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
+                       roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+       kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+}
+
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        bool skip_tlb_flush = false;
+       unsigned long pcid = 0;
 #ifdef CONFIG_X86_64
        bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
 
        if (pcid_enabled) {
                skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
                cr3 &= ~X86_CR3_PCID_NOFLUSH;
+               pcid = cr3 & X86_CR3_PCID_MASK;
        }
 #endif
 
-       if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
-               if (!skip_tlb_flush) {
-                       kvm_mmu_sync_roots(vcpu);
-                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-               }
-               return 0;
-       }
+       /* PDPTRs are always reloaded for PAE paging. */
+       if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
+               goto handle_tlb_flush;
 
        /*
         * Do not condition the GPA check on long mode, this helper is used to
@@ -1115,10 +1139,23 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
                return 1;
 
-       kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
+       if (cr3 != kvm_read_cr3(vcpu))
+               kvm_mmu_new_pgd(vcpu, cr3);
+
        vcpu->arch.cr3 = cr3;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
+handle_tlb_flush:
+       /*
+        * A load of CR3 that flushes the TLB flushes only the current PCID,
+        * even if PCID is disabled, in which case PCID=0 is flushed.  It's a
+        * moot point in the end because _disabling_ PCID will flush all PCIDs,
+        * and it's impossible to use a non-zero PCID when PCID is disabled,
+        * i.e. only PCID=0 can be relevant.
+        */
+       if (!skip_tlb_flush)
+               kvm_invalidate_pcid(vcpu, pcid);
+
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
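As a quick illustration (not new code in this patch), the decomposition kvm_set_cr3() now performs when CR4.PCIDE=1: bit 63 of the MOV-to-CR3 source only suppresses the TLB flush and is stripped before the value is treated as an address, while bits 11:0 name the PCID that kvm_invalidate_pcid() operates on. The constants mirror asm/processor-flags.h:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define X86_CR3_PCID_NOFLUSH	(1ULL << 63)	/* MOV to CR3: skip TLB flush */
	#define X86_CR3_PCID_MASK	0xfffULL	/* bits 11:0 select the PCID */

	int main(void)
	{
		uint64_t cr3 = 0x8000000012345001ULL;	/* hypothetical guest value */
		bool skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
		uint64_t pcid = cr3 & X86_CR3_PCID_MASK;

		cr3 &= ~X86_CR3_PCID_NOFLUSH;		/* stripped before the GPA check */
		printf("pcid=%#llx skip_flush=%d cr3=%#llx\n",
		       (unsigned long long)pcid, skip_tlb_flush,
		       (unsigned long long)cr3);
		return 0;
	}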
@@ -1151,7 +1188,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
        if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
                for (i = 0; i < KVM_NR_DB_REGS; i++)
                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
-               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
        }
 }
 
@@ -2179,13 +2215,15 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
        return v;
 }
 
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
+
 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
        u64 ratio;
 
        /* Guest TSC same frequency as host TSC? */
        if (!scale) {
-               vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
                return 0;
        }
 
@@ -2211,7 +2249,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
                return -1;
        }
 
-       vcpu->arch.tsc_scaling_ratio = ratio;
+       kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
        return 0;
 }
 
@@ -2223,7 +2261,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
        /* tsc_khz can be zero if TSC calibration fails */
        if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
-               vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
                return -1;
        }
 
@@ -2305,10 +2343,9 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc)
        return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
 }
 
-u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio)
 {
        u64 _tsc = tsc;
-       u64 ratio = vcpu->arch.tsc_scaling_ratio;
 
        if (ratio != kvm_default_tsc_scaling_ratio)
                _tsc = __scale_tsc(ratio, tsc);
@@ -2317,25 +2354,86 @@ u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
 }
 EXPORT_SYMBOL_GPL(kvm_scale_tsc);
 
-static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 {
        u64 tsc;
 
-       tsc = kvm_scale_tsc(vcpu, rdtsc());
+       tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
 
        return target_tsc - tsc;
 }
 
 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-       return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+       return vcpu->arch.l1_tsc_offset +
+               kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
-static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
+{
+       u64 nested_offset;
+
+       if (l2_multiplier == kvm_default_tsc_scaling_ratio)
+               nested_offset = l1_offset;
+       else
+               nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
+                                               kvm_tsc_scaling_ratio_frac_bits);
+
+       nested_offset += l2_offset;
+       return nested_offset;
+}
+EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
+
+u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
 {
-       vcpu->arch.l1_tsc_offset = offset;
-       vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
+       if (l2_multiplier != kvm_default_tsc_scaling_ratio)
+               return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
+                                      kvm_tsc_scaling_ratio_frac_bits);
+
+       return l1_multiplier;
+}
+EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
+
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
+{
+       trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+                                  vcpu->arch.l1_tsc_offset,
+                                  l1_offset);
+
+       vcpu->arch.l1_tsc_offset = l1_offset;
+
+       /*
+        * If we are here because L1 chose not to trap WRMSR to TSC then
+        * according to the spec this should set L1's TSC (as opposed to
+        * setting L1's offset for L2).
+        */
+       if (is_guest_mode(vcpu))
+               vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+                       l1_offset,
+                       static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
+                       static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+       else
+               vcpu->arch.tsc_offset = l1_offset;
+
+       static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
+}
+
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
+{
+       vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
+
+       /* Userspace is changing the multiplier while L2 is active */
+       if (is_guest_mode(vcpu))
+               vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
+                       l1_multiplier,
+                       static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+       else
+               vcpu->arch.tsc_scaling_ratio = l1_multiplier;
+
+       if (kvm_has_tsc_control)
+               static_call(kvm_x86_write_tsc_multiplier)(
+                       vcpu, vcpu->arch.tsc_scaling_ratio);
 }
 
 static inline bool kvm_check_tsc_unstable(void)
@@ -2361,7 +2459,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        bool synchronizing = false;
 
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
-       offset = kvm_compute_tsc_offset(vcpu, data);
+       offset = kvm_compute_l1_tsc_offset(vcpu, data);
        ns = get_kvmclock_base_ns();
        elapsed = ns - kvm->arch.last_tsc_nsec;
 
@@ -2400,7 +2498,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
                } else {
                        u64 delta = nsec_to_cycles(vcpu, elapsed);
                        data += delta;
-                       offset = kvm_compute_tsc_offset(vcpu, data);
+                       offset = kvm_compute_l1_tsc_offset(vcpu, data);
                }
                matched = true;
                already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
@@ -2459,9 +2557,10 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
 
 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
 {
-       if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+       if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
                WARN_ON(adjustment < 0);
-       adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
+       adjustment = kvm_scale_tsc(vcpu, (u64) adjustment,
+                                  vcpu->arch.l1_tsc_scaling_ratio);
        adjust_tsc_offset_guest(vcpu, adjustment);
 }
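A condensed sketch (not from this patch) of the fixed-point arithmetic the kvm_calc_nested_tsc_*() helpers above implement: with kvm_tsc_scaling_ratio_frac_bits fractional bits, guest TSC = (host TSC * ratio >> frac) + offset, so running L2 on top of L1 composes to the merged ratio and offset computed below:

	#include <stdint.h>

	/* (a * b) >> shift with a 128-bit intermediate, like mul_u64_u64_shr(). */
	static uint64_t mul_shr(uint64_t a, uint64_t b, unsigned int shift)
	{
		return (uint64_t)(((unsigned __int128)a * b) >> shift);
	}

	/* Merged multiplier: L2 scales what L1 already scaled. */
	static uint64_t nested_tsc_ratio(uint64_t l1_ratio, uint64_t l2_ratio,
					 unsigned int frac)
	{
		return mul_shr(l1_ratio, l2_ratio, frac);
	}

	/* Merged offset: L1's offset is itself scaled by L2's ratio. */
	static int64_t nested_tsc_offset(int64_t l1_offset, int64_t l2_offset,
					 uint64_t l2_ratio, unsigned int frac)
	{
		return (int64_t)(((__int128)l1_offset * l2_ratio) >> frac) + l2_offset;
	}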
 
@@ -2844,7 +2943,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        /* With all the info we got, fill in the values */
 
        if (kvm_has_tsc_control)
-               tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+               tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz,
+                                           v->arch.l1_tsc_scaling_ratio);
 
        if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
                kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
@@ -3250,7 +3350,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (msr_info->host_initiated) {
                        kvm_synchronize_tsc(vcpu, data);
                } else {
-                       u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+                       u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
                        adjust_tsc_offset_guest(vcpu, adj);
                        vcpu->arch.ia32_tsc_adjust_msr += adj;
                }
@@ -3314,7 +3414,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                break;
        case MSR_KVM_ASYNC_PF_ACK:
-               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
                        return 1;
                if (data & 0x1) {
                        vcpu->arch.apf.pageready_pending = false;
@@ -3552,10 +3652,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 * return L1's TSC value to ensure backwards-compatible
                 * behavior for migration.
                 */
-               u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
-                                                           vcpu->arch.tsc_offset;
+               u64 offset, ratio;
+
+               if (msr_info->host_initiated) {
+                       offset = vcpu->arch.l1_tsc_offset;
+                       ratio = vcpu->arch.l1_tsc_scaling_ratio;
+               } else {
+                       offset = vcpu->arch.tsc_offset;
+                       ratio = vcpu->arch.tsc_scaling_ratio;
+               }
 
-               msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
+               msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset;
                break;
        }
        case MSR_MTRRcap:
@@ -3646,7 +3753,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = vcpu->arch.apf.msr_int_val;
                break;
        case MSR_KVM_ASYNC_PF_ACK:
-               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
                        return 1;
 
                msr_info->data = 0;
@@ -3879,6 +3986,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_HYPERV_TLBFLUSH:
        case KVM_CAP_HYPERV_SEND_IPI:
        case KVM_CAP_HYPERV_CPUID:
+       case KVM_CAP_HYPERV_ENFORCE_CPUID:
        case KVM_CAP_SYS_HYPERV_CPUID:
        case KVM_CAP_PCI_SEGMENT:
        case KVM_CAP_DEBUGREGS:
@@ -3909,8 +4017,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_SGX_ATTRIBUTE:
 #endif
        case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+       case KVM_CAP_SREGS2:
+       case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
                r = 1;
                break;
+       case KVM_CAP_EXIT_HYPERCALL:
+               r = KVM_EXIT_HYPERCALL_VALID_MASK;
+               break;
        case KVM_CAP_SET_GUEST_DEBUG2:
                return KVM_GUESTDBG_VALID_MASK;
 #ifdef CONFIG_KVM_XEN
@@ -4138,7 +4251,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                        mark_tsc_unstable("KVM discovered backwards TSC");
 
                if (kvm_check_tsc_unstable()) {
-                       u64 offset = kvm_compute_tsc_offset(vcpu,
+                       u64 offset = kvm_compute_l1_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);
                        kvm_vcpu_write_tsc_offset(vcpu, offset);
                        vcpu->arch.tsc_catchup = 1;
@@ -4204,12 +4317,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 
        static_call(kvm_x86_vcpu_put)(vcpu);
        vcpu->arch.last_host_tsc = rdtsc();
-       /*
-        * If userspace has set any breakpoints or watchpoints, dr6 is restored
-        * on every vmexit, but if not, we might have a stale dr6 from the
-        * guest. do_debug expects dr6 to be cleared after it runs, do the same.
-        */
-       set_debugreg(0, 6);
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -4252,8 +4359,17 @@ static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
 
 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
 {
-       return kvm_arch_interrupt_allowed(vcpu) &&
-               kvm_cpu_accept_dm_intr(vcpu);
+       /*
+        * Do not cause an interrupt window exit if an exception
+        * is pending or an event needs reinjection; userspace
+        * might want to inject the interrupt manually using KVM_SET_REGS
+        * or KVM_SET_SREGS.  For that to work, we must be at an
+        * instruction boundary and with no events half-injected.
+        */
+       return (kvm_arch_interrupt_allowed(vcpu) &&
+               kvm_cpu_accept_dm_intr(vcpu) &&
+               !kvm_event_needs_reinjection(vcpu) &&
+               !vcpu->arch.exception.pending);
 }
 
 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
@@ -4457,7 +4573,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
        memset(&events->reserved, 0, sizeof(events->reserved));
 }
 
-static void kvm_smm_changed(struct kvm_vcpu *vcpu);
+static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
 
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                                              struct kvm_vcpu_events *events)
@@ -4517,13 +4633,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
-               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
-                       if (events->smi.smm)
-                               vcpu->arch.hflags |= HF_SMM_MASK;
-                       else
-                               vcpu->arch.hflags &= ~HF_SMM_MASK;
-                       kvm_smm_changed(vcpu);
-               }
+               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
+                       kvm_smm_changed(vcpu, events->smi.smm);
 
                vcpu->arch.smi_pending = events->smi.pending;
 
@@ -4604,20 +4715,21 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
         */
        valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
        while (valid) {
+               u32 size, offset, ecx, edx;
                u64 xfeature_mask = valid & -valid;
                int xfeature_nr = fls64(xfeature_mask) - 1;
-               void *src = get_xsave_addr(xsave, xfeature_nr);
-
-               if (src) {
-                       u32 size, offset, ecx, edx;
-                       cpuid_count(XSTATE_CPUID, xfeature_nr,
-                                   &size, &offset, &ecx, &edx);
-                       if (xfeature_nr == XFEATURE_PKRU)
-                               memcpy(dest + offset, &vcpu->arch.pkru,
-                                      sizeof(vcpu->arch.pkru));
-                       else
-                               memcpy(dest + offset, src, size);
+               void *src;
+
+               cpuid_count(XSTATE_CPUID, xfeature_nr,
+                           &size, &offset, &ecx, &edx);
 
+               if (xfeature_nr == XFEATURE_PKRU) {
+                       memcpy(dest + offset, &vcpu->arch.pkru,
+                              sizeof(vcpu->arch.pkru));
+               } else {
+                       src = get_xsave_addr(xsave, xfeature_nr);
+                       if (src)
+                               memcpy(dest + offset, src, size);
                }
 
                valid -= xfeature_mask;
@@ -4647,18 +4759,20 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
         */
        valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
        while (valid) {
+               u32 size, offset, ecx, edx;
                u64 xfeature_mask = valid & -valid;
                int xfeature_nr = fls64(xfeature_mask) - 1;
-               void *dest = get_xsave_addr(xsave, xfeature_nr);
-
-               if (dest) {
-                       u32 size, offset, ecx, edx;
-                       cpuid_count(XSTATE_CPUID, xfeature_nr,
-                                   &size, &offset, &ecx, &edx);
-                       if (xfeature_nr == XFEATURE_PKRU)
-                               memcpy(&vcpu->arch.pkru, src + offset,
-                                      sizeof(vcpu->arch.pkru));
-                       else
+
+               cpuid_count(XSTATE_CPUID, xfeature_nr,
+                           &size, &offset, &ecx, &edx);
+
+               if (xfeature_nr == XFEATURE_PKRU) {
+                       memcpy(&vcpu->arch.pkru, src + offset,
+                              sizeof(vcpu->arch.pkru));
+               } else {
+                       void *dest = get_xsave_addr(xsave, xfeature_nr);
+
+                       if (dest)
                                memcpy(dest, src + offset, size);
                }
 
@@ -4807,6 +4921,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 
                return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
 
+       case KVM_CAP_HYPERV_ENFORCE_CPUID:
+               return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
+
        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
                vcpu->arch.pv_cpuid.enforce = cap->args[0];
                if (vcpu->arch.pv_cpuid.enforce)
@@ -4825,6 +4942,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        void __user *argp = (void __user *)arg;
        int r;
        union {
+               struct kvm_sregs2 *sregs2;
                struct kvm_lapic_state *lapic;
                struct kvm_xsave *xsave;
                struct kvm_xcrs *xcrs;
@@ -5197,6 +5315,28 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                break;
        }
 #endif
+       case KVM_GET_SREGS2: {
+               u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!u.sregs2)
+                       goto out;
+               __get_sregs2(vcpu, u.sregs2);
+               r = -EFAULT;
+               if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_SREGS2: {
+               u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
+               if (IS_ERR(u.sregs2)) {
+                       r = PTR_ERR(u.sregs2);
+                       u.sregs2 = NULL;
+                       goto out;
+               }
+               r = __set_sregs2(vcpu, u.sregs2);
+               break;
+       }
        default:
                r = -EINVAL;
        }
@@ -5516,6 +5656,21 @@ split_irqchip_unlock:
                if (kvm_x86_ops.vm_copy_enc_context_from)
                        r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
                return r;
+       case KVM_CAP_EXIT_HYPERCALL:
+               if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
+                       r = -EINVAL;
+                       break;
+               }
+               kvm->arch.hypercall_exit_enabled = cap->args[0];
+               r = 0;
+               break;
+       case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+               r = -EINVAL;
+               if (cap->args[0] & ~1)
+                       break;
+               kvm->arch.exit_on_emulation_error = cap->args[0];
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
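A hedged userspace sketch (not part of the patch) of opting in to the two VM capabilities added above, assuming a uapi <linux/kvm.h> and <linux/kvm_para.h> that already carry the new capability and hypercall numbers:

	#include <linux/kvm.h>
	#include <linux/kvm_para.h>
	#include <string.h>
	#include <sys/ioctl.h>

	/* Enable the hypercall exit and the emulation-failure exit on a VM fd. */
	static int enable_new_vm_caps(int vm_fd)
	{
		struct kvm_enable_cap cap;

		memset(&cap, 0, sizeof(cap));
		cap.cap = KVM_CAP_EXIT_HYPERCALL;
		cap.args[0] = 1ULL << KVM_HC_MAP_GPA_RANGE;
		if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
			return -1;

		memset(&cap, 0, sizeof(cap));
		cap.cap = KVM_CAP_EXIT_ON_EMULATION_FAILURE;
		cap.args[0] = 1;
		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}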
@@ -5630,6 +5785,41 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
        return 0;
 }
 
+#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+static int kvm_arch_suspend_notifier(struct kvm *kvm)
+{
+       struct kvm_vcpu *vcpu;
+       int i, ret = 0;
+
+       mutex_lock(&kvm->lock);
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (!vcpu->arch.pv_time_enabled)
+                       continue;
+
+               ret = kvm_set_guest_paused(vcpu);
+               if (ret) {
+                       kvm_err("Failed to pause guest VCPU%d: %d\n",
+                               vcpu->vcpu_id, ret);
+                       break;
+               }
+       }
+       mutex_unlock(&kvm->lock);
+
+       return ret ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
+{
+       switch (state) {
+       case PM_HIBERNATION_PREPARE:
+       case PM_SUSPEND_PREPARE:
+               return kvm_arch_suspend_notifier(kvm);
+       }
+
+       return NOTIFY_DONE;
+}
+#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
@@ -6378,9 +6568,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
         * there is no pkey in EPT page table for L1 guest or EPT
         * shadow page table for L2 guest.
         */
-       if (vcpu_match_mmio_gva(vcpu, gva)
-           && !permission_fault(vcpu, vcpu->arch.walk_mmu,
-                                vcpu->arch.mmio_access, 0, access)) {
+       if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
+           !permission_fault(vcpu, vcpu->arch.walk_mmu,
+                             vcpu->arch.mmio_access, 0, access))) {
                *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
                                        (gva & (PAGE_SIZE - 1));
                trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -7104,23 +7294,22 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
        return emul_to_vcpu(ctxt)->arch.hflags;
 }
 
-static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
+static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
 {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 
-       vcpu->arch.hflags = emul_flags;
-       kvm_mmu_reset_context(vcpu);
+       kvm_smm_changed(vcpu, false);
 }
 
-static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
+static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
                                  const char *smstate)
 {
-       return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate);
+       return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate);
 }
 
-static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
+static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
 {
-       kvm_smm_changed(emul_to_vcpu(ctxt));
+       kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
 }
 
 static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
@@ -7169,9 +7358,9 @@ static const struct x86_emulate_ops emulate_ops = {
        .guest_has_fxsr      = emulator_guest_has_fxsr,
        .set_nmi_mask        = emulator_set_nmi_mask,
        .get_hflags          = emulator_get_hflags,
-       .set_hflags          = emulator_set_hflags,
-       .pre_leave_smm       = emulator_pre_leave_smm,
-       .post_leave_smm      = emulator_post_leave_smm,
+       .exiting_smm         = emulator_exiting_smm,
+       .leave_smm           = emulator_leave_smm,
+       .triple_fault        = emulator_triple_fault,
        .set_xcr             = emulator_set_xcr,
 };
 
@@ -7277,8 +7466,33 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 }
 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
 
+static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+{
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+       u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
+       struct kvm_run *run = vcpu->run;
+
+       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
+       run->emulation_failure.ndata = 0;
+       run->emulation_failure.flags = 0;
+
+       if (insn_size) {
+               run->emulation_failure.ndata = 3;
+               run->emulation_failure.flags |=
+                       KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
+               run->emulation_failure.insn_size = insn_size;
+               memset(run->emulation_failure.insn_bytes, 0x90,
+                      sizeof(run->emulation_failure.insn_bytes));
+               memcpy(run->emulation_failure.insn_bytes,
+                      ctxt->fetch.data, insn_size);
+       }
+}
+
 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
 {
+       struct kvm *kvm = vcpu->kvm;
+
        ++vcpu->stat.insn_emulation_fail;
        trace_kvm_emulate_insn_failed(vcpu);
 
@@ -7287,10 +7501,9 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
                return 1;
        }
 
-       if (emulation_type & EMULTYPE_SKIP) {
-               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-               vcpu->run->internal.ndata = 0;
+       if (kvm->arch.exit_on_emulation_error ||
+           (emulation_type & EMULTYPE_SKIP)) {
+               prepare_emulation_failure_exit(vcpu);
                return 0;
        }
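For context, a minimal sketch (not part of the patch) of how a VMM might consume the exit payload filled in above, assuming a uapi <linux/kvm.h> new enough to define the emulation_failure exit fields and flag introduced alongside this change:

	#include <linux/kvm.h>
	#include <stdio.h>

	static void report_emulation_failure(const struct kvm_run *run)
	{
		int i;

		if (run->exit_reason != KVM_EXIT_INTERNAL_ERROR ||
		    run->emulation_failure.suberror != KVM_INTERNAL_ERROR_EMULATION)
			return;

		fprintf(stderr, "emulation failed");
		if (run->emulation_failure.ndata &&
		    (run->emulation_failure.flags &
		     KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES)) {
			fprintf(stderr, ", insn:");
			for (i = 0; i < run->emulation_failure.insn_size; i++)
				fprintf(stderr, " %02x",
					run->emulation_failure.insn_bytes[i]);
		}
		fprintf(stderr, "\n");
	}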
 
@@ -7432,11 +7645,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
 static int complete_emulated_pio(struct kvm_vcpu *vcpu);
 
-static void kvm_smm_changed(struct kvm_vcpu *vcpu)
+static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
 {
-       if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
-               /* This is a good place to trace that we are exiting SMM.  */
-               trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
+       trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
+
+       if (entering_smm) {
+               vcpu->arch.hflags |= HF_SMM_MASK;
+       } else {
+               vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
 
                /* Process a latched INIT or SMI, if any.  */
                kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -8361,16 +8577,17 @@ bool kvm_apicv_activated(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_apicv_activated);
 
-void kvm_apicv_init(struct kvm *kvm, bool enable)
+static void kvm_apicv_init(struct kvm *kvm)
 {
-       if (enable)
+       mutex_init(&kvm->arch.apicv_update_lock);
+
+       if (enable_apicv)
                clear_bit(APICV_INHIBIT_REASON_DISABLE,
                          &kvm->arch.apicv_inhibit_reasons);
        else
                set_bit(APICV_INHIBIT_REASON_DISABLE,
                        &kvm->arch.apicv_inhibit_reasons);
 }
-EXPORT_SYMBOL_GPL(kvm_apicv_init);
 
 static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
 {
@@ -8406,6 +8623,17 @@ no_yield:
        return;
 }
 
+static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
+{
+       u64 ret = vcpu->run->hypercall.ret;
+
+       if (!is_64_bit_mode(vcpu))
+               ret = (u32)ret;
+       kvm_rax_write(vcpu, ret);
+       ++vcpu->stat.hypercalls;
+       return kvm_skip_emulated_instruction(vcpu);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
        unsigned long nr, a0, a1, a2, a3, ret;
@@ -8471,6 +8699,28 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                kvm_sched_yield(vcpu, a0);
                ret = 0;
                break;
+       case KVM_HC_MAP_GPA_RANGE: {
+               u64 gpa = a0, npages = a1, attrs = a2;
+
+               ret = -KVM_ENOSYS;
+               if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
+                       break;
+
+               if (!PAGE_ALIGNED(gpa) || !npages ||
+                   gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
+                       ret = -KVM_EINVAL;
+                       break;
+               }
+
+               vcpu->run->exit_reason        = KVM_EXIT_HYPERCALL;
+               vcpu->run->hypercall.nr       = KVM_HC_MAP_GPA_RANGE;
+               vcpu->run->hypercall.args[0]  = gpa;
+               vcpu->run->hypercall.args[1]  = npages;
+               vcpu->run->hypercall.args[2]  = attrs;
+               vcpu->run->hypercall.longmode = op_64_bit;
+               vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+               return 0;
+       }
        default:
                ret = -KVM_ENOSYS;
                break;
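The userspace half of this exit, as a hedged sketch (not part of the patch): the VMM acts on the requested GPA range and stores its status in run->hypercall.ret, which complete_hypercall_exit() above copies back into the guest's RAX on the next KVM_RUN. vmm_remap_gpa_range() is a hypothetical VMM helper:

	#include <errno.h>
	#include <linux/kvm.h>
	#include <linux/kvm_para.h>
	#include <stdint.h>

	/* Hypothetical VMM bookkeeping for a shared<->private conversion. */
	static int vmm_remap_gpa_range(uint64_t gpa, uint64_t npages, uint64_t attrs)
	{
		return 0;
	}

	static void handle_map_gpa_range(struct kvm_run *run)
	{
		uint64_t gpa    = run->hypercall.args[0];
		uint64_t npages = run->hypercall.args[1];
		uint64_t attrs  = run->hypercall.args[2];

		if (vmm_remap_gpa_range(gpa, npages, attrs))
			run->hypercall.ret = -KVM_EINVAL;
		else
			run->hypercall.ret = 0;
		/* KVM_RUN may now be re-entered; see complete_hypercall_exit(). */
	}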
@@ -8554,9 +8804,6 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 
 int kvm_check_nested_events(struct kvm_vcpu *vcpu)
 {
-       if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
-               return -EIO;
-
        if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
                kvm_x86_ops.nested_ops->triple_fault(vcpu);
                return 1;
@@ -8572,7 +8819,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
        static_call(kvm_x86_queue_exception)(vcpu);
 }
 
-static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
+static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 {
        int r;
        bool can_inject = true;
@@ -8619,7 +8866,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        if (is_guest_mode(vcpu)) {
                r = kvm_check_nested_events(vcpu);
                if (r < 0)
-                       goto busy;
+                       goto out;
        }
 
        /* try to inject new event if pending */
@@ -8661,7 +8908,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        if (vcpu->arch.smi_pending) {
                r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
-                       goto busy;
+                       goto out;
                if (r) {
                        vcpu->arch.smi_pending = false;
                        ++vcpu->arch.smi_count;
@@ -8674,7 +8921,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        if (vcpu->arch.nmi_pending) {
                r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
-                       goto busy;
+                       goto out;
                if (r) {
                        --vcpu->arch.nmi_pending;
                        vcpu->arch.nmi_injected = true;
@@ -8689,7 +8936,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        if (kvm_cpu_has_injectable_intr(vcpu)) {
                r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
-                       goto busy;
+                       goto out;
                if (r) {
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
                        static_call(kvm_x86_set_irq)(vcpu);
@@ -8705,11 +8952,14 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
                *req_immediate_exit = true;
 
        WARN_ON(vcpu->arch.exception.pending);
-       return;
+       return 0;
 
-busy:
-       *req_immediate_exit = true;
-       return;
+out:
+       if (r == -EBUSY) {
+               *req_immediate_exit = true;
+               r = 0;
+       }
+       return r;
 }
 
 static void process_nmi(struct kvm_vcpu *vcpu)
@@ -8888,10 +9138,9 @@ static void enter_smm(struct kvm_vcpu *vcpu)
 {
        struct kvm_segment cs, ds;
        struct desc_ptr dt;
+       unsigned long cr0;
        char buf[512];
-       u32 cr0;
 
-       trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
        memset(buf, 0, 512);
 #ifdef CONFIG_X86_64
        if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
@@ -8901,13 +9150,13 @@ static void enter_smm(struct kvm_vcpu *vcpu)
                enter_smm_save_state_32(vcpu, buf);
 
        /*
-        * Give pre_enter_smm() a chance to make ISA-specific changes to the
-        * vCPU state (e.g. leave guest mode) after we've saved the state into
-        * the SMM state-save area.
+        * Give enter_smm() a chance to make ISA-specific changes to the vCPU
+        * state (e.g. leave guest mode) after we've saved the state into the
+        * SMM state-save area.
         */
-       static_call(kvm_x86_pre_enter_smm)(vcpu, buf);
+       static_call(kvm_x86_enter_smm)(vcpu, buf);
 
-       vcpu->arch.hflags |= HF_SMM_MASK;
+       kvm_smm_changed(vcpu, true);
        kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
 
        if (static_call(kvm_x86_get_nmi_mask)(vcpu))
@@ -8990,60 +9239,68 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
 
 void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 {
+       bool activate;
+
        if (!lapic_in_kernel(vcpu))
                return;
 
-       vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
+       mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+
+       activate = kvm_apicv_activated(vcpu->kvm);
+       if (vcpu->arch.apicv_active == activate)
+               goto out;
+
+       vcpu->arch.apicv_active = activate;
        kvm_apic_update_apicv(vcpu);
        static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
+
+       /*
+        * When APICv gets disabled, we may still have injected interrupts
+        * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
+        * still active when the interrupt got accepted. Make sure
+        * inject_pending_event() is called to check for that.
+        */
+       if (!vcpu->arch.apicv_active)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+out:
+       mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
 
-/*
- * NOTE: Do not hold any lock prior to calling this.
- *
- * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
- * locked, because it calls __x86_set_memory_region() which does
- * synchronize_srcu(&kvm->srcu).
- */
-void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
 {
-       struct kvm_vcpu *except;
-       unsigned long old, new, expected;
+       unsigned long old, new;
 
        if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
            !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
                return;
 
-       old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
-       do {
-               expected = new = old;
-               if (activate)
-                       __clear_bit(bit, &new);
-               else
-                       __set_bit(bit, &new);
-               if (new == old)
-                       break;
-               old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
-       } while (old != expected);
+       old = new = kvm->arch.apicv_inhibit_reasons;
 
-       if (!!old == !!new)
-               return;
-
-       trace_kvm_apicv_update_request(activate, bit);
-       if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
-               static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate);
+       if (activate)
+               __clear_bit(bit, &new);
+       else
+               __set_bit(bit, &new);
+
+       if (!!old != !!new) {
+               trace_kvm_apicv_update_request(activate, bit);
+               kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
+               kvm->arch.apicv_inhibit_reasons = new;
+               if (new) {
+                       unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
+                       kvm_zap_gfn_range(kvm, gfn, gfn+1);
+               }
+       } else
+               kvm->arch.apicv_inhibit_reasons = new;
+}
+EXPORT_SYMBOL_GPL(__kvm_request_apicv_update);
 
-       /*
-        * Sending request to update APICV for all other vcpus,
-        * while update the calling vcpu immediately instead of
-        * waiting for another #VMEXIT to handle the request.
-        */
-       except = kvm_get_running_vcpu();
-       kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
-                                        except);
-       if (except)
-               kvm_vcpu_update_apicv(except);
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+{
+       mutex_lock(&kvm->arch.apicv_update_lock);
+       __kvm_request_apicv_update(kvm, activate, bit);
+       mutex_unlock(&kvm->arch.apicv_update_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
 
@@ -9140,6 +9397,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        }
 
        if (kvm_request_pending(vcpu)) {
+               if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) {
+                       r = -EIO;
+                       goto out;
+               }
                if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
                        if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
                                r = 0;
@@ -9171,7 +9432,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                }
                if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
                        kvm_vcpu_flush_tlb_current(vcpu);
-               if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+               if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
                        kvm_vcpu_flush_tlb_guest(vcpu);
 
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
@@ -9264,13 +9525,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
            kvm_xen_has_interrupt(vcpu)) {
                ++vcpu->stat.req_event;
-               kvm_apic_accept_events(vcpu);
+               r = kvm_apic_accept_events(vcpu);
+               if (r < 0) {
+                       r = 0;
+                       goto out;
+               }
                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
                        r = 1;
                        goto out;
                }
 
-               inject_pending_event(vcpu, &req_immediate_exit);
+               r = inject_pending_event(vcpu, &req_immediate_exit);
+               if (r < 0) {
+                       r = 0;
+                       goto out;
+               }
                if (req_int_win)
                        static_call(kvm_x86_enable_irq_window)(vcpu);
 
@@ -9345,8 +9614,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                set_debugreg(vcpu->arch.eff_db[1], 1);
                set_debugreg(vcpu->arch.eff_db[2], 2);
                set_debugreg(vcpu->arch.eff_db[3], 3);
-               set_debugreg(vcpu->arch.dr6, 6);
-               vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
+       } else if (unlikely(hw_breakpoint_active())) {
+               set_debugreg(0, 7);
        }
 
        for (;;) {
@@ -9374,7 +9643,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
                kvm_update_dr0123(vcpu);
                kvm_update_dr7(vcpu);
-               vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
 
        /*
@@ -9472,7 +9740,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
                        return 1;
        }
 
-       kvm_apic_accept_events(vcpu);
+       if (kvm_apic_accept_events(vcpu) < 0)
+               return 0;
        switch(vcpu->arch.mp_state) {
        case KVM_MP_STATE_HALTED:
        case KVM_MP_STATE_AP_RESET_HOLD:
@@ -9634,7 +9903,7 @@ static void kvm_save_current_fpu(struct fpu *fpu)
                memcpy(&fpu->state, &current->thread.fpu.state,
                       fpu_kernel_xstate_size);
        else
-               copy_fpregs_to_fpstate(fpu);
+               save_fpregs_to_fpstate(fpu);
 }
 
 /* Swap (qemu) user FPU context for the guest FPU context. */
@@ -9650,7 +9919,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
         */
        if (vcpu->arch.guest_fpu)
                /* PKRU is separately restored in kvm_x86_ops.run. */
-               __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
+               __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state,
                                        ~XFEATURE_MASK_PKRU);
 
        fpregs_mark_activate();
@@ -9671,7 +9940,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
        if (vcpu->arch.guest_fpu)
                kvm_save_current_fpu(vcpu->arch.guest_fpu);
 
-       copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
+       restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state);
 
        fpregs_mark_activate();
        fpregs_unlock();
@@ -9696,7 +9965,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                        goto out;
                }
                kvm_vcpu_block(vcpu);
-               kvm_apic_accept_events(vcpu);
+               if (kvm_apic_accept_events(vcpu) < 0) {
+                       r = 0;
+                       goto out;
+               }
                kvm_clear_request(KVM_REQ_UNHALT, vcpu);
                r = -EAGAIN;
                if (signal_pending(current)) {
@@ -9707,7 +9979,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                goto out;
        }
 
-       if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+       if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
+           (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
                r = -EINVAL;
                goto out;
        }
@@ -9845,7 +10118,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
 
-static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
        struct desc_ptr dt;
 
@@ -9878,14 +10151,36 @@ skip_protected_regs:
        sregs->cr8 = kvm_get_cr8(vcpu);
        sregs->efer = vcpu->arch.efer;
        sregs->apic_base = kvm_get_apic_base(vcpu);
+}
 
-       memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+       __get_sregs_common(vcpu, sregs);
+
+       if (vcpu->arch.guest_state_protected)
+               return;
 
        if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
                set_bit(vcpu->arch.interrupt.nr,
                        (unsigned long *)sregs->interrupt_bitmap);
 }
 
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+       int i;
+
+       __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
+
+       if (vcpu->arch.guest_state_protected)
+               return;
+
+       if (is_pae_paging(vcpu)) {
+               for (i = 0 ; i < 4 ; i++)
+                       sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
+               sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
+       }
+}
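A short sketch (not part of the patch) of how the new ioctl pair is meant to be used across a migration, assuming a uapi <linux/kvm.h> that defines struct kvm_sregs2, KVM_GET_SREGS2/KVM_SET_SREGS2 and KVM_SREGS2_FLAGS_PDPTRS_VALID:

	#include <linux/kvm.h>
	#include <string.h>
	#include <sys/ioctl.h>

	static int copy_sregs2(int src_vcpu_fd, int dst_vcpu_fd)
	{
		struct kvm_sregs2 sregs2;

		if (ioctl(src_vcpu_fd, KVM_GET_SREGS2, &sregs2) < 0)
			return -1;

		/*
		 * Under PAE paging the PDPTRs travel with the flag set and are
		 * restored verbatim instead of being re-read from guest memory
		 * on the destination.
		 */
		if (!(sregs2.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID))
			memset(sregs2.pdptrs, 0, sizeof(sregs2.pdptrs));

		return ioctl(dst_vcpu_fd, KVM_SET_SREGS2, &sregs2);
	}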
+
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
 {
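
Userspace reaches __get_sregs2() through the new KVM_GET_SREGS2 ioctl. Below is a minimal sketch of reading the extra state; it assumes a <linux/kvm.h> new enough to export struct kvm_sregs2, KVM_SREGS2_FLAGS_PDPTRS_VALID and KVM_CAP_SREGS2, and the /dev/kvm setup is ordinary boilerplate rather than anything introduced by this patch.

/*
 * Sketch: read the extra KVM_GET_SREGS2 state from userspace.  Assumes a
 * <linux/kvm.h> that defines struct kvm_sregs2 and KVM_CAP_SREGS2.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        struct kvm_sregs2 sregs2;
        int kvm, vm, vcpu, i;

        kvm = open("/dev/kvm", O_RDWR);
        if (kvm < 0 || !ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_SREGS2)) {
                fprintf(stderr, "KVM_CAP_SREGS2 unavailable\n");
                return 1;
        }

        vm = ioctl(kvm, KVM_CREATE_VM, 0);
        vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
        if (vm < 0 || vcpu < 0 || ioctl(vcpu, KVM_GET_SREGS2, &sregs2) < 0) {
                perror("KVM_GET_SREGS2");
                return 1;
        }

        /* PDPTRs are only reported while the vCPU is in PAE paging mode. */
        if (sregs2.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID)
                for (i = 0; i < 4; i++)
                        printf("pdptr[%d] = 0x%llx\n", i,
                               (unsigned long long)sregs2.pdptrs[i]);
        else
                printf("no PDPTRs: vCPU is not using PAE paging\n");
        return 0;
}
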
@@ -9898,11 +10193,17 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
+       int r;
+
        vcpu_load(vcpu);
        if (kvm_mpx_supported())
                kvm_load_guest_fpu(vcpu);
 
-       kvm_apic_accept_events(vcpu);
+       r = kvm_apic_accept_events(vcpu);
+       if (r < 0)
+               goto out;
+       r = 0;
+
        if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
             vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
            vcpu->arch.pv.pv_unhalted)
@@ -9910,10 +10211,11 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
        else
                mp_state->mp_state = vcpu->arch.mp_state;
 
+out:
        if (kvm_mpx_supported())
                kvm_put_guest_fpu(vcpu);
        vcpu_put(vcpu);
-       return 0;
+       return r;
 }
 
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
@@ -9997,24 +10299,23 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        return kvm_is_valid_cr4(vcpu, sregs->cr4);
 }
 
-static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
+               int *mmu_reset_needed, bool update_pdptrs)
 {
        struct msr_data apic_base_msr;
-       int mmu_reset_needed = 0;
-       int pending_vec, max_bits, idx;
+       int idx;
        struct desc_ptr dt;
-       int ret = -EINVAL;
 
        if (!kvm_is_valid_sregs(vcpu, sregs))
-               goto out;
+               return -EINVAL;
 
        apic_base_msr.data = sregs->apic_base;
        apic_base_msr.host_initiated = true;
        if (kvm_set_apic_base(vcpu, &apic_base_msr))
-               goto out;
+               return -EINVAL;
 
        if (vcpu->arch.guest_state_protected)
-               goto skip_protected_regs;
+               return 0;
 
        dt.size = sregs->idt.limit;
        dt.address = sregs->idt.base;
@@ -10024,31 +10325,30 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        static_call(kvm_x86_set_gdt)(vcpu, &dt);
 
        vcpu->arch.cr2 = sregs->cr2;
-       mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+       *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
        vcpu->arch.cr3 = sregs->cr3;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
        kvm_set_cr8(vcpu, sregs->cr8);
 
-       mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+       *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
        static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
 
-       mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+       *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
        static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
        vcpu->arch.cr0 = sregs->cr0;
 
-       mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+       *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
        static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
 
-       idx = srcu_read_lock(&vcpu->kvm->srcu);
-       if (is_pae_paging(vcpu)) {
-               load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
-               mmu_reset_needed = 1;
+       if (update_pdptrs) {
+               idx = srcu_read_lock(&vcpu->kvm->srcu);
+               if (is_pae_paging(vcpu)) {
+                       load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+                       *mmu_reset_needed = 1;
+               }
+               srcu_read_unlock(&vcpu->kvm->srcu, idx);
        }
-       srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
-       if (mmu_reset_needed)
-               kvm_mmu_reset_context(vcpu);
 
        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
        kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -10068,20 +10368,63 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
            !is_protmode(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
-skip_protected_regs:
+       return 0;
+}
+
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+       int pending_vec, max_bits;
+       int mmu_reset_needed = 0;
+       int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
+
+       if (ret)
+               return ret;
+
+       if (mmu_reset_needed)
+               kvm_mmu_reset_context(vcpu);
+
        max_bits = KVM_NR_INTERRUPTS;
        pending_vec = find_first_bit(
                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+
        if (pending_vec < max_bits) {
                kvm_queue_interrupt(vcpu, pending_vec, false);
                pr_debug("Set back pending irq %d\n", pending_vec);
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
+       return 0;
+}
 
-       kvm_make_request(KVM_REQ_EVENT, vcpu);
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+       int mmu_reset_needed = 0;
+       bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
+       bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
+               !(sregs2->efer & EFER_LMA);
+       int i, ret;
 
-       ret = 0;
-out:
-       return ret;
+       if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
+               return -EINVAL;
+
+       if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
+               return -EINVAL;
+
+       ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
+                                &mmu_reset_needed, !valid_pdptrs);
+       if (ret)
+               return ret;
+
+       if (valid_pdptrs) {
+               for (i = 0; i < 4; i++)
+                       kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
+
+               kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+               mmu_reset_needed = 1;
+               vcpu->arch.pdptrs_from_userspace = true;
+       }
+       if (mmu_reset_needed)
+               kvm_mmu_reset_context(vcpu);
+       return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
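
For the write side, a hedged sketch of feeding explicit PDPTRs back through KVM_SET_SREGS2. set_pae_pdptrs() and its arguments are illustrative names, not an existing API, and per the validation in __set_sregs2() above the flag is only accepted while the vCPU is genuinely in PAE mode (CR0.PG=1, CR4.PAE=1, EFER.LMA=0).

/* Sketch: hand explicit PDPTRs to KVM.  set_pae_pdptrs() is an illustrative
 * helper; vcpu_fd is an already-created vCPU fd. */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_pae_pdptrs(int vcpu_fd, const __u64 pdptrs[4])
{
        struct kvm_sregs2 sregs2;

        /* Start from the current state so only the PDPTRs change. */
        if (ioctl(vcpu_fd, KVM_GET_SREGS2, &sregs2) < 0)
                return -1;

        memcpy(sregs2.pdptrs, pdptrs, sizeof(sregs2.pdptrs));
        sregs2.flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;

        /*
         * __set_sregs2() rejects the flag unless CR0.PG=1, CR4.PAE=1 and
         * EFER.LMA=0, i.e. unless the vCPU really is in PAE paging mode.
         */
        return ioctl(vcpu_fd, KVM_SET_SREGS2, &sregs2);
}

This is the shape a VMM would use when restoring a PAE guest, so the PDPTRs come from the saved state rather than being re-read from guest memory.
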
@@ -10242,9 +10585,6 @@ static void store_regs(struct kvm_vcpu *vcpu)
 
 static int sync_regs(struct kvm_vcpu *vcpu)
 {
-       if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
-               return -EINVAL;
-
        if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
                __set_regs(vcpu, &vcpu->run->s.regs.regs);
                vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
@@ -10305,13 +10645,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        struct page *page;
        int r;
 
+       vcpu->arch.last_vmentry_cpu = -1;
+
        if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        else
                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
 
-       kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
        r = kvm_mmu_create(vcpu);
        if (r < 0)
                return r;
@@ -10371,6 +10711,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        vcpu->arch.pending_external_vector = -1;
        vcpu->arch.preempted_in_kernel = false;
 
+#if IS_ENABLED(CONFIG_HYPERV)
+       vcpu->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
        r = static_call(kvm_x86_vcpu_create)(vcpu);
        if (r)
                goto free_guest_fpu;
@@ -10379,8 +10723,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
        kvm_vcpu_mtrr_init(vcpu);
        vcpu_load(vcpu);
+       kvm_set_tsc_khz(vcpu, max_tsc_khz);
        kvm_vcpu_reset(vcpu, false);
-       kvm_init_mmu(vcpu, false);
+       kvm_init_mmu(vcpu);
        vcpu_put(vcpu);
        return 0;
 
@@ -10454,6 +10799,10 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
+       unsigned long old_cr0 = kvm_read_cr0(vcpu);
+       unsigned long new_cr0;
+       u32 eax, dummy;
+
        kvm_lapic_reset(vcpu, init_event);
 
        vcpu->arch.hflags = 0;
@@ -10519,10 +10868,65 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        vcpu->arch.regs_avail = ~0;
        vcpu->arch.regs_dirty = ~0;
 
+       /*
+        * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
+        * if no CPUID match is found.  Note, it's impossible to get a match at
+        * RESET since KVM emulates RESET before exposing the vCPU to userspace,
+        * i.e. it's impossible for kvm_cpuid() to find a valid entry on RESET.
+        * But, go through the motions in case that's ever remedied.
+        */
+       eax = 1;
+       if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true))
+               eax = 0x600;
+       kvm_rdx_write(vcpu, eax);
+
        vcpu->arch.ia32_xss = 0;
 
        static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
+
+       kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+       kvm_rip_write(vcpu, 0xfff0);
+
+       /*
+        * CR0.CD/NW are set on RESET, preserved on INIT.  Note, some versions
+        * of Intel's SDM list CD/NW as being set on INIT, but they contradict
+        * (or qualify) that with a footnote stating that CD/NW are preserved.
+        */
+       new_cr0 = X86_CR0_ET;
+       if (init_event)
+               new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD));
+       else
+               new_cr0 |= X86_CR0_NW | X86_CR0_CD;
+
+       static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
+       static_call(kvm_x86_set_cr4)(vcpu, 0);
+       static_call(kvm_x86_set_efer)(vcpu, 0);
+       static_call(kvm_x86_update_exception_bitmap)(vcpu);
+
+       /*
+        * Reset the MMU context if paging was enabled prior to INIT (which is
+        * implied if CR0.PG=1 as CR0 will be '0' prior to RESET).  Unlike the
+        * standard CR0/CR4/EFER modification paths, only CR0.PG needs to be
+        * checked because it is unconditionally cleared on INIT and all other
+        * paging related bits are ignored if paging is disabled, i.e. CR0.WP,
+        * CR4, and EFER changes are all irrelevant if CR0.PG was '0'.
+        */
+       if (old_cr0 & X86_CR0_PG)
+               kvm_mmu_reset_context(vcpu);
+
+       /*
+        * Intel's SDM states that all TLB entries are flushed on INIT.  AMD's
+        * APM states the TLBs are untouched by INIT, but it also states that
+        * the TLBs are flushed on "External initialization of the processor."
+        * Flush the guest TLB regardless of vendor, there is no meaningful
+        * benefit in relying on the guest to flush the TLB immediately after
+        * INIT.  A spurious TLB flush is benign and likely negligible from a
+        * performance perspective.
+        */
+       if (init_event)
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_reset);
 
 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
 {
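
As a cross-check on the RESET/INIT split above, the values the new code programs line up with the architectural reset state: CR0 becomes ET|CD|NW = 0x60000010 on RESET while INIT preserves CD/NW, and RIP 0xfff0 combined with the usual CS base of 0xffff0000 gives the 0xfffffff0 reset vector. A small standalone sketch that mirrors the hunk's arithmetic (the X86_CR0_* values are copied from the arch/x86 definitions):

/* Sketch: reproduce the CR0/RIP values the new kvm_vcpu_reset() code sets. */
#include <stdio.h>

#define X86_CR0_ET (1UL << 4)           /* values mirror the arch/x86 headers */
#define X86_CR0_NW (1UL << 29)
#define X86_CR0_CD (1UL << 30)

static unsigned long reset_cr0(unsigned long old_cr0, int init_event)
{
        unsigned long new_cr0 = X86_CR0_ET;

        if (init_event)                 /* INIT preserves CD/NW ... */
                new_cr0 |= old_cr0 & (X86_CR0_NW | X86_CR0_CD);
        else                            /* ... RESET sets them */
                new_cr0 |= X86_CR0_NW | X86_CR0_CD;
        return new_cr0;
}

int main(void)
{
        /* RESET: 0x60000010, the architectural CR0 reset value. */
        printf("RESET CR0    = %#lx\n", reset_cr0(0, 0));
        /* INIT of a guest that had caches enabled: CD/NW stay clear. */
        printf("INIT  CR0    = %#lx\n", reset_cr0(0x80050033UL, 1));
        /* Reset vector: CS base 0xffff0000 + RIP 0xfff0. */
        printf("reset vector = %#lx\n", 0xffff0000UL + 0xfff0UL);
        return 0;
}
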
@@ -10754,12 +11158,19 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        kvm->arch.guest_can_read_msr_platform_info = true;
 
+#if IS_ENABLED(CONFIG_HYPERV)
+       spin_lock_init(&kvm->arch.hv_root_tdp_lock);
+       kvm->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
 
+       kvm_apicv_init(kvm);
        kvm_hv_init_vm(kvm);
        kvm_page_track_init(kvm);
        kvm_mmu_init_vm(kvm);
+       kvm_xen_init_vm(kvm);
 
        return static_call(kvm_x86_vm_init)(kvm);
 }
@@ -10917,17 +11328,23 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kvm_hv_destroy_vm(kvm);
 }
 
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+static void memslot_rmap_free(struct kvm_memory_slot *slot)
 {
        int i;
 
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                kvfree(slot->arch.rmap[i]);
                slot->arch.rmap[i] = NULL;
+       }
+}
 
-               if (i == 0)
-                       continue;
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+       int i;
+
+       memslot_rmap_free(slot);
 
+       for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                kvfree(slot->arch.lpage_info[i - 1]);
                slot->arch.lpage_info[i - 1] = NULL;
        }
@@ -10935,11 +11352,78 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
        kvm_page_track_free_memslot(slot);
 }
 
-static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
-                                     unsigned long npages)
+static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
+                             unsigned long npages)
 {
+       const int sz = sizeof(*slot->arch.rmap[0]);
        int i;
 
+       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               int level = i + 1;
+               int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
+
+               WARN_ON(slot->arch.rmap[i]);
+
+               slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
+               if (!slot->arch.rmap[i]) {
+                       memslot_rmap_free(slot);
+                       return -ENOMEM;
+               }
+       }
+
+       return 0;
+}
+
+int alloc_all_memslots_rmaps(struct kvm *kvm)
+{
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *slot;
+       int r, i;
+
+       /*
+        * Check if the memslots already have rmaps before acquiring
+        * the slots_arch_lock below.
+        */
+       if (kvm_memslots_have_rmaps(kvm))
+               return 0;
+
+       mutex_lock(&kvm->slots_arch_lock);
+
+       /*
+        * Re-check memslots_have_rmaps under the slots_arch_lock before
+        * allocating the rmaps.
+        */
+       if (kvm_memslots_have_rmaps(kvm)) {
+               mutex_unlock(&kvm->slots_arch_lock);
+               return 0;
+       }
+
+       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+               slots = __kvm_memslots(kvm, i);
+               kvm_for_each_memslot(slot, slots) {
+                       r = memslot_rmap_alloc(slot, slot->npages);
+                       if (r) {
+                               mutex_unlock(&kvm->slots_arch_lock);
+                               return r;
+                       }
+               }
+       }
+
+       /*
+        * Ensure that memslots_have_rmaps becomes true strictly after
+        * all the rmap pointers are set.
+        */
+       smp_store_release(&kvm->arch.memslots_have_rmaps, true);
+       mutex_unlock(&kvm->slots_arch_lock);
+       return 0;
+}
+
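
alloc_all_memslots_rmaps() is a check/lock/re-check pattern: the unlocked read lets the common case skip slots_arch_lock entirely, and smp_store_release() guarantees that a reader that sees memslots_have_rmaps == true also sees every rmap pointer written before it (the readers are expected to pair this with an acquire load). A self-contained sketch of the same shape, using C11 atomics and a pthread mutex purely as stand-ins for the kernel primitives:

/*
 * Sketch: lazy, double-checked allocation with release/acquire pairing.
 * C11 atomics and a pthread mutex stand in for smp_store_release(),
 * smp_load_acquire() and slots_arch_lock; the control flow follows
 * alloc_all_memslots_rmaps().
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

#define NR_SLOTS 16

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool have_rmaps;
static unsigned long *rmaps[NR_SLOTS];

int ensure_rmaps(void)
{
        /* Fast path: acquire-load pairs with the release-store below. */
        if (atomic_load_explicit(&have_rmaps, memory_order_acquire))
                return 0;

        pthread_mutex_lock(&lock);

        /* Re-check under the lock: another thread may have won the race. */
        if (atomic_load_explicit(&have_rmaps, memory_order_acquire)) {
                pthread_mutex_unlock(&lock);
                return 0;
        }

        for (int i = 0; i < NR_SLOTS; i++) {
                rmaps[i] = calloc(64, sizeof(unsigned long));
                if (!rmaps[i]) {
                        pthread_mutex_unlock(&lock);
                        return -1;      /* flag stays false, readers see nothing */
                }
        }

        /* Publish only after every pointer has been written. */
        atomic_store_explicit(&have_rmaps, true, memory_order_release);
        pthread_mutex_unlock(&lock);
        return 0;
}
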
+static int kvm_alloc_memslot_metadata(struct kvm *kvm,
+                                     struct kvm_memory_slot *slot,
+                                     unsigned long npages)
+{
+       int i, r;
+
        /*
         * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
         * old arrays will be freed by __kvm_set_memory_region() if installing
@@ -10947,22 +11431,19 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
         */
        memset(&slot->arch, 0, sizeof(slot->arch));
 
-       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+       if (kvm_memslots_have_rmaps(kvm)) {
+               r = memslot_rmap_alloc(slot, npages);
+               if (r)
+                       return r;
+       }
+
+       for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                struct kvm_lpage_info *linfo;
                unsigned long ugfn;
                int lpages;
                int level = i + 1;
 
-               lpages = gfn_to_index(slot->base_gfn + npages - 1,
-                                     slot->base_gfn, level) + 1;
-
-               slot->arch.rmap[i] =
-                       kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
-                                GFP_KERNEL_ACCOUNT);
-               if (!slot->arch.rmap[i])
-                       goto out_free;
-               if (i == 0)
-                       continue;
+               lpages = __kvm_mmu_slot_lpages(slot, npages, level);
 
                linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
                if (!linfo)
@@ -10993,12 +11474,9 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
        return 0;
 
 out_free:
-       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-               kvfree(slot->arch.rmap[i]);
-               slot->arch.rmap[i] = NULL;
-               if (i == 0)
-                       continue;
+       memslot_rmap_free(slot);
 
+       for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                kvfree(slot->arch.lpage_info[i - 1]);
                slot->arch.lpage_info[i - 1] = NULL;
        }
@@ -11027,7 +11505,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                enum kvm_mr_change change)
 {
        if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
-               return kvm_alloc_memslot_metadata(memslot,
+               return kvm_alloc_memslot_metadata(kvm, memslot,
                                                  mem->memory_size >> PAGE_SHIFT);
        return 0;
 }
@@ -11049,7 +11527,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
 
 static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                                     struct kvm_memory_slot *old,
-                                    struct kvm_memory_slot *new,
+                                    const struct kvm_memory_slot *new,
                                     enum kvm_mr_change change)
 {
        bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
@@ -11103,36 +11581,19 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                 */
                kvm_mmu_zap_collapsible_sptes(kvm, new);
        } else {
-               /* By default, write-protect everything to log writes. */
-               int level = PG_LEVEL_4K;
+               /*
+                * Initially-all-set does not require write protecting any page,
+                * because they're all assumed to be dirty.
+                */
+               if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+                       return;
 
                if (kvm_x86_ops.cpu_dirty_log_size) {
-                       /*
-                        * Clear all dirty bits, unless pages are treated as
-                        * dirty from the get-go.
-                        */
-                       if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
-                               kvm_mmu_slot_leaf_clear_dirty(kvm, new);
-
-                       /*
-                        * Write-protect large pages on write so that dirty
-                        * logging happens at 4k granularity.  No need to
-                        * write-protect small SPTEs since write accesses are
-                        * logged by the CPU via dirty bits.
-                        */
-                       level = PG_LEVEL_2M;
-               } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
-                       /*
-                        * If we're with initial-all-set, we don't need
-                        * to write protect any small page because
-                        * they're reported as dirty already.  However
-                        * we still need to write-protect huge pages
-                        * so that the page split can happen lazily on
-                        * the first write to the huge page.
-                        */
-                       level = PG_LEVEL_2M;
+                       kvm_mmu_slot_leaf_clear_dirty(kvm, new);
+                       kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
+               } else {
+                       kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
                }
-               kvm_mmu_slot_remove_write_access(kvm, new, level);
        }
 }
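
Condensed, the rewritten dirty-log-enable branch boils down to a three-way decision about the lowest page level that still needs write protection. A small sketch restating it (the PG_LEVEL_* values mirror the kernel's enum; the clearing of leaf dirty bits in the hardware-dirty-log case is intentionally left out here):

/*
 * Sketch: lowest page level that must be write-protected when dirty logging
 * is enabled on a slot.
 */
enum { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M };

static int dirty_log_wrprot_level(int init_all_set, int cpu_dirty_log)
{
        if (init_all_set)
                return PG_LEVEL_NONE;   /* every page already reported dirty */
        if (cpu_dirty_log)
                return PG_LEVEL_2M;     /* hardware logs 4K writes; protect huge pages only */
        return PG_LEVEL_4K;             /* software logging: write-protect everything */
}
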
 
@@ -11146,10 +11607,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                kvm_mmu_change_mmu_pages(kvm,
                                kvm_mmu_calculate_default_mmu_pages(kvm));
 
-       /*
-        * FIXME: const-ify all uses of struct kvm_memory_slot.
-        */
-       kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
+       kvm_mmu_slot_apply_flags(kvm, old, new, change);
 
        /* Free the arrays associated with the old memslot. */
        if (change == KVM_MR_MOVE)
@@ -11701,8 +12159,6 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 {
        bool pcid_enabled;
        struct x86_exception e;
-       unsigned i;
-       unsigned long roots_to_free = 0;
        struct {
                u64 pcid;
                u64 gla;
@@ -11736,23 +12192,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
                        return 1;
                }
 
-               if (kvm_get_active_pcid(vcpu) == operand.pcid) {
-                       kvm_mmu_sync_roots(vcpu);
-                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-               }
-
-               for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
-                           == operand.pcid)
-                               roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
-               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
-               /*
-                * If neither the current cr3 nor any of the prev_roots use the
-                * given PCID, then nothing needs to be done here because a
-                * resync will happen anyway before switching to any other CR3.
-                */
-
+               kvm_invalidate_pcid(vcpu, operand.pcid);
                return kvm_skip_emulated_instruction(vcpu);
 
        case INVPCID_TYPE_ALL_NON_GLOBAL:
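
The single-PCID path now calls a kvm_invalidate_pcid() helper defined outside this hunk. A sketch of what that helper has to cover, reconstructed purely from the open-coded logic deleted above; the real helper may differ in detail, e.g. in how it requests the root sync.

/*
 * Sketch of kvm_invalidate_pcid(), reconstructed from the open-coded logic
 * deleted above; illustrative only.
 */
static void kvm_invalidate_pcid_sketch(struct kvm_vcpu *vcpu, unsigned long pcid)
{
        unsigned long roots_to_free = 0;
        int i;

        /* Active PCID: sync the current root and flush its TLB entries. */
        if (kvm_get_active_pcid(vcpu) == pcid) {
                kvm_mmu_sync_roots(vcpu);
                kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
        }

        /* Drop any cached previous root that is tagged with this PCID. */
        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
                if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) == pcid)
                        roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);

        kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);

        /*
         * If neither the current CR3 nor any prev_root uses the PCID there is
         * nothing to do: a resync happens before switching to any other CR3.
         */
}
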
@@ -11765,7 +12205,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 
                fallthrough;
        case INVPCID_TYPE_ALL_INCL_GLOBAL:
-               kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
                return kvm_skip_emulated_instruction(vcpu);
 
        default: