KVM: stats: Separate generic stats from architecture specific ones
[linux-2.6-microblaze.git] / arch / x86 / kvm / x86.c
index 3c5a33a..7120233 100644 (file)
@@ -103,6 +103,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 
 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
 
+#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
+
 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
 
@@ -114,6 +116,9 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 static void store_regs(struct kvm_vcpu *vcpu);
 static int sync_regs(struct kvm_vcpu *vcpu);
 
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+
 struct kvm_x86_ops kvm_x86_ops __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
@@ -210,6 +215,9 @@ EXPORT_SYMBOL_GPL(host_efer);
 bool __read_mostly allow_smaller_maxphyaddr = 0;
 EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
 
+bool __read_mostly enable_apicv = true;
+EXPORT_SYMBOL_GPL(enable_apicv);
+
 u64 __read_mostly host_xss;
 EXPORT_SYMBOL_GPL(host_xss);
 u64 __read_mostly supported_xss;
@@ -227,10 +235,10 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("irq_window", irq_window_exits),
        VCPU_STAT("nmi_window", nmi_window_exits),
        VCPU_STAT("halt_exits", halt_exits),
-       VCPU_STAT("halt_successful_poll", halt_successful_poll),
-       VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
-       VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
-       VCPU_STAT("halt_wakeup", halt_wakeup),
+       VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
+       VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
+       VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
+       VCPU_STAT("halt_wakeup", generic.halt_wakeup),
        VCPU_STAT("hypercalls", hypercalls),
        VCPU_STAT("request_irq", request_irq_exits),
        VCPU_STAT("irq_exits", irq_exits),
@@ -242,11 +250,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("nmi_injections", nmi_injections),
        VCPU_STAT("req_event", req_event),
        VCPU_STAT("l1d_flush", l1d_flush),
-       VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
-       VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+       VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
+       VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
        VCPU_STAT("nested_run", nested_run),
        VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
        VCPU_STAT("directed_yield_successful", directed_yield_successful),
+       VCPU_STAT("guest_mode", guest_mode),
        VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
        VM_STAT("mmu_pte_write", mmu_pte_write),
        VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
@@ -254,7 +263,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VM_STAT("mmu_recycled", mmu_recycled),
        VM_STAT("mmu_cache_miss", mmu_cache_miss),
        VM_STAT("mmu_unsync", mmu_unsync),
-       VM_STAT("remote_tlb_flush", remote_tlb_flush),
+       VM_STAT("remote_tlb_flush", generic.remote_tlb_flush),
        VM_STAT("largepages", lpages, .mode = 0444),
        VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
        VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
@@ -779,13 +788,6 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
 
-static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
-                              void *data, int offset, int len, u32 access)
-{
-       return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
-                                      data, offset, len, access);
-}
-
 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@ -820,6 +822,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 
        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
        kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+       vcpu->arch.pdptrs_from_userspace = false;
 
 out:
 
@@ -827,30 +830,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(load_pdptrs);
 
-bool pdptrs_changed(struct kvm_vcpu *vcpu)
-{
-       u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
-       int offset;
-       gfn_t gfn;
-       int r;
-
-       if (!is_pae_paging(vcpu))
-               return false;
-
-       if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
-               return true;
-
-       gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
-       offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
-       r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
-                                      PFERR_USER_MASK | PFERR_WRITE_MASK);
-       if (r < 0)
-               return true;
-
-       return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
-}
-EXPORT_SYMBOL_GPL(pdptrs_changed);
-
 void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
 {
        unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
@@ -1085,25 +1064,46 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
+static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
+{
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       unsigned long roots_to_free = 0; /* KVM_MMU_ROOT_PREVIOUS(i) bits */
+       int i;
+
+       /*
+        * Request an MMU sync and TLB flush if the active PCID is @pcid, and
+        * free every prev_root tagged with it.  If neither matches, nothing
+        * needs doing: a resync happens anyway before switching to another CR3.
+        */
+       if (kvm_get_active_pcid(vcpu) == pcid) {
+               kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+       }
+
+       for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+               if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
+                       roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+       kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+}
+
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        bool skip_tlb_flush = false;
+       unsigned long pcid = 0;
 #ifdef CONFIG_X86_64
        bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
 
        if (pcid_enabled) {
                skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
                cr3 &= ~X86_CR3_PCID_NOFLUSH;
+               pcid = cr3 & X86_CR3_PCID_MASK;
        }
 #endif
 
-       if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
-               if (!skip_tlb_flush) {
-                       kvm_mmu_sync_roots(vcpu);
-                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-               }
-               return 0;
-       }
+       /* PDPTRs are always reloaded for PAE paging. */
+       if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
+               goto handle_tlb_flush;
 
        /*
         * Do not condition the GPA check on long mode, this helper is used to
@@ -1116,10 +1116,23 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
                return 1;
 
-       kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
+       if (cr3 != kvm_read_cr3(vcpu))
+               kvm_mmu_new_pgd(vcpu, cr3);
+
        vcpu->arch.cr3 = cr3;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
+handle_tlb_flush:
+       /*
+        * A load of CR3 that flushes the TLB flushes only the current PCID,
+        * even if PCID is disabled, in which case PCID=0 is flushed.  It's a
+        * moot point in the end because _disabling_ PCID will flush all PCIDs,
+        * and it's impossible to use a non-zero PCID when PCID is disabled,
+        * i.e. only PCID=0 can be relevant.
+        */
+       if (!skip_tlb_flush)
+               kvm_invalidate_pcid(vcpu, pcid);
+
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
@@ -3550,7 +3563,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_LASTBRANCHTOIP:
        case MSR_IA32_LASTINTFROMIP:
        case MSR_IA32_LASTINTTOIP:
-       case MSR_K8_SYSCFG:
+       case MSR_AMD64_SYSCFG:
        case MSR_K8_TSEG_ADDR:
        case MSR_K8_TSEG_MASK:
        case MSR_VM_HSAVE_PA:
@@ -3951,6 +3964,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_HYPERV_TLBFLUSH:
        case KVM_CAP_HYPERV_SEND_IPI:
        case KVM_CAP_HYPERV_CPUID:
+       case KVM_CAP_HYPERV_ENFORCE_CPUID:
        case KVM_CAP_SYS_HYPERV_CPUID:
        case KVM_CAP_PCI_SEGMENT:
        case KVM_CAP_DEBUGREGS:
@@ -3981,8 +3995,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_SGX_ATTRIBUTE:
 #endif
        case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+       case KVM_CAP_SREGS2:
                r = 1;
                break;
+       case KVM_CAP_EXIT_HYPERCALL:
+               r = KVM_EXIT_HYPERCALL_VALID_MASK;
+               break;
        case KVM_CAP_SET_GUEST_DEBUG2:
                return KVM_GUESTDBG_VALID_MASK;
 #ifdef CONFIG_KVM_XEN
@@ -4529,7 +4547,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
        memset(&events->reserved, 0, sizeof(events->reserved));
 }
 
-static void kvm_smm_changed(struct kvm_vcpu *vcpu);
+static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
 
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                                              struct kvm_vcpu_events *events)
@@ -4589,13 +4607,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
-               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
-                       if (events->smi.smm)
-                               vcpu->arch.hflags |= HF_SMM_MASK;
-                       else
-                               vcpu->arch.hflags &= ~HF_SMM_MASK;
-                       kvm_smm_changed(vcpu);
-               }
+               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
+                       kvm_smm_changed(vcpu, events->smi.smm);
 
                vcpu->arch.smi_pending = events->smi.pending;
 
@@ -4879,6 +4892,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 
                return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
 
+       case KVM_CAP_HYPERV_ENFORCE_CPUID:
+               return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
+
        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
                vcpu->arch.pv_cpuid.enforce = cap->args[0];
                if (vcpu->arch.pv_cpuid.enforce)
@@ -4897,6 +4913,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        void __user *argp = (void __user *)arg;
        int r;
        union {
+               struct kvm_sregs2 *sregs2;
                struct kvm_lapic_state *lapic;
                struct kvm_xsave *xsave;
                struct kvm_xcrs *xcrs;
@@ -5269,6 +5286,28 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                break;
        }
 #endif
+       case KVM_GET_SREGS2: {
+               u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!u.sregs2)
+                       goto out;
+               __get_sregs2(vcpu, u.sregs2);
+               r = -EFAULT;
+               if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_SREGS2: {
+               u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
+               if (IS_ERR(u.sregs2)) {
+                       r = PTR_ERR(u.sregs2);
+                       u.sregs2 = NULL;
+                       goto out;
+               }
+               r = __set_sregs2(vcpu, u.sregs2);
+               break;
+       }
        default:
                r = -EINVAL;
        }
@@ -5588,6 +5627,14 @@ split_irqchip_unlock:
                if (kvm_x86_ops.vm_copy_enc_context_from)
                        r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
                return r;
+       case KVM_CAP_EXIT_HYPERCALL:
+               if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
+                       r = -EINVAL;
+                       break;
+               }
+               kvm->arch.hypercall_exit_enabled = cap->args[0];
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
@@ -7211,23 +7258,22 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
        return emul_to_vcpu(ctxt)->arch.hflags;
 }
 
-static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
+static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
 {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 
-       vcpu->arch.hflags = emul_flags;
-       kvm_mmu_reset_context(vcpu);
+       kvm_smm_changed(vcpu, false); /* entering_smm == false, i.e. the SMM-exit path */
 }
 
-static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
+static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
                                   const char *smstate)
 {
-       return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate);
+       return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate); /* hook's result is returned as-is */
 }
 
-static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
+static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
 {
-       kvm_smm_changed(emul_to_vcpu(ctxt));
+       kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); /* only raises the request on the vCPU */
 }
 
 static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
@@ -7276,9 +7322,9 @@ static const struct x86_emulate_ops emulate_ops = {
        .guest_has_fxsr      = emulator_guest_has_fxsr,
        .set_nmi_mask        = emulator_set_nmi_mask,
        .get_hflags          = emulator_get_hflags,
-       .set_hflags          = emulator_set_hflags,
-       .pre_leave_smm       = emulator_pre_leave_smm,
-       .post_leave_smm      = emulator_post_leave_smm,
+       .exiting_smm         = emulator_exiting_smm,
+       .leave_smm           = emulator_leave_smm,
+       .triple_fault        = emulator_triple_fault,
        .set_xcr             = emulator_set_xcr,
 };
 
@@ -7539,11 +7585,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
 static int complete_emulated_pio(struct kvm_vcpu *vcpu);
 
-static void kvm_smm_changed(struct kvm_vcpu *vcpu)
+static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
 {
-       if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
-               /* This is a good place to trace that we are exiting SMM.  */
-               trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
+       trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
+
+       if (entering_smm) {
+               vcpu->arch.hflags |= HF_SMM_MASK;
+       } else {
+               vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
 
                /* Process a latched INIT or SMI, if any.  */
                kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -8468,16 +8517,15 @@ bool kvm_apicv_activated(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_apicv_activated);
 
-void kvm_apicv_init(struct kvm *kvm, bool enable)
+static void kvm_apicv_init(struct kvm *kvm)
 {
-       if (enable)
+       if (enable_apicv) /* file-scope flag (initialised to true) replaces the old 'enable' argument */
                clear_bit(APICV_INHIBIT_REASON_DISABLE,
                          &kvm->arch.apicv_inhibit_reasons);
        else
                set_bit(APICV_INHIBIT_REASON_DISABLE,
                        &kvm->arch.apicv_inhibit_reasons);
 }
-EXPORT_SYMBOL_GPL(kvm_apicv_init);
 
 static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
 {
@@ -8513,6 +8561,17 @@ no_yield:
        return;
 }
 
+static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
+{ /* Installed as complete_userspace_io by the KVM_HC_MAP_GPA_RANGE exit path. */
+       u64 ret = vcpu->run->hypercall.ret; /* presumably filled in by userspace - confirm */
+
+       if (!is_64_bit_mode(vcpu))
+               ret = (u32)ret; /* outside 64-bit mode keep only the low 32 bits */
+       kvm_rax_write(vcpu, ret); /* hypercall result is handed back in RAX */
+       ++vcpu->stat.hypercalls;
+       return kvm_skip_emulated_instruction(vcpu);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
        unsigned long nr, a0, a1, a2, a3, ret;
@@ -8578,6 +8637,28 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                kvm_sched_yield(vcpu, a0);
                ret = 0;
                break;
+       case KVM_HC_MAP_GPA_RANGE: {
+               u64 gpa = a0, npages = a1, attrs = a2;
+
+               ret = -KVM_ENOSYS;
+               if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
+                       break;
+
+               if (!PAGE_ALIGNED(gpa) || !npages ||
+                   gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
+                       ret = -KVM_EINVAL;
+                       break;
+               }
+
+               vcpu->run->exit_reason        = KVM_EXIT_HYPERCALL;
+               vcpu->run->hypercall.nr       = KVM_HC_MAP_GPA_RANGE;
+               vcpu->run->hypercall.args[0]  = gpa;
+               vcpu->run->hypercall.args[1]  = npages;
+               vcpu->run->hypercall.args[2]  = attrs;
+               vcpu->run->hypercall.longmode = op_64_bit;
+               vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+               return 0;
+       }
        default:
                ret = -KVM_ENOSYS;
                break;
@@ -8998,7 +9079,6 @@ static void enter_smm(struct kvm_vcpu *vcpu)
        char buf[512];
        u32 cr0;
 
-       trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
        memset(buf, 0, 512);
 #ifdef CONFIG_X86_64
        if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
@@ -9008,13 +9088,13 @@ static void enter_smm(struct kvm_vcpu *vcpu)
                enter_smm_save_state_32(vcpu, buf);
 
        /*
-        * Give pre_enter_smm() a chance to make ISA-specific changes to the
-        * vCPU state (e.g. leave guest mode) after we've saved the state into
-        * the SMM state-save area.
+        * Give enter_smm() a chance to make ISA-specific changes to the vCPU
+        * state (e.g. leave guest mode) after we've saved the state into the
+        * SMM state-save area.
         */
-       static_call(kvm_x86_pre_enter_smm)(vcpu, buf);
+       static_call(kvm_x86_enter_smm)(vcpu, buf);
 
-       vcpu->arch.hflags |= HF_SMM_MASK;
+       kvm_smm_changed(vcpu, true);
        kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
 
        if (static_call(kvm_x86_get_nmi_mask)(vcpu))
@@ -9103,6 +9183,15 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
        vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
        kvm_apic_update_apicv(vcpu);
        static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
+
+       /*
+        * When APICv gets disabled, we may still have injected interrupts
+        * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
+        * still active when the interrupt got accepted. Make sure
+        * inject_pending_event() is called to check for that.
+        */
+       if (!vcpu->arch.apicv_active)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
 
@@ -9278,7 +9367,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                }
                if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
                        kvm_vcpu_flush_tlb_current(vcpu);
-               if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+               if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
                        kvm_vcpu_flush_tlb_guest(vcpu);
 
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
@@ -9964,7 +10053,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
 
-static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
        struct desc_ptr dt;
 
@@ -9997,14 +10086,36 @@ skip_protected_regs:
        sregs->cr8 = kvm_get_cr8(vcpu);
        sregs->efer = vcpu->arch.efer;
        sregs->apic_base = kvm_get_apic_base(vcpu);
+}
 
-       memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{ /* NOTE(review): interrupt_bitmap is only OR-ed into here, never cleared - confirm callers zero it. */
+       __get_sregs_common(vcpu, sregs); /* also used by __get_sregs2() */
+
+       if (vcpu->arch.guest_state_protected)
+               return; /* no injected-interrupt bit is reported in this case */
 
        if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
                set_bit(vcpu->arch.interrupt.nr,
                        (unsigned long *)sregs->interrupt_bitmap);
 }
 
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+       int i;
+
+       __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2); /* NOTE(review): cast assumes a shared leading layout - confirm */
+
+       if (vcpu->arch.guest_state_protected)
+               return; /* pdptrs[] and flags are left as the caller passed them */
+
+       if (is_pae_paging(vcpu)) {
+               for (i = 0 ; i < 4 ; i++)
+                       sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
+               sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; /* set only when pdptrs[] was filled in above */
+       }
+}
+
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
 {
@@ -10123,24 +10234,23 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        return kvm_is_valid_cr4(vcpu, sregs->cr4);
 }
 
-static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
+               int *mmu_reset_needed, bool update_pdptrs)
 {
        struct msr_data apic_base_msr;
-       int mmu_reset_needed = 0;
-       int pending_vec, max_bits, idx;
+       int idx;
        struct desc_ptr dt;
-       int ret = -EINVAL;
 
        if (!kvm_is_valid_sregs(vcpu, sregs))
-               goto out;
+               return -EINVAL;
 
        apic_base_msr.data = sregs->apic_base;
        apic_base_msr.host_initiated = true;
        if (kvm_set_apic_base(vcpu, &apic_base_msr))
-               goto out;
+               return -EINVAL;
 
        if (vcpu->arch.guest_state_protected)
-               goto skip_protected_regs;
+               return 0;
 
        dt.size = sregs->idt.limit;
        dt.address = sregs->idt.base;
@@ -10150,31 +10260,30 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        static_call(kvm_x86_set_gdt)(vcpu, &dt);
 
        vcpu->arch.cr2 = sregs->cr2;
-       mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+       *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
        vcpu->arch.cr3 = sregs->cr3;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
        kvm_set_cr8(vcpu, sregs->cr8);
 
-       mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+       *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
        static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
 
-       mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+       *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
        static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
        vcpu->arch.cr0 = sregs->cr0;
 
-       mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+       *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
        static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
 
-       idx = srcu_read_lock(&vcpu->kvm->srcu);
-       if (is_pae_paging(vcpu)) {
-               load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
-               mmu_reset_needed = 1;
+       if (update_pdptrs) {
+               idx = srcu_read_lock(&vcpu->kvm->srcu);
+               if (is_pae_paging(vcpu)) {
+                       load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+                       *mmu_reset_needed = 1;
+               }
+               srcu_read_unlock(&vcpu->kvm->srcu, idx);
        }
-       srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
-       if (mmu_reset_needed)
-               kvm_mmu_reset_context(vcpu);
 
        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
        kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -10194,20 +10303,63 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
            !is_protmode(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
-skip_protected_regs:
+       return 0;
+}
+
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+       int pending_vec, max_bits;
+       int mmu_reset_needed = 0;
+       int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true); /* true: common code calls load_pdptrs() under PAE paging */
+
+       if (ret)
+               return ret; /* -EINVAL from sregs validation or the APIC base write */
+
+       if (mmu_reset_needed)
+               kvm_mmu_reset_context(vcpu);
+
        max_bits = KVM_NR_INTERRUPTS;
        pending_vec = find_first_bit(
                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+
        if (pending_vec < max_bits) {
                kvm_queue_interrupt(vcpu, pending_vec, false);
                pr_debug("Set back pending irq %d\n", pending_vec);
+               kvm_make_request(KVM_REQ_EVENT, vcpu); /* requested only when an interrupt was actually queued */
        }
+       return 0;
+}
 
-       kvm_make_request(KVM_REQ_EVENT, vcpu);
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+       int mmu_reset_needed = 0;
+       bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
+       bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
+               !(sregs2->efer & EFER_LMA); /* PAE paging as implied by the incoming CR0/CR4/EFER, not the current vCPU state */
+       int i, ret;
 
-       ret = 0;
-out:
-       return ret;
+       if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
+               return -EINVAL; /* unknown flag bits are rejected */
+
+       if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
+               return -EINVAL; /* explicit PDPTRs need PAE paging and unprotected guest state */
+
+       ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
+                                &mmu_reset_needed, !valid_pdptrs); /* common code loads PDPTRs itself only when none were supplied */
+       if (ret)
+               return ret;
+
+       if (valid_pdptrs) {
+               for (i = 0; i < 4 ; i++)
+                       kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
+
+               kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+               mmu_reset_needed = 1;
+               vcpu->arch.pdptrs_from_userspace = true; /* load_pdptrs() sets this back to false */
+       }
+       if (mmu_reset_needed)
+               kvm_mmu_reset_context(vcpu);
+       return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
@@ -10495,6 +10647,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        vcpu->arch.pending_external_vector = -1;
        vcpu->arch.preempted_in_kernel = false;
 
+#if IS_ENABLED(CONFIG_HYPERV)
+       vcpu->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
        r = static_call(kvm_x86_vcpu_create)(vcpu);
        if (r)
                goto free_guest_fpu;
@@ -10505,7 +10661,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        vcpu_load(vcpu);
        kvm_set_tsc_khz(vcpu, max_tsc_khz);
        kvm_vcpu_reset(vcpu, false);
-       kvm_init_mmu(vcpu, false);
+       kvm_init_mmu(vcpu);
        vcpu_put(vcpu);
        return 0;
 
@@ -10764,6 +10920,9 @@ int kvm_arch_hardware_setup(void *opaque)
        int r;
 
        rdmsrl_safe(MSR_EFER, &host_efer);
+       if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) &&
+                        !(host_efer & EFER_NX)))
+               return -EIO;
 
        if (boot_cpu_has(X86_FEATURE_XSAVES))
                rdmsrl(MSR_IA32_XSS, host_xss);
@@ -10879,9 +11038,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        kvm->arch.guest_can_read_msr_platform_info = true;
 
+#if IS_ENABLED(CONFIG_HYPERV)
+       spin_lock_init(&kvm->arch.hv_root_tdp_lock);
+       kvm->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
 
+       kvm_apicv_init(kvm);
        kvm_hv_init_vm(kvm);
        kvm_page_track_init(kvm);
        kvm_mmu_init_vm(kvm);
@@ -11878,8 +12043,6 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 {
        bool pcid_enabled;
        struct x86_exception e;
-       unsigned i;
-       unsigned long roots_to_free = 0;
        struct {
                u64 pcid;
                u64 gla;
@@ -11913,23 +12076,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
                        return 1;
                }
 
-               if (kvm_get_active_pcid(vcpu) == operand.pcid) {
-                       kvm_mmu_sync_roots(vcpu);
-                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-               }
-
-               for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
-                           == operand.pcid)
-                               roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
-               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
-               /*
-                * If neither the current cr3 nor any of the prev_roots use the
-                * given PCID, then nothing needs to be done here because a
-                * resync will happen anyway before switching to any other CR3.
-                */
-
+               kvm_invalidate_pcid(vcpu, operand.pcid);
                return kvm_skip_emulated_instruction(vcpu);
 
        case INVPCID_TYPE_ALL_NON_GLOBAL:
@@ -11942,7 +12089,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 
                fallthrough;
        case INVPCID_TYPE_ALL_INCL_GLOBAL:
-               kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
                return kvm_skip_emulated_instruction(vcpu);
 
        default: