Merge branch 'kvm-ppc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus...
author     Paolo Bonzini <pbonzini@redhat.com>
Mon, 20 Feb 2017 10:54:22 +0000 (11:54 +0100)
committer  Paolo Bonzini <pbonzini@redhat.com>
Mon, 20 Feb 2017 10:54:22 +0000 (11:54 +0100)
Paul Mackerras writes:
"Please do a pull from my kvm-ppc-next branch to get some fixes which I
would like to have in 4.11.  There are four small commits there; two
are fixes for potential host crashes in the new HPT resizing code, and
the other two are changes to printks to make KVM on PPC a little less
noisy."

16 files changed:
Documentation/virtual/kvm/api.txt
arch/arm/kvm/arm.c
arch/mips/kvm/mips.c
arch/powerpc/kvm/powerpc.c
arch/s390/kvm/kvm-s390.c
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/cpuid.c
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
drivers/ptp/ptp_kvm.c
include/linux/kvm_host.h
include/uapi/linux/kvm.h
virt/kvm/kvm_main.c

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e4f2cdc..0694509 100644
@@ -3389,7 +3389,18 @@ struct kvm_run {
 Request that KVM_RUN return when it becomes possible to inject external
 interrupts into the guest.  Useful in conjunction with KVM_INTERRUPT.
 
-       __u8 padding1[7];
+       __u8 immediate_exit;
+
+This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN
+exits immediately, returning -EINTR.  In the common scenario where a
+signal is used to "kick" a VCPU out of KVM_RUN, this field can be used
+to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability.
+Rather than blocking the signal outside KVM_RUN, userspace can set up
+a signal handler that sets run->immediate_exit to a non-zero value.
+
+This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available.
+
+       __u8 padding1[6];
 
        /* out */
        __u32 exit_reason;
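
For reference (not part of this patch): a minimal userspace sketch of the
kick pattern the new text describes, assuming a hypothetical VMM in which
the vcpu's kvm_run area is mmap'ed as "run", the vcpu thread is kicked with
SIGUSR1, and kvm_fd/vcpu_fd are the usual /dev/kvm and vcpu file descriptors.
It is an illustration only, not code from this series.

    #include <errno.h>
    #include <signal.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static struct kvm_run *run;             /* mmap'ed kvm_run area of the vcpu fd */
    static volatile sig_atomic_t kick_pending;

    static void kick_handler(int sig)
    {
            kick_pending = 1;
            run->immediate_exit = 1;        /* a KVM_RUN entered now returns -EINTR */
    }

    static void vcpu_loop(int kvm_fd, int vcpu_fd)
    {
            if (!ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IMMEDIATE_EXIT))
                    return;                 /* fall back to KVM_SET_SIGNAL_MASK */

            signal(SIGUSR1, kick_handler);  /* the signal stays unblocked on purpose */

            for (;;) {
                    run->immediate_exit = 0;
                    if (kick_pending) {
                            kick_pending = 0;
                            /* ... service the kick request ... */
                            continue;       /* re-check before entering the guest */
                    }
                    if (ioctl(vcpu_fd, KVM_RUN, 0) < 0 && errno == EINTR)
                            continue;       /* kicked just before or during KVM_RUN */
                    /* ... dispatch on run->exit_reason ... */
            }
    }

The ordering matters: immediate_exit is cleared first, then pending kicks are
checked, then KVM_RUN is issued, so a signal arriving anywhere in that window
either sets immediate_exit (making KVM_RUN return at once) or interrupts the
running vcpu directly.
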
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 21c493a..c9a2103 100644
@@ -206,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_ARM_PSCI_0_2:
        case KVM_CAP_READONLY_MEM:
        case KVM_CAP_MP_STATE:
+       case KVM_CAP_IMMEDIATE_EXIT:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -604,6 +605,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                        return ret;
        }
 
+       if (run->immediate_exit)
+               return -EINTR;
+
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 31ee5ee..ed81e5a 100644
@@ -397,7 +397,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-       int r = 0;
+       int r = -EINTR;
        sigset_t sigsaved;
 
        if (vcpu->sigset_active)
@@ -409,6 +409,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                vcpu->mmio_needed = 0;
        }
 
+       if (run->immediate_exit)
+               goto out;
+
        lose_fpu(1);
 
        local_irq_disable();
@@ -429,6 +432,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
        guest_exit_irqoff();
        local_irq_enable();
 
+out:
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
@@ -1021,6 +1025,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_ENABLE_CAP:
        case KVM_CAP_READONLY_MEM:
        case KVM_CAP_SYNC_MMU:
+       case KVM_CAP_IMMEDIATE_EXIT:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index fcb253b..2b38d82 100644
@@ -511,6 +511,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_ONE_REG:
        case KVM_CAP_IOEVENTFD:
        case KVM_CAP_DEVICE_CTRL:
+       case KVM_CAP_IMMEDIATE_EXIT:
                r = 1;
                break;
        case KVM_CAP_PPC_PAIRED_SINGLES:
@@ -1118,7 +1119,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 #endif
        }
 
-       r = kvmppc_vcpu_run(run, vcpu);
+       if (run->immediate_exit)
+               r = -EINTR;
+       else
+               r = kvmppc_vcpu_run(run, vcpu);
 
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 502de74..99e35fe 100644
@@ -370,6 +370,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_S390_IRQCHIP:
        case KVM_CAP_VM_ATTRIBUTES:
        case KVM_CAP_MP_STATE:
+       case KVM_CAP_IMMEDIATE_EXIT:
        case KVM_CAP_S390_INJECT_IRQ:
        case KVM_CAP_S390_USER_SIGP:
        case KVM_CAP_S390_USER_STSI:
@@ -2798,6 +2799,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        int rc;
        sigset_t sigsaved;
 
+       if (kvm_run->immediate_exit)
+               return -EINTR;
+
        if (guestdbg_exit_pending(vcpu)) {
                kvm_s390_prepare_debug_exit(vcpu);
                return 0;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 417502c..74ef58c 100644
@@ -55,7 +55,6 @@
 #define KVM_REQ_TRIPLE_FAULT      10
 #define KVM_REQ_MMU_SYNC          11
 #define KVM_REQ_CLOCK_UPDATE      12
-#define KVM_REQ_DEACTIVATE_FPU    13
 #define KVM_REQ_EVENT             14
 #define KVM_REQ_APF_HALT          15
 #define KVM_REQ_STEAL_UPDATE      16
@@ -936,8 +935,6 @@ struct kvm_x86_ops {
        unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
        void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
        u32 (*get_pkru)(struct kvm_vcpu *vcpu);
-       void (*fpu_activate)(struct kvm_vcpu *vcpu);
-       void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
 
        void (*tlb_flush)(struct kvm_vcpu *vcpu);
 
@@ -969,7 +966,7 @@ struct kvm_x86_ops {
        void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
        void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
        void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
-       void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
+       int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
        int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
        int (*get_tdp_level)(void);
        u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c0e2036..1d155cc 100644
@@ -123,8 +123,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
        if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
                best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
-       kvm_x86_ops->fpu_activate(vcpu);
-
        /*
         * The existing code assumes virtual address is 48-bit in the canonical
         * address checks; exit if it is ever changed.
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 33b799f..bad6a25 100644
@@ -341,7 +341,7 @@ static int find_highest_vector(void *bitmap)
             vec >= 0; vec -= APIC_VECTORS_PER_REG) {
                reg = bitmap + REG_POS(vec);
                if (*reg)
-                       return fls(*reg) - 1 + vec;
+                       return __fls(*reg) + vec;
        }
 
        return -1;
@@ -361,27 +361,32 @@ static u8 count_vectors(void *bitmap)
        return count;
 }
 
-void __kvm_apic_update_irr(u32 *pir, void *regs)
+int __kvm_apic_update_irr(u32 *pir, void *regs)
 {
-       u32 i, pir_val;
+       u32 i, vec;
+       u32 pir_val, irr_val;
+       int max_irr = -1;
 
-       for (i = 0; i <= 7; i++) {
+       for (i = vec = 0; i <= 7; i++, vec += 32) {
                pir_val = READ_ONCE(pir[i]);
+               irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
                if (pir_val) {
-                       pir_val = xchg(&pir[i], 0);
-                       *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
+                       irr_val |= xchg(&pir[i], 0);
+                       *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
                }
+               if (irr_val)
+                       max_irr = __fls(irr_val) + vec;
        }
+
+       return max_irr;
 }
 EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
 
-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       __kvm_apic_update_irr(pir, apic->regs);
-
-       kvm_make_request(KVM_REQ_EVENT, vcpu);
+       return __kvm_apic_update_irr(pir, apic->regs);
 }
 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
 
@@ -401,8 +406,6 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
        if (!apic->irr_pending)
                return -1;
 
-       if (apic->vcpu->arch.apicv_active)
-               kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
        result = apic_search_irr(apic);
        ASSERT(result == -1 || result >= 16);
 
@@ -416,9 +419,10 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
        vcpu = apic->vcpu;
 
        if (unlikely(vcpu->arch.apicv_active)) {
-               /* try to update RVI */
+               /* need to update RVI */
                apic_clear_vector(vec, apic->regs + APIC_IRR);
-               kvm_make_request(KVM_REQ_EVENT, vcpu);
+               kvm_x86_ops->hwapic_irr_update(vcpu,
+                               apic_find_highest_irr(apic));
        } else {
                apic->irr_pending = false;
                apic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -508,6 +512,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
         */
        return apic_find_highest_irr(vcpu->arch.apic);
 }
+EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
 
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                             int vector, int level, int trig_mode,
@@ -524,16 +529,14 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 
 static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
 {
-
-       return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
-                                     sizeof(val));
+       return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val,
+                                          sizeof(val));
 }
 
 static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
 {
-
-       return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
-                                     sizeof(*val));
+       return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val,
+                                         sizeof(*val));
 }
 
 static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
@@ -572,7 +575,11 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
 
 static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
 {
-       int highest_irr = apic_find_highest_irr(apic);
+       int highest_irr;
+       if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active)
+               highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
+       else
+               highest_irr = apic_find_highest_irr(apic);
        if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
                return -1;
        return highest_irr;
@@ -2204,8 +2211,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
                                1 : count_vectors(apic->regs + APIC_ISR);
        apic->highest_isr_cache = -1;
        if (vcpu->arch.apicv_active) {
-               if (kvm_x86_ops->apicv_post_state_restore)
-                       kvm_x86_ops->apicv_post_state_restore(vcpu);
+               kvm_x86_ops->apicv_post_state_restore(vcpu);
                kvm_x86_ops->hwapic_irr_update(vcpu,
                                apic_find_highest_irr(apic));
                kvm_x86_ops->hwapic_isr_update(vcpu,
@@ -2279,8 +2285,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
        if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
                return;
 
-       if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
-                                 sizeof(u32)))
+       if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
+                                      sizeof(u32)))
                return;
 
        apic_set_tpr(vcpu->arch.apic, data & 0xff);
@@ -2332,14 +2338,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
                max_isr = 0;
        data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
 
-       kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
-                               sizeof(u32));
+       kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
+                                   sizeof(u32));
 }
 
 int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
 {
        if (vapic_addr) {
-               if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+               if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
                                        &vcpu->arch.apic->vapic_cache,
                                        vapic_addr, sizeof(u32)))
                        return -EINVAL;
@@ -2433,7 +2439,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
        vcpu->arch.pv_eoi.msr_val = data;
        if (!pv_eoi_enabled(vcpu))
                return 0;
-       return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+       return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data,
                                         addr, sizeof(u8));
 }
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 05abd83..bcbe811 100644
@@ -71,8 +71,8 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                           int short_hand, unsigned int dest, int dest_mode);
 
-void __kvm_apic_update_irr(u32 *pir, void *regs);
-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
+int __kvm_apic_update_irr(u32 *pir, void *regs);
+int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
 void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
                     struct dest_map *dest_map);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d0414f0..d1efe2c 100644
@@ -971,8 +971,8 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
  * a particular vCPU.
  */
 #define SVM_VM_DATA_HASH_BITS  8
-DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
-static spinlock_t svm_vm_data_hash_lock;
+static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
 
 /* Note:
  * This function is called from IOMMU driver to notify
@@ -1077,8 +1077,6 @@ static __init int svm_hardware_setup(void)
                } else {
                        pr_info("AVIC enabled\n");
 
-                       hash_init(svm_vm_data_hash);
-                       spin_lock_init(&svm_vm_data_hash_lock);
                        amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
                }
        }
@@ -1159,7 +1157,6 @@ static void init_vmcb(struct vcpu_svm *svm)
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
 
-       svm->vcpu.fpu_active = 1;
        svm->vcpu.arch.hflags = 0;
 
        set_cr_intercept(svm, INTERCEPT_CR0_READ);
@@ -1901,15 +1898,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
        ulong gcr0 = svm->vcpu.arch.cr0;
        u64 *hcr0 = &svm->vmcb->save.cr0;
 
-       if (!svm->vcpu.fpu_active)
-               *hcr0 |= SVM_CR0_SELECTIVE_MASK;
-       else
-               *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
-                       | (gcr0 & SVM_CR0_SELECTIVE_MASK);
+       *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
+               | (gcr0 & SVM_CR0_SELECTIVE_MASK);
 
        mark_dirty(svm->vmcb, VMCB_CR);
 
-       if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
+       if (gcr0 == *hcr0) {
                clr_cr_intercept(svm, INTERCEPT_CR0_READ);
                clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
        } else {
@@ -1940,8 +1934,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        if (!npt_enabled)
                cr0 |= X86_CR0_PG | X86_CR0_WP;
 
-       if (!vcpu->fpu_active)
-               cr0 |= X86_CR0_TS;
        /*
         * re-enable caching here because the QEMU bios
         * does not do it - this results in some delay at
@@ -2160,22 +2152,6 @@ static int ac_interception(struct vcpu_svm *svm)
        return 1;
 }
 
-static void svm_fpu_activate(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       clr_exception_intercept(svm, NM_VECTOR);
-
-       svm->vcpu.fpu_active = 1;
-       update_cr0_intercept(svm);
-}
-
-static int nm_interception(struct vcpu_svm *svm)
-{
-       svm_fpu_activate(&svm->vcpu);
-       return 1;
-}
-
 static bool is_erratum_383(void)
 {
        int err, i;
@@ -2573,9 +2549,6 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
                if (!npt_enabled && svm->apf_reason == 0)
                        return NESTED_EXIT_HOST;
                break;
-       case SVM_EXIT_EXCP_BASE + NM_VECTOR:
-               nm_interception(svm);
-               break;
        default:
                break;
        }
@@ -4020,7 +3993,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
        [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
        [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
-       [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
        [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
        [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
        [SVM_EXIT_INTR]                         = intr_interception,
@@ -4359,11 +4331,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
        return;
 }
 
-static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
-{
-       return;
-}
-
 static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
 {
        kvm_lapic_set_irr(vec, vcpu->arch.apic);
@@ -5079,14 +5046,6 @@ static bool svm_has_wbinvd_exit(void)
        return true;
 }
 
-static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       set_exception_intercept(svm, NM_VECTOR);
-       update_cr0_intercept(svm);
-}
-
 #define PRE_EX(exit)  { .exit_code = (exit), \
                        .stage = X86_ICPT_PRE_EXCEPT, }
 #define POST_EX(exit) { .exit_code = (exit), \
@@ -5347,9 +5306,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
        .get_pkru = svm_get_pkru,
 
-       .fpu_activate = svm_fpu_activate,
-       .fpu_deactivate = svm_fpu_deactivate,
-
        .tlb_flush = svm_flush_tlb,
 
        .run = svm_vcpu_run,
@@ -5373,7 +5329,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .get_enable_apicv = svm_get_enable_apicv,
        .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
        .load_eoi_exitmap = svm_load_eoi_exitmap,
-       .sync_pir_to_irr = svm_sync_pir_to_irr,
        .hwapic_irr_update = svm_hwapic_irr_update,
        .hwapic_isr_update = svm_hwapic_isr_update,
        .apicv_post_state_restore = avic_post_state_restore,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7c3e426..9856b73 100644
@@ -1856,7 +1856,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
        u32 eb;
 
        eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
-            (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
+            (1u << DB_VECTOR) | (1u << AC_VECTOR);
        if ((vcpu->guest_debug &
             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
            (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1865,8 +1865,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
                eb = ~0;
        if (enable_ept)
                eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
-       if (vcpu->fpu_active)
-               eb &= ~(1u << NM_VECTOR);
 
        /* When we are running a nested L2 guest and L1 specified for it a
         * certain exception bitmap, we must trap the same exceptions and pass
@@ -2340,25 +2338,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
        }
 }
 
-static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
-{
-       ulong cr0;
-
-       if (vcpu->fpu_active)
-               return;
-       vcpu->fpu_active = 1;
-       cr0 = vmcs_readl(GUEST_CR0);
-       cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
-       cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
-       vmcs_writel(GUEST_CR0, cr0);
-       update_exception_bitmap(vcpu);
-       vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
-       if (is_guest_mode(vcpu))
-               vcpu->arch.cr0_guest_owned_bits &=
-                       ~get_vmcs12(vcpu)->cr0_guest_host_mask;
-       vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-}
-
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
 
 /*
@@ -2377,33 +2356,6 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
                (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
 }
 
-static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
-       /* Note that there is no vcpu->fpu_active = 0 here. The caller must
-        * set this *before* calling this function.
-        */
-       vmx_decache_cr0_guest_bits(vcpu);
-       vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
-       update_exception_bitmap(vcpu);
-       vcpu->arch.cr0_guest_owned_bits = 0;
-       vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-       if (is_guest_mode(vcpu)) {
-               /*
-                * L1's specified read shadow might not contain the TS bit,
-                * so now that we turned on shadowing of this bit, we need to
-                * set this bit of the shadow. Like in nested_vmx_run we need
-                * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
-                * up-to-date here because we just decached cr0.TS (and we'll
-                * only update vmcs12->guest_cr0 on nested exit).
-                */
-               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-               vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
-                       (vcpu->arch.cr0 & X86_CR0_TS);
-               vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
-       } else
-               vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
-}
-
 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 {
        unsigned long rflags, save_rflags;
@@ -4232,9 +4184,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        if (enable_ept)
                ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
 
-       if (!vcpu->fpu_active)
-               hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
-
        vmcs_writel(CR0_READ_SHADOW, cr0);
        vmcs_writel(GUEST_CR0, hw_cr0);
        vcpu->arch.cr0 = cr0;
@@ -5051,26 +5000,12 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
        if (pi_test_and_set_pir(vector, &vmx->pi_desc))
                return;
 
-       r = pi_test_and_set_on(&vmx->pi_desc);
-       kvm_make_request(KVM_REQ_EVENT, vcpu);
-       if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu))
-               kvm_vcpu_kick(vcpu);
-}
-
-static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       if (!pi_test_on(&vmx->pi_desc))
+       /* If a previous notification has sent the IPI, nothing to do.  */
+       if (pi_test_and_set_on(&vmx->pi_desc))
                return;
 
-       pi_clear_on(&vmx->pi_desc);
-       /*
-        * IOMMU can write to PIR.ON, so the barrier matters even on UP.
-        * But on x86 this is just a compiler barrier anyway.
-        */
-       smp_mb__after_atomic();
-       kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+       if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
+               kvm_vcpu_kick(vcpu);
 }
 
 /*
@@ -5335,7 +5270,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        /* 22.2.1, 20.8.1 */
        vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
 
-       vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
+       vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
+       vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
+
        set_cr4_guest_host_mask(vmx);
 
        if (vmx_xsaves_supported())
@@ -5439,7 +5376,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        vmx_set_cr0(vcpu, cr0); /* enter rmode */
        vmx_set_cr4(vcpu, 0);
        vmx_set_efer(vcpu, 0);
-       vmx_fpu_activate(vcpu);
+
        update_exception_bitmap(vcpu);
 
        vpid_sync_context(vmx->vpid);
@@ -5473,26 +5410,20 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
 
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
-       u32 cpu_based_vm_exec_control;
-
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+                     CPU_BASED_VIRTUAL_INTR_PENDING);
 }
 
 static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
-       u32 cpu_based_vm_exec_control;
-
        if (!cpu_has_virtual_nmis() ||
            vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
                enable_irq_window(vcpu);
                return;
        }
 
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+                     CPU_BASED_VIRTUAL_NMI_PENDING);
 }
 
 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -5718,11 +5649,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
        if (is_nmi(intr_info))
                return 1;  /* already handled by vmx_vcpu_run() */
 
-       if (is_no_device(intr_info)) {
-               vmx_fpu_activate(vcpu);
-               return 1;
-       }
-
        if (is_invalid_opcode(intr_info)) {
                if (is_guest_mode(vcpu)) {
                        kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5912,22 +5838,6 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
                return kvm_set_cr4(vcpu, val);
 }
 
-/* called to set cr0 as appropriate for clts instruction exit. */
-static void handle_clts(struct kvm_vcpu *vcpu)
-{
-       if (is_guest_mode(vcpu)) {
-               /*
-                * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
-                * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
-                * just pretend it's off (also in arch.cr0 for fpu_activate).
-                */
-               vmcs_writel(CR0_READ_SHADOW,
-                       vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
-               vcpu->arch.cr0 &= ~X86_CR0_TS;
-       } else
-               vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
-}
-
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification, val;
@@ -5973,9 +5883,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                }
                break;
        case 2: /* clts */
-               handle_clts(vcpu);
+               WARN_ONCE(1, "Guest should always own CR0.TS");
+               vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
                trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
-               vmx_fpu_activate(vcpu);
                return kvm_skip_emulated_instruction(vcpu);
        case 1: /*mov from cr*/
                switch (cr) {
@@ -6151,12 +6061,8 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
 
 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 {
-       u32 cpu_based_vm_exec_control;
-
-       /* clear pending irq */
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+                       CPU_BASED_VIRTUAL_INTR_PENDING);
 
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 
@@ -6382,6 +6288,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
                        EPT_VIOLATION_EXECUTABLE))
                      ? PFERR_PRESENT_MASK : 0;
 
+       vcpu->arch.gpa_available = true;
        vcpu->arch.exit_qualification = exit_qualification;
 
        return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@ -6399,6 +6306,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
        }
 
        ret = handle_mmio_page_fault(vcpu, gpa, true);
+       vcpu->arch.gpa_available = true;
        if (likely(ret == RET_MMIO_PF_EMULATE))
                return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
                                              EMULATE_DONE;
@@ -6420,12 +6328,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 
 static int handle_nmi_window(struct kvm_vcpu *vcpu)
 {
-       u32 cpu_based_vm_exec_control;
-
-       /* clear pending NMI */
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+                       CPU_BASED_VIRTUAL_NMI_PENDING);
        ++vcpu->stat.nmi_window_exits;
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 
@@ -6663,8 +6567,10 @@ static __init int hardware_setup(void)
        if (!cpu_has_vmx_ple())
                ple_gap = 0;
 
-       if (!cpu_has_vmx_apicv())
+       if (!cpu_has_vmx_apicv()) {
                enable_apicv = 0;
+               kvm_x86_ops->sync_pir_to_irr = NULL;
+       }
 
        if (cpu_has_vmx_tsc_scaling()) {
                kvm_has_tsc_control = true;
@@ -7134,6 +7040,53 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
        return 0;
 }
 
+static int enter_vmx_operation(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct vmcs *shadow_vmcs;
+
+       if (cpu_has_vmx_msr_bitmap()) {
+               vmx->nested.msr_bitmap =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+               if (!vmx->nested.msr_bitmap)
+                       goto out_msr_bitmap;
+       }
+
+       vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+       if (!vmx->nested.cached_vmcs12)
+               goto out_cached_vmcs12;
+
+       if (enable_shadow_vmcs) {
+               shadow_vmcs = alloc_vmcs();
+               if (!shadow_vmcs)
+                       goto out_shadow_vmcs;
+               /* mark vmcs as shadow */
+               shadow_vmcs->revision_id |= (1u << 31);
+               /* init shadow vmcs */
+               vmcs_clear(shadow_vmcs);
+               vmx->vmcs01.shadow_vmcs = shadow_vmcs;
+       }
+
+       INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
+       vmx->nested.vmcs02_num = 0;
+
+       hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+                    HRTIMER_MODE_REL_PINNED);
+       vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
+       vmx->nested.vmxon = true;
+       return 0;
+
+out_shadow_vmcs:
+       kfree(vmx->nested.cached_vmcs12);
+
+out_cached_vmcs12:
+       free_page((unsigned long)vmx->nested.msr_bitmap);
+
+out_msr_bitmap:
+       return -ENOMEM;
+}
+
 /*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
@@ -7144,9 +7097,9 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
  */
 static int handle_vmon(struct kvm_vcpu *vcpu)
 {
+       int ret;
        struct kvm_segment cs;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct vmcs *shadow_vmcs;
        const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
                | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 
@@ -7186,49 +7139,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 
        if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
                return 1;
-
-       if (cpu_has_vmx_msr_bitmap()) {
-               vmx->nested.msr_bitmap =
-                               (unsigned long *)__get_free_page(GFP_KERNEL);
-               if (!vmx->nested.msr_bitmap)
-                       goto out_msr_bitmap;
-       }
-
-       vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
-       if (!vmx->nested.cached_vmcs12)
-               goto out_cached_vmcs12;
-
-       if (enable_shadow_vmcs) {
-               shadow_vmcs = alloc_vmcs();
-               if (!shadow_vmcs)
-                       goto out_shadow_vmcs;
-               /* mark vmcs as shadow */
-               shadow_vmcs->revision_id |= (1u << 31);
-               /* init shadow vmcs */
-               vmcs_clear(shadow_vmcs);
-               vmx->vmcs01.shadow_vmcs = shadow_vmcs;
-       }
-
-       INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
-       vmx->nested.vmcs02_num = 0;
-
-       hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
-                    HRTIMER_MODE_REL_PINNED);
-       vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
-
-       vmx->nested.vmxon = true;
+       ret = enter_vmx_operation(vcpu);
+       if (ret)
+               return ret;
 
        nested_vmx_succeed(vcpu);
        return kvm_skip_emulated_instruction(vcpu);
-
-out_shadow_vmcs:
-       kfree(vmx->nested.cached_vmcs12);
-
-out_cached_vmcs12:
-       free_page((unsigned long)vmx->nested.msr_bitmap);
-
-out_msr_bitmap:
-       return -ENOMEM;
 }
 
 /*
@@ -7677,6 +7594,18 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
+static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+       vmx->nested.current_vmptr = vmptr;
+       if (enable_shadow_vmcs) {
+               vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+                             SECONDARY_EXEC_SHADOW_VMCS);
+               vmcs_write64(VMCS_LINK_POINTER,
+                            __pa(vmx->vmcs01.shadow_vmcs));
+               vmx->nested.sync_shadow_vmcs = true;
+       }
+}
+
 /* Emulate the VMPTRLD instruction */
 static int handle_vmptrld(struct kvm_vcpu *vcpu)
 {
@@ -7707,7 +7636,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
                }
 
                nested_release_vmcs12(vmx);
-               vmx->nested.current_vmptr = vmptr;
                vmx->nested.current_vmcs12 = new_vmcs12;
                vmx->nested.current_vmcs12_page = page;
                /*
@@ -7716,14 +7644,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
                 */
                memcpy(vmx->nested.cached_vmcs12,
                       vmx->nested.current_vmcs12, VMCS12_SIZE);
-
-               if (enable_shadow_vmcs) {
-                       vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
-                                     SECONDARY_EXEC_SHADOW_VMCS);
-                       vmcs_write64(VMCS_LINK_POINTER,
-                                    __pa(vmx->vmcs01.shadow_vmcs));
-                       vmx->nested.sync_shadow_vmcs = true;
-               }
+               set_current_vmptr(vmx, vmptr);
        }
 
        nested_vmx_succeed(vcpu);
@@ -8517,6 +8438,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        u32 vectoring_info = vmx->idt_vectoring_info;
 
        trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+       vcpu->arch.gpa_available = false;
 
        /*
         * Flush logged GPAs PML buffer, this will make dirty_bitmap more
@@ -8735,6 +8657,27 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
        }
 }
 
+static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int max_irr;
+
+       WARN_ON(!vcpu->arch.apicv_active);
+       if (pi_test_on(&vmx->pi_desc)) {
+               pi_clear_on(&vmx->pi_desc);
+               /*
+                * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+                * But on x86 this is just a compiler barrier anyway.
+                */
+               smp_mb__after_atomic();
+               max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+       } else {
+               max_irr = kvm_lapic_find_highest_irr(vcpu);
+       }
+       vmx_hwapic_irr_update(vcpu, max_irr);
+       return max_irr;
+}
+
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
        if (!kvm_vcpu_apicv_active(vcpu))
@@ -8746,6 +8689,14 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
        vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
 }
 
+static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       pi_clear_on(&vmx->pi_desc);
+       memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
+}
+
 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -9591,17 +9542,16 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
                kvm_inject_page_fault(vcpu, fault);
 }
 
-static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
+static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+                                              struct vmcs12 *vmcs12);
+
+static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                                        struct vmcs12 *vmcs12)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       int maxphyaddr = cpuid_maxphyaddr(vcpu);
+       u64 hpa;
 
        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-               if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
-                   vmcs12->apic_access_addr >> maxphyaddr)
-                       return false;
-
                /*
                 * Translate L1 physical address to host physical
                 * address for vmcs02. Keep the page pinned, so this
@@ -9612,59 +9562,80 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                        nested_release_page(vmx->nested.apic_access_page);
                vmx->nested.apic_access_page =
                        nested_get_page(vcpu, vmcs12->apic_access_addr);
+               /*
+                * If translation failed, no matter: This feature asks
+                * to exit when accessing the given address, and if it
+                * can never be accessed, this feature won't do
+                * anything anyway.
+                */
+               if (vmx->nested.apic_access_page) {
+                       hpa = page_to_phys(vmx->nested.apic_access_page);
+                       vmcs_write64(APIC_ACCESS_ADDR, hpa);
+               } else {
+                       vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+                                       SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+               }
+       } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
+                  cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
+               vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+                             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+               kvm_vcpu_reload_apic_access_page(vcpu);
        }
 
        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-               if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
-                   vmcs12->virtual_apic_page_addr >> maxphyaddr)
-                       return false;
-
                if (vmx->nested.virtual_apic_page) /* shouldn't happen */
                        nested_release_page(vmx->nested.virtual_apic_page);
                vmx->nested.virtual_apic_page =
                        nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
 
                /*
-                * Failing the vm entry is _not_ what the processor does
-                * but it's basically the only possibility we have.
-                * We could still enter the guest if CR8 load exits are
-                * enabled, CR8 store exits are enabled, and virtualize APIC
-                * access is disabled; in this case the processor would never
-                * use the TPR shadow and we could simply clear the bit from
-                * the execution control.  But such a configuration is useless,
-                * so let's keep the code simple.
+                * If translation failed, VM entry will fail because
+                * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
+                * Failing the vm entry is _not_ what the processor
+                * does but it's basically the only possibility we
+                * have.  We could still enter the guest if CR8 load
+                * exits are enabled, CR8 store exits are enabled, and
+                * virtualize APIC access is disabled; in this case
+                * the processor would never use the TPR shadow and we
+                * could simply clear the bit from the execution
+                * control.  But such a configuration is useless, so
+                * let's keep the code simple.
                 */
-               if (!vmx->nested.virtual_apic_page)
-                       return false;
+               if (vmx->nested.virtual_apic_page) {
+                       hpa = page_to_phys(vmx->nested.virtual_apic_page);
+                       vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
+               }
        }
 
        if (nested_cpu_has_posted_intr(vmcs12)) {
-               if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
-                   vmcs12->posted_intr_desc_addr >> maxphyaddr)
-                       return false;
-
                if (vmx->nested.pi_desc_page) { /* shouldn't happen */
                        kunmap(vmx->nested.pi_desc_page);
                        nested_release_page(vmx->nested.pi_desc_page);
                }
                vmx->nested.pi_desc_page =
                        nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
-               if (!vmx->nested.pi_desc_page)
-                       return false;
-
                vmx->nested.pi_desc =
                        (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
                if (!vmx->nested.pi_desc) {
                        nested_release_page_clean(vmx->nested.pi_desc_page);
-                       return false;
+                       return;
                }
                vmx->nested.pi_desc =
                        (struct pi_desc *)((void *)vmx->nested.pi_desc +
                        (unsigned long)(vmcs12->posted_intr_desc_addr &
                        (PAGE_SIZE - 1)));
+               vmcs_write64(POSTED_INTR_DESC_ADDR,
+                       page_to_phys(vmx->nested.pi_desc_page) +
+                       (unsigned long)(vmcs12->posted_intr_desc_addr &
+                       (PAGE_SIZE - 1)));
        }
-
-       return true;
+       if (cpu_has_vmx_msr_bitmap() &&
+           nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
+           nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
+               ;
+       else
+               vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+                               CPU_BASED_USE_MSR_BITMAPS);
 }
 
 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
@@ -9980,7 +9951,7 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
  * is assigned to entry_failure_code on failure.
  */
 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
-                              unsigned long *entry_failure_code)
+                              u32 *entry_failure_code)
 {
        if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
                if (!nested_cr3_valid(vcpu, cr3)) {
@@ -10020,7 +9991,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
  * is assigned to entry_failure_code on failure.
  */
 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-                         unsigned long *entry_failure_code)
+                         bool from_vmentry, u32 *entry_failure_code)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exec_control;
@@ -10063,21 +10034,26 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
        vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
 
-       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+       if (from_vmentry &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
                kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
                vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
        } else {
                kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
                vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
        }
-       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-               vmcs12->vm_entry_intr_info_field);
-       vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-               vmcs12->vm_entry_exception_error_code);
-       vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-               vmcs12->vm_entry_instruction_len);
-       vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-               vmcs12->guest_interruptibility_info);
+       if (from_vmentry) {
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                            vmcs12->vm_entry_intr_info_field);
+               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+                            vmcs12->vm_entry_exception_error_code);
+               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+                            vmcs12->vm_entry_instruction_len);
+               vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+                            vmcs12->guest_interruptibility_info);
+       } else {
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+       }
        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
        vmx_set_rflags(vcpu, vmcs12->guest_rflags);
        vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
@@ -10106,12 +10082,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
                vmx->nested.pi_pending = false;
                vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
-               vmcs_write64(POSTED_INTR_DESC_ADDR,
-                       page_to_phys(vmx->nested.pi_desc_page) +
-                       (unsigned long)(vmcs12->posted_intr_desc_addr &
-                       (PAGE_SIZE - 1)));
-       } else
+       } else {
                exec_control &= ~PIN_BASED_POSTED_INTR;
+       }
 
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
 
@@ -10156,26 +10129,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
                        exec_control |= vmcs12->secondary_vm_exec_control;
 
-               if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
-                       /*
-                        * If translation failed, no matter: This feature asks
-                        * to exit when accessing the given address, and if it
-                        * can never be accessed, this feature won't do
-                        * anything anyway.
-                        */
-                       if (!vmx->nested.apic_access_page)
-                               exec_control &=
-                                 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-                       else
-                               vmcs_write64(APIC_ACCESS_ADDR,
-                                 page_to_phys(vmx->nested.apic_access_page));
-               } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
-                           cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
-                       exec_control |=
-                               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-                       kvm_vcpu_reload_apic_access_page(vcpu);
-               }
-
                if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
                        vmcs_write64(EOI_EXIT_BITMAP0,
                                vmcs12->eoi_exit_bitmap0);
@@ -10190,6 +10143,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                }
 
                nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
+
+               /*
+                * Write an illegal value to APIC_ACCESS_ADDR. Later,
+                * nested_get_vmcs12_pages will either fix it up or
+                * remove the VM execution control.
+                */
+               if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
+                       vmcs_write64(APIC_ACCESS_ADDR, -1ull);
+
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
        }
 
@@ -10226,19 +10188,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        exec_control &= ~CPU_BASED_TPR_SHADOW;
        exec_control |= vmcs12->cpu_based_vm_exec_control;
 
+       /*
+        * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
+        * nested_get_vmcs12_pages can't fix it up, the illegal value
+        * will result in a VM entry failure.
+        */
        if (exec_control & CPU_BASED_TPR_SHADOW) {
-               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
-                               page_to_phys(vmx->nested.virtual_apic_page));
+               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
                vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
        }
 
-       if (cpu_has_vmx_msr_bitmap() &&
-           exec_control & CPU_BASED_USE_MSR_BITMAPS &&
-           nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
-               ; /* MSR_BITMAP will be set by following vmx_set_efer. */
-       else
-               exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
-
        /*
         * Merging of IO bitmap not currently supported.
         * Rather, exit every time.
@@ -10270,16 +10229,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                        ~VM_ENTRY_IA32E_MODE) |
                (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
 
-       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
+       if (from_vmentry &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
                vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
                vcpu->arch.pat = vmcs12->guest_ia32_pat;
-       } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+       } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
-
+       }
 
        set_cr4_guest_host_mask(vmx);
 
-       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
+       if (from_vmentry &&
+           vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
                vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
 
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
@@ -10318,8 +10279,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        }
 
        /*
-        * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
-        * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
+        * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
+        * bits which we consider mandatory enabled.
         * The CR0_READ_SHADOW is what L2 should have expected to read given
         * the specifications by L1; It's not enough to take
         * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
@@ -10331,7 +10292,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmx_set_cr4(vcpu, vmcs12->guest_cr4);
        vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
 
-       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
+       if (from_vmentry &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
                vcpu->arch.efer = vmcs12->guest_ia32_efer;
        else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
                vcpu->arch.efer |= (EFER_LMA | EFER_LME);
@@ -10365,73 +10327,22 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        return 0;
 }
 
-/*
- * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
- * for running an L2 nested guest.
- */
-static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
-       struct vmcs12 *vmcs12;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       int cpu;
-       struct loaded_vmcs *vmcs02;
-       bool ia32e;
-       u32 msr_entry_idx;
-       unsigned long exit_qualification;
-
-       if (!nested_vmx_check_permission(vcpu))
-               return 1;
-
-       if (!nested_vmx_check_vmcs12(vcpu))
-               goto out;
-
-       vmcs12 = get_vmcs12(vcpu);
-
-       if (enable_shadow_vmcs)
-               copy_shadow_to_vmcs12(vmx);
-
-       /*
-        * The nested entry process starts with enforcing various prerequisites
-        * on vmcs12 as required by the Intel SDM, and act appropriately when
-        * they fail: As the SDM explains, some conditions should cause the
-        * instruction to fail, while others will cause the instruction to seem
-        * to succeed, but return an EXIT_REASON_INVALID_STATE.
-        * To speed up the normal (success) code path, we should avoid checking
-        * for misconfigurations which will anyway be caught by the processor
-        * when using the merged vmcs02.
-        */
-       if (vmcs12->launch_state == launch) {
-               nested_vmx_failValid(vcpu,
-                       launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
-                              : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
-               goto out;
-       }
 
        if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
-           vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
-               nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-               goto out;
-       }
+           vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
-       if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
-               nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-               goto out;
-       }
-
-       if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
-               nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-               goto out;
-       }
+       if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
-       if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
-               nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-               goto out;
-       }
+       if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
-       if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
-               nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-               goto out;
-       }
+       if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
        if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
                                vmx->nested.nested_vmx_procbased_ctls_low,
@@ -10448,28 +10359,30 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
            !vmx_control_verify(vmcs12->vm_entry_controls,
                                vmx->nested.nested_vmx_entry_ctls_low,
                                vmx->nested.nested_vmx_entry_ctls_high))
-       {
-               nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-               goto out;
-       }
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
        if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
            !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
-           !nested_cr3_valid(vcpu, vmcs12->host_cr3)) {
-               nested_vmx_failValid(vcpu,
-                       VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
-               goto out;
-       }
+           !nested_cr3_valid(vcpu, vmcs12->host_cr3))
+               return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+
+       return 0;
+}
+
+static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+                                 u32 *exit_qual)
+{
+       bool ia32e;
+
+       *exit_qual = ENTRY_FAIL_DEFAULT;
 
        if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
-           !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) {
-               nested_vmx_entry_failure(vcpu, vmcs12,
-                       EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+           !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
                return 1;
-       }
-       if (vmcs12->vmcs_link_pointer != -1ull) {
-               nested_vmx_entry_failure(vcpu, vmcs12,
-                       EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
+
+       if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) &&
+           vmcs12->vmcs_link_pointer != -1ull) {
+               *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
                return 1;
        }
 
@@ -10482,16 +10395,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
         *   CR0.PG) is 1.
         */
-       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
+       if (to_vmx(vcpu)->nested.nested_run_pending &&
+           (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
                ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
                if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
                    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
                    ((vmcs12->guest_cr0 & X86_CR0_PG) &&
-                    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
-                       nested_vmx_entry_failure(vcpu, vmcs12,
-                               EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+                    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
                        return 1;
-               }
        }
 
        /*
@@ -10505,28 +10416,26 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
                         VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
                if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
                    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
-                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
-                       nested_vmx_entry_failure(vcpu, vmcs12,
-                               EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
                        return 1;
-               }
        }
 
-       /*
-        * We're finally done with prerequisite checking, and can start with
-        * the nested entry.
-        */
+       return 0;
+}
+
+static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       struct loaded_vmcs *vmcs02;
+       int cpu;
+       u32 msr_entry_idx;
+       u32 exit_qual;
 
        vmcs02 = nested_get_current_vmcs02(vmx);
        if (!vmcs02)
                return -ENOMEM;
 
-       /*
-        * After this point, the trap flag no longer triggers a singlestep trap
-        * on the vm entry instructions. Don't call
-        * kvm_skip_emulated_instruction.
-        */
-       skip_emulated_instruction(vcpu);
        enter_guest_mode(vcpu);
 
        if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
@@ -10541,14 +10450,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
        vmx_segment_cache_clear(vmx);
 
-       if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) {
+       if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
                leave_guest_mode(vcpu);
                vmx_load_vmcs01(vcpu);
                nested_vmx_entry_failure(vcpu, vmcs12,
-                               EXIT_REASON_INVALID_STATE, exit_qualification);
+                                        EXIT_REASON_INVALID_STATE, exit_qual);
                return 1;
        }
 
+       nested_get_vmcs12_pages(vcpu, vmcs12);
+
        msr_entry_idx = nested_vmx_load_msr(vcpu,
                                            vmcs12->vm_entry_msr_load_addr,
                                            vmcs12->vm_entry_msr_load_count);
@@ -10562,17 +10473,90 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
        vmcs12->launch_state = 1;
 
-       if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
-               return kvm_vcpu_halt(vcpu);
-
-       vmx->nested.nested_run_pending = 1;
-
        /*
         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
         * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
         * returned as far as L1 is concerned. It will only return (and set
         * the success flag) when L2 exits (see nested_vmx_vmexit()).
         */
+       return 0;
+}
+
+/*
+ * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
+ * for running an L2 nested guest.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+       struct vmcs12 *vmcs12;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 exit_qual;
+       int ret;
+
+       if (!nested_vmx_check_permission(vcpu))
+               return 1;
+
+       if (!nested_vmx_check_vmcs12(vcpu))
+               goto out;
+
+       vmcs12 = get_vmcs12(vcpu);
+
+       if (enable_shadow_vmcs)
+               copy_shadow_to_vmcs12(vmx);
+
+       /*
+        * The nested entry process starts by enforcing various prerequisites
+        * on vmcs12 as required by the Intel SDM, and acting appropriately when
+        * they fail: as the SDM explains, some conditions should cause the
+        * instruction to fail, while others will cause the instruction to seem
+        * to succeed, but return an EXIT_REASON_INVALID_STATE.
+        * To speed up the normal (success) code path, we avoid checking
+        * for misconfigurations which will be caught anyway by the processor
+        * when using the merged vmcs02.
+        */
+       if (vmcs12->launch_state == launch) {
+               nested_vmx_failValid(vcpu,
+                       launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+                              : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+               goto out;
+       }
+
+       ret = check_vmentry_prereqs(vcpu, vmcs12);
+       if (ret) {
+               nested_vmx_failValid(vcpu, ret);
+               goto out;
+       }
+
+       /*
+        * After this point, the trap flag no longer triggers a singlestep trap
+        * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
+        * This is not 100% correct; for performance reasons, we delegate most
+        * of the checks on host state to the processor.  If those fail,
+        * the singlestep trap is missed.
+        */
+       skip_emulated_instruction(vcpu);
+
+       ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
+       if (ret) {
+               nested_vmx_entry_failure(vcpu, vmcs12,
+                                        EXIT_REASON_INVALID_STATE, exit_qual);
+               return 1;
+       }
+
+       /*
+        * We're finally done with prerequisite checking, and can start with
+        * the nested entry.
+        */
+
+       ret = enter_vmx_non_root_mode(vcpu, true);
+       if (ret)
+               return ret;
+
+       if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
+               return kvm_vcpu_halt(vcpu);
+
+       vmx->nested.nested_run_pending = 1;
+
        return 1;
 
 out:
@@ -10713,21 +10697,13 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
 }
 
 /*
- * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
- * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
- * and this function updates it to reflect the changes to the guest state while
- * L2 was running (and perhaps made some exits which were handled directly by L0
- * without going back to L1), and to reflect the exit reason.
- * Note that we do not have to copy here all VMCS fields, just those that
- * could have changed by the L2 guest or the exit - i.e., the guest-state and
- * exit-information fields only. Other fields are modified by L1 with VMWRITE,
- * which already writes to vmcs12 directly.
+ * Update the guest state fields of vmcs12 to reflect changes that
+ * occurred while L2 was running. (The "IA-32e mode guest" bit of the
+ * VM-entry controls is also updated, since this is really a guest
+ * state bit.)
  */
-static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-                          u32 exit_reason, u32 exit_intr_info,
-                          unsigned long exit_qualification)
+static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
-       /* update guest state fields: */
        vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
        vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
 
@@ -10833,6 +10809,25 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
        if (nested_cpu_has_xsaves(vmcs12))
                vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy all VMCS fields here, just those that
+ * could have been changed by the L2 guest or the exit - i.e., the guest-state
+ * and exit-information fields only. Other fields are modified by L1 with
+ * VMWRITE, which already writes to vmcs12 directly.
+ */
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+                          u32 exit_reason, u32 exit_intr_info,
+                          unsigned long exit_qualification)
+{
+       /* update guest state fields: */
+       sync_vmcs12(vcpu, vmcs12);
 
        /* update exit information fields: */
 
@@ -10883,7 +10878,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                                   struct vmcs12 *vmcs12)
 {
        struct kvm_segment seg;
-       unsigned long entry_failure_code;
+       u32 entry_failure_code;
 
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
                vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -10898,24 +10893,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
        /*
         * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
-        * actually changed, because it depends on the current state of
-        * fpu_active (which may have changed).
-        * Note that vmx_set_cr0 refers to efer set above.
+        * actually changed, because vmx_set_cr0 refers to efer set above.
+        *
+        * CR0_GUEST_HOST_MASK is already set in the original vmcs01
+        * (KVM doesn't change it).
         */
+       vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
        vmx_set_cr0(vcpu, vmcs12->host_cr0);
-       /*
-        * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
-        * to apply the same changes to L1's vmcs. We just set cr0 correctly,
-        * but we also need to update cr0_guest_host_mask and exception_bitmap.
-        */
-       update_exception_bitmap(vcpu);
-       vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
-       vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-       /*
-        * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
-        * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
-        */
+       /* Same as above - no reason to call set_cr4_guest_host_mask().  */
        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
        kvm_set_cr4(vcpu, vmcs12->host_cr4);
 
@@ -11544,9 +11530,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
        .get_pkru = vmx_get_pkru,
 
-       .fpu_activate = vmx_fpu_activate,
-       .fpu_deactivate = vmx_fpu_deactivate,
-
        .tlb_flush = vmx_flush_tlb,
 
        .run = vmx_vcpu_run,
@@ -11571,6 +11554,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .get_enable_apicv = vmx_get_enable_apicv,
        .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
        .load_eoi_exitmap = vmx_load_eoi_exitmap,
+       .apicv_post_state_restore = vmx_apicv_post_state_restore,
        .hwapic_irr_update = vmx_hwapic_irr_update,
        .hwapic_isr_update = vmx_hwapic_isr_update,
        .sync_pir_to_irr = vmx_sync_pir_to_irr,
index 2f64e5d..c484040 100644 (file)
@@ -1811,7 +1811,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
        struct kvm_vcpu_arch *vcpu = &v->arch;
        struct pvclock_vcpu_time_info guest_hv_clock;
 
-       if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+       if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time,
                &guest_hv_clock, sizeof(guest_hv_clock))))
                return;
 
@@ -1832,9 +1832,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
        BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
 
        vcpu->hv_clock.version = guest_hv_clock.version + 1;
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock.version));
+       kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
+                                   &vcpu->hv_clock,
+                                   sizeof(vcpu->hv_clock.version));
 
        smp_wmb();
 
@@ -1848,16 +1848,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
 
        trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
 
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock));
+       kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
+                                   &vcpu->hv_clock,
+                                   sizeof(vcpu->hv_clock));
 
        smp_wmb();
 
        vcpu->hv_clock.version++;
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock.version));
+       kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
+                                   &vcpu->hv_clock,
+                                   sizeof(vcpu->hv_clock.version));
 }
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -2090,7 +2090,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
                return 0;
        }
 
-       if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+       if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa,
                                        sizeof(u32)))
                return 1;
 
@@ -2109,7 +2109,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
                return;
 
-       if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+       if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime,
                &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
                return;
 
@@ -2120,7 +2120,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 
        vcpu->arch.st.steal.version += 1;
 
-       kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+       kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
                &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
 
        smp_wmb();
@@ -2129,14 +2129,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
                vcpu->arch.st.last_steal;
        vcpu->arch.st.last_steal = current->sched_info.run_delay;
 
-       kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+       kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
                &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
 
        smp_wmb();
 
        vcpu->arch.st.steal.version += 1;
 
-       kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+       kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
                &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
 }
 
@@ -2241,7 +2241,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (!(data & 1))
                        break;
 
-               if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+               if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
                     &vcpu->arch.pv_time, data & ~1ULL,
                     sizeof(struct pvclock_vcpu_time_info)))
                        vcpu->arch.pv_time_enabled = false;
@@ -2262,7 +2262,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (data & KVM_STEAL_RESERVED_MASK)
                        return 1;
 
-               if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
+               if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime,
                                                data & KVM_STEAL_VALID_BITS,
                                                sizeof(struct kvm_steal_time)))
                        return 1;
@@ -2672,6 +2672,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_DISABLE_QUIRKS:
        case KVM_CAP_SET_BOOT_CPU_ID:
        case KVM_CAP_SPLIT_IRQCHIP:
+       case KVM_CAP_IMMEDIATE_EXIT:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
        case KVM_CAP_ASSIGN_DEV_IRQ:
        case KVM_CAP_PCI_2_3:
@@ -2875,7 +2876,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 
        vcpu->arch.st.steal.preempted = 1;
 
-       kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
+       kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime,
                        &vcpu->arch.st.steal.preempted,
                        offsetof(struct kvm_steal_time, preempted),
                        sizeof(vcpu->arch.st.steal.preempted));
@@ -2909,7 +2910,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
 {
-       if (vcpu->arch.apicv_active)
+       if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
                kvm_x86_ops->sync_pir_to_irr(vcpu);
 
        return kvm_apic_get_state(vcpu, s);
@@ -6659,7 +6660,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
        if (irqchip_split(vcpu->kvm))
                kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
        else {
-               if (vcpu->arch.apicv_active)
+               if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
                        kvm_x86_ops->sync_pir_to_irr(vcpu);
                kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
        }
@@ -6750,10 +6751,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        r = 0;
                        goto out;
                }
-               if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
-                       vcpu->fpu_active = 0;
-                       kvm_x86_ops->fpu_deactivate(vcpu);
-               }
                if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
                        /* Page is swapped out. Do synthetic halt */
                        vcpu->arch.apf.halted = true;
@@ -6813,20 +6810,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        kvm_hv_process_stimers(vcpu);
        }
 
-       /*
-        * KVM_REQ_EVENT is not set when posted interrupts are set by
-        * VT-d hardware, so we have to update RVI unconditionally.
-        */
-       if (kvm_lapic_enabled(vcpu)) {
-               /*
-                * Update architecture specific hints for APIC
-                * virtual interrupt delivery.
-                */
-               if (vcpu->arch.apicv_active)
-                       kvm_x86_ops->hwapic_irr_update(vcpu,
-                               kvm_lapic_find_highest_irr(vcpu));
-       }
-
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
                ++vcpu->stat.req_event;
                kvm_apic_accept_events(vcpu);
@@ -6869,22 +6852,40 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        preempt_disable();
 
        kvm_x86_ops->prepare_guest_switch(vcpu);
-       if (vcpu->fpu_active)
-               kvm_load_guest_fpu(vcpu);
+       kvm_load_guest_fpu(vcpu);
+
+       /*
+        * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
+        * IPIs are then delayed until after guest entry, which ensures that
+        * they result in virtual interrupt delivery.
+        */
+       local_irq_disable();
        vcpu->mode = IN_GUEST_MODE;
 
        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
        /*
-        * We should set ->mode before check ->requests,
-        * Please see the comment in kvm_make_all_cpus_request.
-        * This also orders the write to mode from any reads
-        * to the page tables done while the VCPU is running.
-        * Please see the comment in kvm_flush_remote_tlbs.
+        * 1) We should set ->mode before checking ->requests.  Please see
+        * the comment in kvm_make_all_cpus_request.
+        *
+        * 2) For APICv, we should set ->mode before checking PIR.ON.  This
+        * pairs with the memory barrier implicit in pi_test_and_set_on
+        * (see vmx_deliver_posted_interrupt).
+        *
+        * 3) This also orders the write to mode from any reads to the page
+        * tables done while the VCPU is running.  Please see the comment
+        * in kvm_flush_remote_tlbs.
         */
        smp_mb__after_srcu_read_unlock();
 
-       local_irq_disable();
+       /*
+        * This handles the case where a posted interrupt was
+        * notified with kvm_vcpu_kick.
+        */
+       if (kvm_lapic_enabled(vcpu)) {
+               if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
+                       kvm_x86_ops->sync_pir_to_irr(vcpu);
+       }
 
        if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
            || need_resched() || signal_pending(current)) {
@@ -7023,6 +7024,9 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
+       if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
+               kvm_x86_ops->check_nested_events(vcpu, false);
+
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
 }
@@ -7194,7 +7198,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        } else
                WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
-       r = vcpu_run(vcpu);
+       if (kvm_run->immediate_exit)
+               r = -EINTR;
+       else
+               r = vcpu_run(vcpu);
 
 out:
        post_kvm_run_save(vcpu);
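
The hunk above makes KVM_RUN return -EINTR when run->immediate_exit is set.  A
minimal userspace sketch of the intended use follows; it is not part of this
patch, it assumes headers containing the KVM_CAP_IMMEDIATE_EXIT and struct
kvm_run changes from include/uapi/linux/kvm.h below, and error handling is
kept minimal:

	#include <errno.h>
	#include <fcntl.h>
	#include <linux/kvm.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
		if (kvm < 0) {
			perror("open /dev/kvm");
			return 1;
		}
		if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_IMMEDIATE_EXIT) <= 0) {
			fprintf(stderr, "KVM_CAP_IMMEDIATE_EXIT not supported\n");
			return 1;
		}

		int vm = ioctl(kvm, KVM_CREATE_VM, 0);
		int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
		long mmap_size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
		struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
					   MAP_SHARED, vcpu, 0);
		if (run == MAP_FAILED) {
			perror("mmap kvm_run");
			return 1;
		}

		/*
		 * A real VMM would set this from a signal handler to "kick" the
		 * vcpu thread out of KVM_RUN without KVM_SET_SIGNAL_MASK; setting
		 * it up front is enough to demonstrate the -EINTR path.
		 */
		run->immediate_exit = 1;

		int r = ioctl(vcpu, KVM_RUN, 0);
		printf("KVM_RUN returned %d (%s), expected -1 with errno EINTR\n",
		       r, strerror(errno));
		return 0;
	}
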
@@ -8389,9 +8396,6 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
-               kvm_x86_ops->check_nested_events(vcpu, false);
-
        return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
 }
 
@@ -8528,9 +8532,8 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 
 static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
 {
-
-       return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
-                                     sizeof(val));
+       return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val,
+                                          sizeof(val));
 }
 
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
index 0a54e83..09b4df7 100644 (file)
@@ -176,12 +176,19 @@ static void __exit ptp_kvm_exit(void)
 
 static int __init ptp_kvm_init(void)
 {
+       long ret;
+
        clock_pair_gpa = slow_virt_to_phys(&clock_pair);
        hv_clock = pvclock_pvti_cpu0_va();
 
        if (!hv_clock)
                return -ENODEV;
 
+       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
+                       KVM_CLOCK_PAIRING_WALLCLOCK);
+       if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
+               return -ENODEV;
+
        kvm_ptp_clock.caps = ptp_kvm_caps;
 
        kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL);
index cda457b..8d69d51 100644 (file)
@@ -221,7 +221,6 @@ struct kvm_vcpu {
        struct mutex mutex;
        struct kvm_run *run;
 
-       int fpu_active;
        int guest_fpu_loaded, guest_xcr0_loaded;
        struct swait_queue_head wq;
        struct pid *pid;
@@ -641,18 +640,18 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
                          unsigned long len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
-int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-                          void *data, unsigned long len);
+int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+                              void *data, unsigned long len);
 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
                         int offset, int len);
 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
                    unsigned long len);
-int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-                          void *data, unsigned long len);
-int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-                          void *data, int offset, unsigned long len);
-int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-                             gpa_t gpa, unsigned long len);
+int kvm_vcpu_write_guest_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
+                               void *data, unsigned long len);
+int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
+                                      void *data, int offset, unsigned long len);
+int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
+                                  gpa_t gpa, unsigned long len);
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
index 7964b97..f51d508 100644 (file)
@@ -218,7 +218,8 @@ struct kvm_hyperv_exit {
 struct kvm_run {
        /* in */
        __u8 request_interrupt_window;
-       __u8 padding1[7];
+       __u8 immediate_exit;
+       __u8 padding1[6];
 
        /* out */
        __u32 exit_reason;
@@ -881,6 +882,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SPAPR_RESIZE_HPT 133
 #define KVM_CAP_PPC_MMU_RADIX 134
 #define KVM_CAP_PPC_MMU_HASH_V3 135
+#define KVM_CAP_IMMEDIATE_EXIT 136
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index 482612b..cc4d6e0 100644 (file)
@@ -506,11 +506,6 @@ static struct kvm_memslots *kvm_alloc_memslots(void)
        if (!slots)
                return NULL;
 
-       /*
-        * Init kvm generation close to the maximum to easily test the
-        * code of handling generation number wrap-around.
-        */
-       slots->generation = -150;
        for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
                slots->id_to_index[i] = slots->memslots[i].id = i;
 
@@ -641,9 +636,16 @@ static struct kvm *kvm_create_vm(unsigned long type)
 
        r = -ENOMEM;
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
-               kvm->memslots[i] = kvm_alloc_memslots();
-               if (!kvm->memslots[i])
+               struct kvm_memslots *slots = kvm_alloc_memslots();
+               if (!slots)
                        goto out_err_no_srcu;
+               /*
+                * Generations must be different for each address space.
+                * Init kvm generation close to the maximum to easily test the
+                * code that handles generation number wrap-around.
+                */
+               slots->generation = i * 2 - 150;
+               rcu_assign_pointer(kvm->memslots[i], slots);
        }
 
        if (init_srcu_struct(&kvm->srcu))
@@ -870,8 +872,14 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
         * Increment the new memslot generation a second time. This prevents
         * vm exits that race with memslot updates from caching a memslot
         * generation that will (potentially) be valid forever.
+        *
+        * Generations must be unique even across address spaces.  We do not need
+        * a global counter for that; instead the generation space is evenly split
+        * across address spaces.  For example, with two address spaces, address
+        * space 0 will use generations 0, 4, 8, ... while address space 1 will
+        * use generations 2, 6, 10, 14, ...
         */
-       slots->generation++;
+       slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1;
 
        kvm_arch_memslots_updated(kvm, slots);
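
As a quick check of the arithmetic described in the comment above, the
following standalone sketch (not kernel code) reproduces the per-address-space
generation sequences, assuming KVM_ADDRESS_SPACE_NUM == 2 and ignoring the
"-150" wrap-around-test offset applied in kvm_create_vm:

	#include <stdio.h>

	#define KVM_ADDRESS_SPACE_NUM 2

	int main(void)
	{
		for (int as = 0; as < KVM_ADDRESS_SPACE_NUM; as++) {
			unsigned long generation = as * 2;	/* initial value per address space */

			printf("address space %d: %lu", as, generation);
			for (int update = 0; update < 4; update++) {
				/*
				 * install_new_memslots bumps the generation once
				 * before publishing the new memslots (not shown in
				 * this hunk) and by KVM_ADDRESS_SPACE_NUM * 2 - 1
				 * afterwards, i.e. by KVM_ADDRESS_SPACE_NUM * 2 per
				 * update in total.
				 */
				generation += 1;
				generation += KVM_ADDRESS_SPACE_NUM * 2 - 1;
				printf(", %lu", generation);
			}
			printf("\n");
		}
		/* Prints 0, 4, 8, ... for address space 0 and 2, 6, 10, ... for 1. */
		return 0;
	}
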
 
@@ -1094,37 +1102,31 @@ int kvm_get_dirty_log(struct kvm *kvm,
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
-       int r, i, as_id, id;
+       int i, as_id, id;
        unsigned long n;
        unsigned long any = 0;
 
-       r = -EINVAL;
        as_id = log->slot >> 16;
        id = (u16)log->slot;
        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
-               goto out;
+               return -EINVAL;
 
        slots = __kvm_memslots(kvm, as_id);
        memslot = id_to_memslot(slots, id);
-       r = -ENOENT;
        if (!memslot->dirty_bitmap)
-               goto out;
+               return -ENOENT;
 
        n = kvm_dirty_bitmap_bytes(memslot);
 
        for (i = 0; !any && i < n/sizeof(long); ++i)
                any = memslot->dirty_bitmap[i];
 
-       r = -EFAULT;
        if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
-               goto out;
+               return -EFAULT;
 
        if (any)
                *is_dirty = 1;
-
-       r = 0;
-out:
-       return r;
+       return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 
@@ -1156,24 +1158,22 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
-       int r, i, as_id, id;
+       int i, as_id, id;
        unsigned long n;
        unsigned long *dirty_bitmap;
        unsigned long *dirty_bitmap_buffer;
 
-       r = -EINVAL;
        as_id = log->slot >> 16;
        id = (u16)log->slot;
        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
-               goto out;
+               return -EINVAL;
 
        slots = __kvm_memslots(kvm, as_id);
        memslot = id_to_memslot(slots, id);
 
        dirty_bitmap = memslot->dirty_bitmap;
-       r = -ENOENT;
        if (!dirty_bitmap)
-               goto out;
+               return -ENOENT;
 
        n = kvm_dirty_bitmap_bytes(memslot);
 
@@ -1202,14 +1202,9 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
        }
 
        spin_unlock(&kvm->mmu_lock);
-
-       r = -EFAULT;
        if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
-               goto out;
-
-       r = 0;
-out:
-       return r;
+               return -EFAULT;
+       return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
 #endif
@@ -1937,10 +1932,10 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
 
-int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-                             gpa_t gpa, unsigned long len)
+static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
+                                      struct gfn_to_hva_cache *ghc,
+                                      gpa_t gpa, unsigned long len)
 {
-       struct kvm_memslots *slots = kvm_memslots(kvm);
        int offset = offset_in_page(gpa);
        gfn_t start_gfn = gpa >> PAGE_SHIFT;
        gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
@@ -1950,7 +1945,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
        ghc->gpa = gpa;
        ghc->generation = slots->generation;
        ghc->len = len;
-       ghc->memslot = gfn_to_memslot(kvm, start_gfn);
+       ghc->memslot = __gfn_to_memslot(slots, start_gfn);
        ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
        if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
                ghc->hva += offset;
@@ -1960,7 +1955,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                 * verify that the entire region is valid here.
                 */
                while (start_gfn <= end_gfn) {
-                       ghc->memslot = gfn_to_memslot(kvm, start_gfn);
+                       ghc->memslot = __gfn_to_memslot(slots, start_gfn);
                        ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
                                                   &nr_pages_avail);
                        if (kvm_is_error_hva(ghc->hva))
@@ -1972,22 +1967,29 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
        }
        return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
 
-int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-                          void *data, int offset, unsigned long len)
+int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+                             gpa_t gpa, unsigned long len)
 {
-       struct kvm_memslots *slots = kvm_memslots(kvm);
+       struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
+       return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva_cache_init);
+
+int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+                                      void *data, int offset, unsigned long len)
+{
+       struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
        int r;
        gpa_t gpa = ghc->gpa + offset;
 
        BUG_ON(len + offset > ghc->len);
 
        if (slots->generation != ghc->generation)
-               kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len);
+               __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
 
        if (unlikely(!ghc->memslot))
-               return kvm_write_guest(kvm, gpa, data, len);
+               return kvm_vcpu_write_guest(vcpu, gpa, data, len);
 
        if (kvm_is_error_hva(ghc->hva))
                return -EFAULT;
@@ -1999,28 +2001,28 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
+EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_offset_cached);
 
-int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-                          void *data, unsigned long len)
+int kvm_vcpu_write_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+                              void *data, unsigned long len)
 {
-       return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
+       return kvm_vcpu_write_guest_offset_cached(vcpu, ghc, data, 0, len);
 }
-EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
+EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_cached);
 
-int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-                          void *data, unsigned long len)
+int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+                              void *data, unsigned long len)
 {
-       struct kvm_memslots *slots = kvm_memslots(kvm);
+       struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
        int r;
 
        BUG_ON(len > ghc->len);
 
        if (slots->generation != ghc->generation)
-               kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len);
+               __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
 
        if (unlikely(!ghc->memslot))
-               return kvm_read_guest(kvm, ghc->gpa, data, len);
+               return kvm_vcpu_read_guest(vcpu, ghc->gpa, data, len);
 
        if (kvm_is_error_hva(ghc->hva))
                return -EFAULT;
@@ -2031,7 +2033,7 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
+EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_cached);
 
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
 {
@@ -3133,10 +3135,9 @@ static long kvm_vm_compat_ioctl(struct file *filp,
                struct compat_kvm_dirty_log compat_log;
                struct kvm_dirty_log log;
 
-               r = -EFAULT;
                if (copy_from_user(&compat_log, (void __user *)arg,
                                   sizeof(compat_log)))
-                       goto out;
+                       return -EFAULT;
                log.slot         = compat_log.slot;
                log.padding1     = compat_log.padding1;
                log.padding2     = compat_log.padding2;
@@ -3148,8 +3149,6 @@ static long kvm_vm_compat_ioctl(struct file *filp,
        default:
                r = kvm_vm_ioctl(filp, ioctl, arg);
        }
-
-out:
        return r;
 }
 #endif