Merge tag 'noinstr-x86-kvm-2020-05-16' of git://git.kernel.org/pub/scm/linux/kernel...
author	Paolo Bonzini <pbonzini@redhat.com>
Wed, 20 May 2020 07:40:09 +0000 (03:40 -0400)
committer	Paolo Bonzini <pbonzini@redhat.com>
Wed, 20 May 2020 07:40:09 +0000 (03:40 -0400)
65 files changed:
Documentation/virt/kvm/api.rst
Documentation/virt/kvm/cpuid.rst
Documentation/virt/kvm/nested-vmx.rst
arch/arm64/include/asm/kvm_host.h
arch/arm64/kvm/guest.c
arch/mips/include/asm/kvm_host.h
arch/mips/kvm/mips.c
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/powerpc.c
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/kvm-s390.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/vmx.h
arch/x86/include/uapi/asm/vmx.h
arch/x86/kvm/cpuid.c
arch/x86/kvm/emulate.c
arch/x86/kvm/hyperv.c
arch/x86/kvm/irq.c
arch/x86/kvm/kvm_cache_regs.h
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/page_track.c
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/mmu_audit.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/trace.h
arch/x86/kvm/vmx/evmcs.c
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/nested.h
arch/x86/kvm/vmx/ops.h
arch/x86/kvm/vmx/vmcs12.c
arch/x86/kvm/vmx/vmcs12.h
arch/x86/kvm/vmx/vmenter.S
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
include/linux/kvm_host.h
include/linux/rcuwait.h
include/uapi/linux/kvm.h
kernel/exit.c
tools/kvm/kvm_stat/kvm_stat
tools/kvm/kvm_stat/kvm_stat.service [new file with mode: 0644]
tools/kvm/kvm_stat/kvm_stat.txt
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/lib/kvm_util_internal.h
tools/testing/selftests/kvm/lib/s390x/processor.c
tools/testing/selftests/kvm/set_memory_region_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/debug_regs.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/set_memory_region_test.c [deleted file]
virt/kvm/arm/arch_timer.c
virt/kvm/arm/arm.c
virt/kvm/async_pf.c
virt/kvm/eventfd.c
virt/kvm/kvm_main.c

index efbbe57..d871dac 100644 (file)
@@ -5802,6 +5802,23 @@ If present, this capability can be enabled for a VM, meaning that KVM
 will allow the transition to secure guest mode.  Otherwise KVM will
 veto the transition.
 
+7.20 KVM_CAP_HALT_POLL
+----------------------
+
+:Architectures: all
+:Target: VM
+:Parameters: args[0] is the maximum poll time in nanoseconds
+:Returns: 0 on success; -1 on error
+
+This capability overrides the kvm module parameter halt_poll_ns for the
+target VM.
+
+VCPU polling allows a VCPU to poll for wakeup events instead of immediately
+scheduling during guest halts. The maximum time a VCPU can spend polling is
+controlled by the kvm module parameter halt_poll_ns. This capability allows
+the maximum halt time to be specified on a per-VM basis, effectively overriding
+the module parameter for the target VM.
+
 8. Other capabilities.
 ======================
 
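For reference, userspace turns on this per-VM override by calling KVM_ENABLE_CAP on the VM file descriptor. A minimal sketch (error handling omitted; the 200000 ns limit is an arbitrary example value):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Cap halt polling for this VM at 200 us. */
    static int set_vm_halt_poll_ns(int vm_fd, unsigned long long ns)
    {
            struct kvm_enable_cap cap;

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_HALT_POLL;    /* args[0] = max poll time in ns */
            cap.args[0] = ns;

            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }

Passing 0 for args[0] disables halt polling for the VM, mirroring the semantics of the halt_poll_ns module parameter.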
index 01b081f..f721c89 100644 (file)
@@ -50,8 +50,8 @@ KVM_FEATURE_NOP_IO_DELAY          1           not necessary to perform delays
 KVM_FEATURE_MMU_OP                2           deprecated
 
 KVM_FEATURE_CLOCKSOURCE2          3           kvmclock available at msrs
-
                                               0x4b564d00 and 0x4b564d01
+
 KVM_FEATURE_ASYNC_PF              4           async pf can be enabled by
                                               writing to msr 0x4b564d02
 
index 592b0ab..89851cb 100644 (file)
@@ -116,10 +116,7 @@ struct shadow_vmcs is ever changed.
                natural_width cr4_guest_host_mask;
                natural_width cr0_read_shadow;
                natural_width cr4_read_shadow;
-               natural_width cr3_target_value0;
-               natural_width cr3_target_value1;
-               natural_width cr3_target_value2;
-               natural_width cr3_target_value3;
+               natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. */
                natural_width exit_qualification;
                natural_width guest_linear_address;
                natural_width guest_cr0;
index 32c8a67..3833736 100644 (file)
@@ -415,6 +415,8 @@ struct kvm_vm_stat {
 struct kvm_vcpu_stat {
        u64 halt_successful_poll;
        u64 halt_attempted_poll;
+       u64 halt_poll_success_ns;
+       u64 halt_poll_fail_ns;
        u64 halt_poll_invalid;
        u64 halt_wakeup;
        u64 hvc_exit_stat;
index 50a279d..55ebb9e 100644 (file)
 
 #include "trace.h"
 
-#define VM_STAT(x) { #x, offsetof(struct kvm, stat.x), KVM_STAT_VM }
-#define VCPU_STAT(x) { #x, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU }
-
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-       VCPU_STAT(halt_successful_poll),
-       VCPU_STAT(halt_attempted_poll),
-       VCPU_STAT(halt_poll_invalid),
-       VCPU_STAT(halt_wakeup),
-       VCPU_STAT(hvc_exit_stat),
-       VCPU_STAT(wfe_exit_stat),
-       VCPU_STAT(wfi_exit_stat),
-       VCPU_STAT(mmio_exit_user),
-       VCPU_STAT(mmio_exit_kernel),
-       VCPU_STAT(exits),
+       VCPU_STAT("halt_successful_poll", halt_successful_poll),
+       VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+       VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+       VCPU_STAT("halt_wakeup", halt_wakeup),
+       VCPU_STAT("hvc_exit_stat", hvc_exit_stat),
+       VCPU_STAT("wfe_exit_stat", wfe_exit_stat),
+       VCPU_STAT("wfi_exit_stat", wfi_exit_stat),
+       VCPU_STAT("mmio_exit_user", mmio_exit_user),
+       VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel),
+       VCPU_STAT("exits", exits),
+       VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+       VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
        { NULL }
 };
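The open-coded VM_STAT/VCPU_STAT macros deleted here (and in the other architectures below) are replaced by common definitions in include/linux/kvm_host.h, which take the debugfs name and the stat field explicitly. A sketch of what they presumably look like, written to match the call sites in this diff (including the optional .mode override used by powerpc further down):

    #define VM_STAT(n, x, ...) \
            { n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ }
    #define VCPU_STAT(n, x, ...) \
            { n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ }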
 
index 2c343c3..e28b5a9 100644 (file)
@@ -174,6 +174,8 @@ struct kvm_vcpu_stat {
 #endif
        u64 halt_successful_poll;
        u64 halt_attempted_poll;
+       u64 halt_poll_success_ns;
+       u64 halt_poll_fail_ns;
        u64 halt_poll_invalid;
        u64 halt_wakeup;
 };
index 8f05dd0..99ed08a 100644 (file)
 #define VECTORSPACING 0x100    /* for EI/VI mode */
 #endif
 
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x)
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-       { "wait",         VCPU_STAT(wait_exits),         KVM_STAT_VCPU },
-       { "cache",        VCPU_STAT(cache_exits),        KVM_STAT_VCPU },
-       { "signal",       VCPU_STAT(signal_exits),       KVM_STAT_VCPU },
-       { "interrupt",    VCPU_STAT(int_exits),          KVM_STAT_VCPU },
-       { "cop_unusable", VCPU_STAT(cop_unusable_exits), KVM_STAT_VCPU },
-       { "tlbmod",       VCPU_STAT(tlbmod_exits),       KVM_STAT_VCPU },
-       { "tlbmiss_ld",   VCPU_STAT(tlbmiss_ld_exits),   KVM_STAT_VCPU },
-       { "tlbmiss_st",   VCPU_STAT(tlbmiss_st_exits),   KVM_STAT_VCPU },
-       { "addrerr_st",   VCPU_STAT(addrerr_st_exits),   KVM_STAT_VCPU },
-       { "addrerr_ld",   VCPU_STAT(addrerr_ld_exits),   KVM_STAT_VCPU },
-       { "syscall",      VCPU_STAT(syscall_exits),      KVM_STAT_VCPU },
-       { "resvd_inst",   VCPU_STAT(resvd_inst_exits),   KVM_STAT_VCPU },
-       { "break_inst",   VCPU_STAT(break_inst_exits),   KVM_STAT_VCPU },
-       { "trap_inst",    VCPU_STAT(trap_inst_exits),    KVM_STAT_VCPU },
-       { "msa_fpe",      VCPU_STAT(msa_fpe_exits),      KVM_STAT_VCPU },
-       { "fpe",          VCPU_STAT(fpe_exits),          KVM_STAT_VCPU },
-       { "msa_disabled", VCPU_STAT(msa_disabled_exits), KVM_STAT_VCPU },
-       { "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU },
+       VCPU_STAT("wait", wait_exits),
+       VCPU_STAT("cache", cache_exits),
+       VCPU_STAT("signal", signal_exits),
+       VCPU_STAT("interrupt", int_exits),
+       VCPU_STAT("cop_unusable", cop_unusable_exits),
+       VCPU_STAT("tlbmod", tlbmod_exits),
+       VCPU_STAT("tlbmiss_ld", tlbmiss_ld_exits),
+       VCPU_STAT("tlbmiss_st", tlbmiss_st_exits),
+       VCPU_STAT("addrerr_st", addrerr_st_exits),
+       VCPU_STAT("addrerr_ld", addrerr_ld_exits),
+       VCPU_STAT("syscall", syscall_exits),
+       VCPU_STAT("resvd_inst", resvd_inst_exits),
+       VCPU_STAT("break_inst", break_inst_exits),
+       VCPU_STAT("trap_inst", trap_inst_exits),
+       VCPU_STAT("msa_fpe", msa_fpe_exits),
+       VCPU_STAT("fpe", fpe_exits),
+       VCPU_STAT("msa_disabled", msa_disabled_exits),
+       VCPU_STAT("flush_dcache", flush_dcache_exits),
 #ifdef CONFIG_KVM_MIPS_VZ
-       { "vz_gpsi",      VCPU_STAT(vz_gpsi_exits),      KVM_STAT_VCPU },
-       { "vz_gsfc",      VCPU_STAT(vz_gsfc_exits),      KVM_STAT_VCPU },
-       { "vz_hc",        VCPU_STAT(vz_hc_exits),        KVM_STAT_VCPU },
-       { "vz_grr",       VCPU_STAT(vz_grr_exits),       KVM_STAT_VCPU },
-       { "vz_gva",       VCPU_STAT(vz_gva_exits),       KVM_STAT_VCPU },
-       { "vz_ghfc",      VCPU_STAT(vz_ghfc_exits),      KVM_STAT_VCPU },
-       { "vz_gpa",       VCPU_STAT(vz_gpa_exits),       KVM_STAT_VCPU },
-       { "vz_resvd",     VCPU_STAT(vz_resvd_exits),     KVM_STAT_VCPU },
+       VCPU_STAT("vz_gpsi", vz_gpsi_exits),
+       VCPU_STAT("vz_gsfc", vz_gsfc_exits),
+       VCPU_STAT("vz_hc", vz_hc_exits),
+       VCPU_STAT("vz_grr", vz_grr_exits),
+       VCPU_STAT("vz_gva", vz_gva_exits),
+       VCPU_STAT("vz_ghfc", vz_ghfc_exits),
+       VCPU_STAT("vz_gpa", vz_gpa_exits),
+       VCPU_STAT("vz_resvd", vz_resvd_exits),
 #endif
-       { "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU },
-       { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), KVM_STAT_VCPU },
-       { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid), KVM_STAT_VCPU },
-       { "halt_wakeup",  VCPU_STAT(halt_wakeup),        KVM_STAT_VCPU },
+       VCPU_STAT("halt_successful_poll", halt_successful_poll),
+       VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+       VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+       VCPU_STAT("halt_wakeup", halt_wakeup),
+       VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+       VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
        {NULL}
 };
 
@@ -284,8 +285,7 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer)
        kvm_mips_callbacks->queue_timer_int(vcpu);
 
        vcpu->arch.wait = 0;
-       if (swq_has_sleeper(&vcpu->wq))
-               swake_up_one(&vcpu->wq);
+       rcuwait_wake_up(&vcpu->wait);
 
        return kvm_mips_count_timeout(vcpu);
 }
@@ -439,8 +439,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
        return -ENOIOCTLCMD;
 }
 
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
+       struct kvm_run *run = vcpu->run;
        int r = -EINTR;
 
        vcpu_load(vcpu);
@@ -511,8 +512,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 
        dvcpu->arch.wait = 0;
 
-       if (swq_has_sleeper(&dvcpu->wq))
-               swake_up_one(&dvcpu->wq);
+       rcuwait_wake_up(&dvcpu->wait);
 
        return 0;
 }
index 506e4df..6e5d85b 100644 (file)
@@ -78,7 +78,7 @@ struct kvmppc_vcore {
        struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS];
        struct list_head preempt_list;
        spinlock_t lock;
-       struct swait_queue_head wq;
+       struct rcuwait wait;
        spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
        u64 stolen_tb;
        u64 preempt_tb;
index 1dc6310..337047b 100644 (file)
@@ -751,7 +751,7 @@ struct kvm_vcpu_arch {
        u8 irq_pending; /* Used by XIVE to signal pending guest irqs */
        u32 last_inst;
 
-       struct swait_queue_head *wqp;
+       struct rcuwait *waitp;
        struct kvmppc_vcore *vcore;
        int ret;
        int trap;
index 5690a1f..37508a3 100644 (file)
 #include "book3s.h"
 #include "trace.h"
 
-#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
-#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
-
 /* #define EXIT_DEBUG */
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-       { "exits",       VCPU_STAT(sum_exits) },
-       { "mmio",        VCPU_STAT(mmio_exits) },
-       { "sig",         VCPU_STAT(signal_exits) },
-       { "sysc",        VCPU_STAT(syscall_exits) },
-       { "inst_emu",    VCPU_STAT(emulated_inst_exits) },
-       { "dec",         VCPU_STAT(dec_exits) },
-       { "ext_intr",    VCPU_STAT(ext_intr_exits) },
-       { "queue_intr",  VCPU_STAT(queue_intr) },
-       { "halt_poll_success_ns",       VCPU_STAT(halt_poll_success_ns) },
-       { "halt_poll_fail_ns",          VCPU_STAT(halt_poll_fail_ns) },
-       { "halt_wait_ns",               VCPU_STAT(halt_wait_ns) },
-       { "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
-       { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), },
-       { "halt_successful_wait",       VCPU_STAT(halt_successful_wait) },
-       { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
-       { "halt_wakeup", VCPU_STAT(halt_wakeup) },
-       { "pf_storage",  VCPU_STAT(pf_storage) },
-       { "sp_storage",  VCPU_STAT(sp_storage) },
-       { "pf_instruc",  VCPU_STAT(pf_instruc) },
-       { "sp_instruc",  VCPU_STAT(sp_instruc) },
-       { "ld",          VCPU_STAT(ld) },
-       { "ld_slow",     VCPU_STAT(ld_slow) },
-       { "st",          VCPU_STAT(st) },
-       { "st_slow",     VCPU_STAT(st_slow) },
-       { "pthru_all",       VCPU_STAT(pthru_all) },
-       { "pthru_host",      VCPU_STAT(pthru_host) },
-       { "pthru_bad_aff",   VCPU_STAT(pthru_bad_aff) },
-       { "largepages_2M",    VM_STAT(num_2M_pages, .mode = 0444) },
-       { "largepages_1G",    VM_STAT(num_1G_pages, .mode = 0444) },
+       VCPU_STAT("exits", sum_exits),
+       VCPU_STAT("mmio", mmio_exits),
+       VCPU_STAT("sig", signal_exits),
+       VCPU_STAT("sysc", syscall_exits),
+       VCPU_STAT("inst_emu", emulated_inst_exits),
+       VCPU_STAT("dec", dec_exits),
+       VCPU_STAT("ext_intr", ext_intr_exits),
+       VCPU_STAT("queue_intr", queue_intr),
+       VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+       VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+       VCPU_STAT("halt_wait_ns", halt_wait_ns),
+       VCPU_STAT("halt_successful_poll", halt_successful_poll),
+       VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+       VCPU_STAT("halt_successful_wait", halt_successful_wait),
+       VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+       VCPU_STAT("halt_wakeup", halt_wakeup),
+       VCPU_STAT("pf_storage", pf_storage),
+       VCPU_STAT("sp_storage", sp_storage),
+       VCPU_STAT("pf_instruc", pf_instruc),
+       VCPU_STAT("sp_instruc", sp_instruc),
+       VCPU_STAT("ld", ld),
+       VCPU_STAT("ld_slow", ld_slow),
+       VCPU_STAT("st", st),
+       VCPU_STAT("st_slow", st_slow),
+       VCPU_STAT("pthru_all", pthru_all),
+       VCPU_STAT("pthru_host", pthru_host),
+       VCPU_STAT("pthru_bad_aff", pthru_bad_aff),
+       VM_STAT("largepages_2M", num_2M_pages, .mode = 0444),
+       VM_STAT("largepages_1G", num_1G_pages, .mode = 0444),
        { NULL }
 };
 
index 93493f0..7f59c47 100644 (file)
@@ -230,13 +230,11 @@ static bool kvmppc_ipi_thread(int cpu)
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
        int cpu;
-       struct swait_queue_head *wqp;
+       struct rcuwait *waitp;
 
-       wqp = kvm_arch_vcpu_wq(vcpu);
-       if (swq_has_sleeper(wqp)) {
-               swake_up_one(wqp);
+       waitp = kvm_arch_vcpu_get_wait(vcpu);
+       if (rcuwait_wake_up(waitp))
                ++vcpu->stat.halt_wakeup;
-       }
 
        cpu = READ_ONCE(vcpu->arch.thread_cpu);
        if (cpu >= 0 && kvmppc_ipi_thread(cpu))
@@ -2125,7 +2123,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id)
 
        spin_lock_init(&vcore->lock);
        spin_lock_init(&vcore->stoltb_lock);
-       init_swait_queue_head(&vcore->wq);
+       rcuwait_init(&vcore->wait);
        vcore->preempt_tb = TB_NIL;
        vcore->lpcr = kvm->arch.lpcr;
        vcore->first_vcpuid = id;
@@ -3784,7 +3782,6 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
        ktime_t cur, start_poll, start_wait;
        int do_sleep = 1;
        u64 block_ns;
-       DECLARE_SWAITQUEUE(wait);
 
        /* Poll for pending exceptions and ceded state */
        cur = start_poll = ktime_get();
@@ -3812,10 +3809,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
                }
        }
 
-       prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE);
-
+       prepare_to_rcuwait(&vc->wait);
+       set_current_state(TASK_INTERRUPTIBLE);
        if (kvmppc_vcore_check_block(vc)) {
-               finish_swait(&vc->wq, &wait);
+               finish_rcuwait(&vc->wait);
                do_sleep = 0;
                /* If we polled, count this as a successful poll */
                if (vc->halt_poll_ns)
@@ -3829,7 +3826,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
        trace_kvmppc_vcore_blocked(vc, 0);
        spin_unlock(&vc->lock);
        schedule();
-       finish_swait(&vc->wq, &wait);
+       finish_rcuwait(&vc->wait);
        spin_lock(&vc->lock);
        vc->vcore_state = VCORE_INACTIVE;
        trace_kvmppc_vcore_blocked(vc, 1);
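The swait-to-rcuwait conversion in this hunk follows the generic pattern from include/linux/rcuwait.h: the sleeper publishes itself with prepare_to_rcuwait(), sets its task state, re-checks the wake condition, schedules, and tears down with finish_rcuwait(); the waker only calls rcuwait_wake_up(), whose return value says whether a sleeper was actually woken (used earlier in this file to bump halt_wakeup). A condensed sketch of the two sides, with a caller-provided "done" flag standing in for the vcore checks:

    /* Sleeper: wait until *done becomes true. */
    static void example_wait(struct rcuwait *w, bool *done)
    {
            prepare_to_rcuwait(w);
            set_current_state(TASK_INTERRUPTIBLE);
            if (!READ_ONCE(*done))
                    schedule();
            finish_rcuwait(w);      /* clears the task pointer, restores TASK_RUNNING */
    }

    /* Waker: set the condition, then wake whoever is published in w. */
    static void example_wake(struct rcuwait *w, bool *done)
    {
            WRITE_ONCE(*done, true);
            rcuwait_wake_up(w);
    }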
@@ -3940,7 +3937,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                        kvmppc_start_thread(vcpu, vc);
                        trace_kvm_guest_enter(vcpu);
                } else if (vc->vcore_state == VCORE_SLEEPING) {
-                       swake_up_one(&vc->wq);
+                       rcuwait_wake_up(&vc->wait);
                }
 
        }
@@ -4279,7 +4276,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
        }
        user_vrsave = mfspr(SPRN_VRSAVE);
 
-       vcpu->arch.wqp = &vcpu->arch.vcore->wq;
+       vcpu->arch.waitp = &vcpu->arch.vcore->wait;
        vcpu->arch.pgdir = kvm->mm->pgd;
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
 
index 6c18ea8..888afe8 100644 (file)
 
 unsigned long kvmppc_booke_handlers;
 
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
-
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-       { "mmio",       VCPU_STAT(mmio_exits) },
-       { "sig",        VCPU_STAT(signal_exits) },
-       { "itlb_r",     VCPU_STAT(itlb_real_miss_exits) },
-       { "itlb_v",     VCPU_STAT(itlb_virt_miss_exits) },
-       { "dtlb_r",     VCPU_STAT(dtlb_real_miss_exits) },
-       { "dtlb_v",     VCPU_STAT(dtlb_virt_miss_exits) },
-       { "sysc",       VCPU_STAT(syscall_exits) },
-       { "isi",        VCPU_STAT(isi_exits) },
-       { "dsi",        VCPU_STAT(dsi_exits) },
-       { "inst_emu",   VCPU_STAT(emulated_inst_exits) },
-       { "dec",        VCPU_STAT(dec_exits) },
-       { "ext_intr",   VCPU_STAT(ext_intr_exits) },
-       { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
-       { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
-       { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
-       { "halt_wakeup", VCPU_STAT(halt_wakeup) },
-       { "doorbell", VCPU_STAT(dbell_exits) },
-       { "guest doorbell", VCPU_STAT(gdbell_exits) },
-       { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+       VCPU_STAT("mmio", mmio_exits),
+       VCPU_STAT("sig", signal_exits),
+       VCPU_STAT("itlb_r", itlb_real_miss_exits),
+       VCPU_STAT("itlb_v", itlb_virt_miss_exits),
+       VCPU_STAT("dtlb_r", dtlb_real_miss_exits),
+       VCPU_STAT("dtlb_v", dtlb_virt_miss_exits),
+       VCPU_STAT("sysc", syscall_exits),
+       VCPU_STAT("isi", isi_exits),
+       VCPU_STAT("dsi", dsi_exits),
+       VCPU_STAT("inst_emu", emulated_inst_exits),
+       VCPU_STAT("dec", dec_exits),
+       VCPU_STAT("ext_intr", ext_intr_exits),
+       VCPU_STAT("halt_successful_poll", halt_successful_poll),
+       VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+       VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+       VCPU_STAT("halt_wakeup", halt_wakeup),
+       VCPU_STAT("doorbell", dbell_exits),
+       VCPU_STAT("guest doorbell", gdbell_exits),
+       VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+       VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+       VM_STAT("remote_tlb_flush", remote_tlb_flush),
        { NULL }
 };
 
index ad2f172..27ccff6 100644 (file)
@@ -752,7 +752,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        if (err)
                goto out_vcpu_uninit;
 
-       vcpu->arch.wqp = &vcpu->wq;
+       vcpu->arch.waitp = &vcpu->wait;
        kvmppc_create_vcpu_debugfs(vcpu, vcpu->vcpu_id);
        return 0;
 
@@ -1765,8 +1765,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
        return r;
 }
 
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
+       struct kvm_run *run = vcpu->run;
        int r;
 
        vcpu_load(vcpu);
index d6bcd34..176f74c 100644 (file)
@@ -375,6 +375,8 @@ struct kvm_vcpu_stat {
        u64 halt_poll_invalid;
        u64 halt_no_poll_steal;
        u64 halt_wakeup;
+       u64 halt_poll_success_ns;
+       u64 halt_poll_fail_ns;
        u64 instruction_lctl;
        u64 instruction_lctlg;
        u64 instruction_stctl;
index d05bb04..a560a36 100644 (file)
 #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
                           (KVM_MAX_VCPUS + LOCAL_IRQS))
 
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-       { "userspace_handled", VCPU_STAT(exit_userspace) },
-       { "exit_null", VCPU_STAT(exit_null) },
-       { "exit_validity", VCPU_STAT(exit_validity) },
-       { "exit_stop_request", VCPU_STAT(exit_stop_request) },
-       { "exit_external_request", VCPU_STAT(exit_external_request) },
-       { "exit_io_request", VCPU_STAT(exit_io_request) },
-       { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) },
-       { "exit_instruction", VCPU_STAT(exit_instruction) },
-       { "exit_pei", VCPU_STAT(exit_pei) },
-       { "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
-       { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
-       { "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
-       { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
-       { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
-       { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
-       { "halt_no_poll_steal", VCPU_STAT(halt_no_poll_steal) },
-       { "halt_wakeup", VCPU_STAT(halt_wakeup) },
-       { "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
-       { "instruction_lctl", VCPU_STAT(instruction_lctl) },
-       { "instruction_stctl", VCPU_STAT(instruction_stctl) },
-       { "instruction_stctg", VCPU_STAT(instruction_stctg) },
-       { "deliver_ckc", VCPU_STAT(deliver_ckc) },
-       { "deliver_cputm", VCPU_STAT(deliver_cputm) },
-       { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
-       { "deliver_external_call", VCPU_STAT(deliver_external_call) },
-       { "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
-       { "deliver_virtio", VCPU_STAT(deliver_virtio) },
-       { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
-       { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) },
-       { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
-       { "deliver_program", VCPU_STAT(deliver_program) },
-       { "deliver_io", VCPU_STAT(deliver_io) },
-       { "deliver_machine_check", VCPU_STAT(deliver_machine_check) },
-       { "exit_wait_state", VCPU_STAT(exit_wait_state) },
-       { "inject_ckc", VCPU_STAT(inject_ckc) },
-       { "inject_cputm", VCPU_STAT(inject_cputm) },
-       { "inject_external_call", VCPU_STAT(inject_external_call) },
-       { "inject_float_mchk", VM_STAT(inject_float_mchk) },
-       { "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) },
-       { "inject_io", VM_STAT(inject_io) },
-       { "inject_mchk", VCPU_STAT(inject_mchk) },
-       { "inject_pfault_done", VM_STAT(inject_pfault_done) },
-       { "inject_program", VCPU_STAT(inject_program) },
-       { "inject_restart", VCPU_STAT(inject_restart) },
-       { "inject_service_signal", VM_STAT(inject_service_signal) },
-       { "inject_set_prefix", VCPU_STAT(inject_set_prefix) },
-       { "inject_stop_signal", VCPU_STAT(inject_stop_signal) },
-       { "inject_pfault_init", VCPU_STAT(inject_pfault_init) },
-       { "inject_virtio", VM_STAT(inject_virtio) },
-       { "instruction_epsw", VCPU_STAT(instruction_epsw) },
-       { "instruction_gs", VCPU_STAT(instruction_gs) },
-       { "instruction_io_other", VCPU_STAT(instruction_io_other) },
-       { "instruction_lpsw", VCPU_STAT(instruction_lpsw) },
-       { "instruction_lpswe", VCPU_STAT(instruction_lpswe) },
-       { "instruction_pfmf", VCPU_STAT(instruction_pfmf) },
-       { "instruction_ptff", VCPU_STAT(instruction_ptff) },
-       { "instruction_stidp", VCPU_STAT(instruction_stidp) },
-       { "instruction_sck", VCPU_STAT(instruction_sck) },
-       { "instruction_sckpf", VCPU_STAT(instruction_sckpf) },
-       { "instruction_spx", VCPU_STAT(instruction_spx) },
-       { "instruction_stpx", VCPU_STAT(instruction_stpx) },
-       { "instruction_stap", VCPU_STAT(instruction_stap) },
-       { "instruction_iske", VCPU_STAT(instruction_iske) },
-       { "instruction_ri", VCPU_STAT(instruction_ri) },
-       { "instruction_rrbe", VCPU_STAT(instruction_rrbe) },
-       { "instruction_sske", VCPU_STAT(instruction_sske) },
-       { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) },
-       { "instruction_essa", VCPU_STAT(instruction_essa) },
-       { "instruction_stsi", VCPU_STAT(instruction_stsi) },
-       { "instruction_stfl", VCPU_STAT(instruction_stfl) },
-       { "instruction_tb", VCPU_STAT(instruction_tb) },
-       { "instruction_tpi", VCPU_STAT(instruction_tpi) },
-       { "instruction_tprot", VCPU_STAT(instruction_tprot) },
-       { "instruction_tsch", VCPU_STAT(instruction_tsch) },
-       { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
-       { "instruction_sie", VCPU_STAT(instruction_sie) },
-       { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
-       { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
-       { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
-       { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) },
-       { "instruction_sigp_cond_emergency", VCPU_STAT(instruction_sigp_cond_emergency) },
-       { "instruction_sigp_start", VCPU_STAT(instruction_sigp_start) },
-       { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
-       { "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) },
-       { "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) },
-       { "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) },
-       { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
-       { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) },
-       { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) },
-       { "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) },
-       { "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) },
-       { "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) },
-       { "instruction_diag_10", VCPU_STAT(diagnose_10) },
-       { "instruction_diag_44", VCPU_STAT(diagnose_44) },
-       { "instruction_diag_9c", VCPU_STAT(diagnose_9c) },
-       { "diag_9c_ignored", VCPU_STAT(diagnose_9c_ignored) },
-       { "instruction_diag_258", VCPU_STAT(diagnose_258) },
-       { "instruction_diag_308", VCPU_STAT(diagnose_308) },
-       { "instruction_diag_500", VCPU_STAT(diagnose_500) },
-       { "instruction_diag_other", VCPU_STAT(diagnose_other) },
+       VCPU_STAT("userspace_handled", exit_userspace),
+       VCPU_STAT("exit_null", exit_null),
+       VCPU_STAT("exit_validity", exit_validity),
+       VCPU_STAT("exit_stop_request", exit_stop_request),
+       VCPU_STAT("exit_external_request", exit_external_request),
+       VCPU_STAT("exit_io_request", exit_io_request),
+       VCPU_STAT("exit_external_interrupt", exit_external_interrupt),
+       VCPU_STAT("exit_instruction", exit_instruction),
+       VCPU_STAT("exit_pei", exit_pei),
+       VCPU_STAT("exit_program_interruption", exit_program_interruption),
+       VCPU_STAT("exit_instr_and_program_int", exit_instr_and_program),
+       VCPU_STAT("exit_operation_exception", exit_operation_exception),
+       VCPU_STAT("halt_successful_poll", halt_successful_poll),
+       VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+       VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+       VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal),
+       VCPU_STAT("halt_wakeup", halt_wakeup),
+       VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+       VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+       VCPU_STAT("instruction_lctlg", instruction_lctlg),
+       VCPU_STAT("instruction_lctl", instruction_lctl),
+       VCPU_STAT("instruction_stctl", instruction_stctl),
+       VCPU_STAT("instruction_stctg", instruction_stctg),
+       VCPU_STAT("deliver_ckc", deliver_ckc),
+       VCPU_STAT("deliver_cputm", deliver_cputm),
+       VCPU_STAT("deliver_emergency_signal", deliver_emergency_signal),
+       VCPU_STAT("deliver_external_call", deliver_external_call),
+       VCPU_STAT("deliver_service_signal", deliver_service_signal),
+       VCPU_STAT("deliver_virtio", deliver_virtio),
+       VCPU_STAT("deliver_stop_signal", deliver_stop_signal),
+       VCPU_STAT("deliver_prefix_signal", deliver_prefix_signal),
+       VCPU_STAT("deliver_restart_signal", deliver_restart_signal),
+       VCPU_STAT("deliver_program", deliver_program),
+       VCPU_STAT("deliver_io", deliver_io),
+       VCPU_STAT("deliver_machine_check", deliver_machine_check),
+       VCPU_STAT("exit_wait_state", exit_wait_state),
+       VCPU_STAT("inject_ckc", inject_ckc),
+       VCPU_STAT("inject_cputm", inject_cputm),
+       VCPU_STAT("inject_external_call", inject_external_call),
+       VM_STAT("inject_float_mchk", inject_float_mchk),
+       VCPU_STAT("inject_emergency_signal", inject_emergency_signal),
+       VM_STAT("inject_io", inject_io),
+       VCPU_STAT("inject_mchk", inject_mchk),
+       VM_STAT("inject_pfault_done", inject_pfault_done),
+       VCPU_STAT("inject_program", inject_program),
+       VCPU_STAT("inject_restart", inject_restart),
+       VM_STAT("inject_service_signal", inject_service_signal),
+       VCPU_STAT("inject_set_prefix", inject_set_prefix),
+       VCPU_STAT("inject_stop_signal", inject_stop_signal),
+       VCPU_STAT("inject_pfault_init", inject_pfault_init),
+       VM_STAT("inject_virtio", inject_virtio),
+       VCPU_STAT("instruction_epsw", instruction_epsw),
+       VCPU_STAT("instruction_gs", instruction_gs),
+       VCPU_STAT("instruction_io_other", instruction_io_other),
+       VCPU_STAT("instruction_lpsw", instruction_lpsw),
+       VCPU_STAT("instruction_lpswe", instruction_lpswe),
+       VCPU_STAT("instruction_pfmf", instruction_pfmf),
+       VCPU_STAT("instruction_ptff", instruction_ptff),
+       VCPU_STAT("instruction_stidp", instruction_stidp),
+       VCPU_STAT("instruction_sck", instruction_sck),
+       VCPU_STAT("instruction_sckpf", instruction_sckpf),
+       VCPU_STAT("instruction_spx", instruction_spx),
+       VCPU_STAT("instruction_stpx", instruction_stpx),
+       VCPU_STAT("instruction_stap", instruction_stap),
+       VCPU_STAT("instruction_iske", instruction_iske),
+       VCPU_STAT("instruction_ri", instruction_ri),
+       VCPU_STAT("instruction_rrbe", instruction_rrbe),
+       VCPU_STAT("instruction_sske", instruction_sske),
+       VCPU_STAT("instruction_ipte_interlock", instruction_ipte_interlock),
+       VCPU_STAT("instruction_essa", instruction_essa),
+       VCPU_STAT("instruction_stsi", instruction_stsi),
+       VCPU_STAT("instruction_stfl", instruction_stfl),
+       VCPU_STAT("instruction_tb", instruction_tb),
+       VCPU_STAT("instruction_tpi", instruction_tpi),
+       VCPU_STAT("instruction_tprot", instruction_tprot),
+       VCPU_STAT("instruction_tsch", instruction_tsch),
+       VCPU_STAT("instruction_sthyi", instruction_sthyi),
+       VCPU_STAT("instruction_sie", instruction_sie),
+       VCPU_STAT("instruction_sigp_sense", instruction_sigp_sense),
+       VCPU_STAT("instruction_sigp_sense_running", instruction_sigp_sense_running),
+       VCPU_STAT("instruction_sigp_external_call", instruction_sigp_external_call),
+       VCPU_STAT("instruction_sigp_emergency", instruction_sigp_emergency),
+       VCPU_STAT("instruction_sigp_cond_emergency", instruction_sigp_cond_emergency),
+       VCPU_STAT("instruction_sigp_start", instruction_sigp_start),
+       VCPU_STAT("instruction_sigp_stop", instruction_sigp_stop),
+       VCPU_STAT("instruction_sigp_stop_store_status", instruction_sigp_stop_store_status),
+       VCPU_STAT("instruction_sigp_store_status", instruction_sigp_store_status),
+       VCPU_STAT("instruction_sigp_store_adtl_status", instruction_sigp_store_adtl_status),
+       VCPU_STAT("instruction_sigp_set_arch", instruction_sigp_arch),
+       VCPU_STAT("instruction_sigp_set_prefix", instruction_sigp_prefix),
+       VCPU_STAT("instruction_sigp_restart", instruction_sigp_restart),
+       VCPU_STAT("instruction_sigp_cpu_reset", instruction_sigp_cpu_reset),
+       VCPU_STAT("instruction_sigp_init_cpu_reset", instruction_sigp_init_cpu_reset),
+       VCPU_STAT("instruction_sigp_unknown", instruction_sigp_unknown),
+       VCPU_STAT("instruction_diag_10", diagnose_10),
+       VCPU_STAT("instruction_diag_44", diagnose_44),
+       VCPU_STAT("instruction_diag_9c", diagnose_9c),
+       VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
+       VCPU_STAT("instruction_diag_258", diagnose_258),
+       VCPU_STAT("instruction_diag_308", diagnose_308),
+       VCPU_STAT("instruction_diag_500", diagnose_500),
+       VCPU_STAT("instruction_diag_other", diagnose_other),
        { NULL }
 };
 
@@ -4337,8 +4336,9 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                store_regs_fmt2(vcpu, kvm_run);
 }
 
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
+       struct kvm_run *kvm_run = vcpu->run;
        int rc;
 
        if (kvm_run->immediate_exit)
index 0dea9f1..fd78bd4 100644 (file)
@@ -83,6 +83,9 @@
 #define KVM_REQ_GET_VMCS12_PAGES       KVM_ARCH_REQ(24)
 #define KVM_REQ_APICV_UPDATE \
        KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_TLB_FLUSH_CURRENT      KVM_ARCH_REQ(26)
+#define KVM_REQ_HV_TLB_FLUSH \
+       KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
 
 #define CR0_RESERVED_BITS                                               \
        (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
 #define UNMAPPED_GVA (~(gpa_t)0)
 
 /* KVM Hugepage definitions for x86 */
-enum {
-       PT_PAGE_TABLE_LEVEL   = 1,
-       PT_DIRECTORY_LEVEL    = 2,
-       PT_PDPE_LEVEL         = 3,
-       /* set max level to the biggest one */
-       PT_MAX_HUGEPAGE_LEVEL = PT_PDPE_LEVEL,
-};
-#define KVM_NR_PAGE_SIZES      (PT_MAX_HUGEPAGE_LEVEL - \
-                                PT_PAGE_TABLE_LEVEL + 1)
+#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G
+#define KVM_NR_PAGE_SIZES      (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1)
 #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
 #define KVM_HPAGE_SHIFT(x)     (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
 #define KVM_HPAGE_SIZE(x)      (1UL << KVM_HPAGE_SHIFT(x))
@@ -124,7 +120,7 @@ enum {
 
 static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
 {
-       /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
+       /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
        return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
                (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 }
@@ -164,9 +160,13 @@ enum kvm_reg {
        NR_VCPU_REGS,
 
        VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+       VCPU_EXREG_CR0,
        VCPU_EXREG_CR3,
+       VCPU_EXREG_CR4,
        VCPU_EXREG_RFLAGS,
        VCPU_EXREG_SEGMENTS,
+       VCPU_EXREG_EXIT_INFO_1,
+       VCPU_EXREG_EXIT_INFO_2,
 };
 
 enum {
@@ -182,8 +182,10 @@ enum {
 
 enum exit_fastpath_completion {
        EXIT_FASTPATH_NONE,
-       EXIT_FASTPATH_SKIP_EMUL_INS,
+       EXIT_FASTPATH_REENTER_GUEST,
+       EXIT_FASTPATH_EXIT_HANDLED,
 };
+typedef enum exit_fastpath_completion fastpath_t;
 
 struct x86_emulate_ctxt;
 struct x86_exception;
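With ->run() now returning the fastpath verdict instead of reporting it through a side channel, a vendor exit handler can short-circuit the normal dispatch when the exit was already serviced on the way out of the guest; EXIT_FASTPATH_REENTER_GUEST additionally lets the vendor run loop go straight back into the guest. An illustrative sketch of the consumer side (not the exact vmx/svm code):

    static int example_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
    {
            /* Already handled while still in the vendor run loop: just resume. */
            if (exit_fastpath != EXIT_FASTPATH_NONE)
                    return 1;

            /* ... regular exit-reason dispatch goes here ... */
            return 1;
    }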
@@ -372,12 +374,12 @@ struct rsvd_bits_validate {
 };
 
 struct kvm_mmu_root_info {
-       gpa_t cr3;
+       gpa_t pgd;
        hpa_t hpa;
 };
 
 #define KVM_MMU_ROOT_INFO_INVALID \
-       ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE })
+       ((struct kvm_mmu_root_info) { .pgd = INVALID_PAGE, .hpa = INVALID_PAGE })
 
 #define KVM_MMU_NUM_PREV_ROOTS 3
 
@@ -403,7 +405,7 @@ struct kvm_mmu {
        void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                           u64 *spte, const void *pte);
        hpa_t root_hpa;
-       gpa_t root_cr3;
+       gpa_t root_pgd;
        union kvm_mmu_role mmu_role;
        u8 root_level;
        u8 shadow_root_level;
@@ -578,6 +580,7 @@ struct kvm_vcpu_arch {
        unsigned long cr4;
        unsigned long cr4_guest_owned_bits;
        unsigned long cr8;
+       u32 host_pkru;
        u32 pkru;
        u32 hflags;
        u64 efer;
@@ -649,7 +652,6 @@ struct kvm_vcpu_arch {
 
        u64 xcr0;
        u64 guest_supported_xcr0;
-       u32 guest_xstate_size;
 
        struct kvm_pio_request pio;
        void *pio_data;
@@ -679,6 +681,7 @@ struct kvm_vcpu_arch {
        struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
 
        int maxphyaddr;
+       int tdp_level;
 
        /* emulate context */
 
@@ -702,6 +705,7 @@ struct kvm_vcpu_arch {
                struct gfn_to_pfn_cache cache;
        } st;
 
+       u64 l1_tsc_offset;
        u64 tsc_offset;
        u64 last_guest_tsc;
        u64 last_host_tsc;
@@ -761,7 +765,7 @@ struct kvm_vcpu_arch {
 
        struct {
                bool halted;
-               gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
+               gfn_t gfns[ASYNC_PF_PER_VCPU];
                struct gfn_to_hva_cache data;
                u64 msr_val;
                u32 id;
@@ -1027,6 +1031,8 @@ struct kvm_vcpu_stat {
        u64 irq_injections;
        u64 nmi_injections;
        u64 req_event;
+       u64 halt_poll_success_ns;
+       u64 halt_poll_fail_ns;
 };
 
 struct x86_instruction_info;
@@ -1084,8 +1090,6 @@ struct kvm_x86_ops {
        void (*set_segment)(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
        void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
-       void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
-       void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
        void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
        int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
        void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
@@ -1093,15 +1097,14 @@ struct kvm_x86_ops {
        void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
        void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
        void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
-       u64 (*get_dr6)(struct kvm_vcpu *vcpu);
-       void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
        void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
        void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
        void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
        unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
        void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 
-       void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa);
+       void (*tlb_flush_all)(struct kvm_vcpu *vcpu);
+       void (*tlb_flush_current)(struct kvm_vcpu *vcpu);
        int  (*tlb_remote_flush)(struct kvm *kvm);
        int  (*tlb_remote_flush_with_range)(struct kvm *kvm,
                        struct kvm_tlb_range *range);
@@ -1114,7 +1117,13 @@ struct kvm_x86_ops {
         */
        void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
 
-       void (*run)(struct kvm_vcpu *vcpu);
+       /*
+        * Flush any TLB entries created by the guest.  Like tlb_flush_gva(),
+        * does not need to flush GPA->HPA mappings.
+        */
+       void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
+
+       enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
        int (*handle_exit)(struct kvm_vcpu *vcpu,
                enum exit_fastpath_completion exit_fastpath);
        int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
@@ -1127,8 +1136,8 @@ struct kvm_x86_ops {
        void (*set_nmi)(struct kvm_vcpu *vcpu);
        void (*queue_exception)(struct kvm_vcpu *vcpu);
        void (*cancel_injection)(struct kvm_vcpu *vcpu);
-       int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
-       int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+       bool (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
+       bool (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
        bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
        void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
        void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
@@ -1142,7 +1151,7 @@ struct kvm_x86_ops {
        bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
        void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
        void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
-       void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
+       void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
        int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
        int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
        int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
@@ -1154,7 +1163,6 @@ struct kvm_x86_ops {
 
        bool (*has_wbinvd_exit)(void);
 
-       u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
        /* Returns actual tsc_offset set in active VMCS */
        u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
@@ -1164,10 +1172,8 @@ struct kvm_x86_ops {
                               struct x86_instruction_info *info,
                               enum x86_intercept_stage stage,
                               struct x86_exception *exception);
-       void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu,
-               enum exit_fastpath_completion *exit_fastpath);
+       void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
 
-       int (*check_nested_events)(struct kvm_vcpu *vcpu);
        void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
 
        void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
@@ -1200,6 +1206,7 @@ struct kvm_x86_ops {
 
        /* pmu operations of sub-arch */
        const struct kvm_pmu_ops *pmu_ops;
+       const struct kvm_x86_nested_ops *nested_ops;
 
        /*
         * Architecture specific hooks for vCPU blocking due to
@@ -1227,15 +1234,7 @@ struct kvm_x86_ops {
 
        void (*setup_mce)(struct kvm_vcpu *vcpu);
 
-       int (*get_nested_state)(struct kvm_vcpu *vcpu,
-                               struct kvm_nested_state __user *user_kvm_nested_state,
-                               unsigned user_data_size);
-       int (*set_nested_state)(struct kvm_vcpu *vcpu,
-                               struct kvm_nested_state __user *user_kvm_nested_state,
-                               struct kvm_nested_state *kvm_state);
-       bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
-
-       int (*smi_allowed)(struct kvm_vcpu *vcpu);
+       bool (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
        int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
        int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
        int (*enable_smi_window)(struct kvm_vcpu *vcpu);
@@ -1246,14 +1245,28 @@ struct kvm_x86_ops {
 
        int (*get_msr_feature)(struct kvm_msr_entry *entry);
 
-       int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu,
-                                  uint16_t *vmcs_version);
-       uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu);
-
        bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
 
        bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
        int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
+
+       void (*migrate_timers)(struct kvm_vcpu *vcpu);
+};
+
+struct kvm_x86_nested_ops {
+       int (*check_events)(struct kvm_vcpu *vcpu);
+       bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+       int (*get_state)(struct kvm_vcpu *vcpu,
+                        struct kvm_nested_state __user *user_kvm_nested_state,
+                        unsigned user_data_size);
+       int (*set_state)(struct kvm_vcpu *vcpu,
+                        struct kvm_nested_state __user *user_kvm_nested_state,
+                        struct kvm_nested_state *kvm_state);
+       bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+
+       int (*enable_evmcs)(struct kvm_vcpu *vcpu,
+                           uint16_t *vmcs_version);
+       uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
 };
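The nested-virtualization callbacks that used to sit directly in kvm_x86_ops are now grouped behind nested_ops, so each vendor module exports a single instance. A sketch of how the VMX side presumably wires it up (the function names here are assumptions, not taken from this diff):

    struct kvm_x86_nested_ops vmx_nested_ops = {
            .check_events           = vmx_check_nested_events,
            .hv_timer_pending       = nested_vmx_preemption_timer_pending,
            .get_state              = vmx_get_nested_state,
            .set_state              = vmx_set_nested_state,
            .get_vmcs12_pages       = nested_get_vmcs12_pages,
            .enable_evmcs           = nested_enable_evmcs,
            .get_evmcs_version      = nested_get_evmcs_version,
    };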
 
 struct kvm_x86_init_ops {
@@ -1449,9 +1462,12 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu);
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload);
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
+bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+                                   struct x86_exception *fault);
 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                            gfn_t gfn, void *data, int offset, int len,
                            u32 access);
@@ -1509,8 +1525,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
                       void *insn, int insn_len);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+                           gva_t gva, hpa_t root_hpa);
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush);
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
+                    bool skip_mmu_sync);
 
 void kvm_configure_mmu(bool enable_tdp, int tdp_page_level);
 
index 5e090d1..cd7de4b 100644 (file)
@@ -527,10 +527,12 @@ struct vmx_msr_entry {
 /*
  * Exit Qualifications for entry failure during or after loading guest state
  */
-#define ENTRY_FAIL_DEFAULT             0
-#define ENTRY_FAIL_PDPTE               2
-#define ENTRY_FAIL_NMI                 3
-#define ENTRY_FAIL_VMCS_LINK_PTR       4
+enum vm_entry_failure_code {
+       ENTRY_FAIL_DEFAULT              = 0,
+       ENTRY_FAIL_PDPTE                = 2,
+       ENTRY_FAIL_NMI                  = 3,
+       ENTRY_FAIL_VMCS_LINK_PTR        = 4,
+};
 
 /*
  * Exit Qualifications for EPT Violations
index e95b72e..b8ff9e8 100644 (file)
        { EXIT_REASON_UMWAIT,                "UMWAIT" }, \
        { EXIT_REASON_TPAUSE,                "TPAUSE" }
 
+#define VMX_EXIT_REASON_FLAGS \
+       { VMX_EXIT_REASONS_FAILED_VMENTRY,      "FAILED_VMENTRY" }
+
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL       2
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL         4
index 901cd1f..cd708b0 100644 (file)
@@ -86,12 +86,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
        best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
        if (!best) {
                vcpu->arch.guest_supported_xcr0 = 0;
-               vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
        } else {
                vcpu->arch.guest_supported_xcr0 =
                        (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
-               vcpu->arch.guest_xstate_size = best->ebx =
-                       xstate_required_size(vcpu->arch.xcr0, false);
+               best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
        }
 
        best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
@@ -124,8 +122,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
                                           MSR_IA32_MISC_ENABLE_MWAIT);
        }
 
-       /* Update physical-address width */
+       /* Note, maxphyaddr must be updated before tdp_level. */
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+       vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu);
        kvm_mmu_reset_context(vcpu);
 
        kvm_pmu_refresh(vcpu);
@@ -728,6 +727,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                cpuid_entry_override(entry, CPUID_8000_0001_EDX);
                cpuid_entry_override(entry, CPUID_8000_0001_ECX);
                break;
+       case 0x80000006:
+               /* L2 cache and TLB: pass through host info. */
+               break;
        case 0x80000007: /* Advanced power management */
                /* invariant TSC is CPUID.80000007H:EDX[8] */
                entry->edx &= (1 << 8);
index bddaba9..de5476f 100644 (file)
@@ -5798,6 +5798,8 @@ writeback:
        }
 
        ctxt->eip = ctxt->_eip;
+       if (ctxt->mode != X86EMUL_MODE_PROT64)
+               ctxt->eip = (u32)ctxt->_eip;
 
 done:
        if (rc == X86EMUL_PROPAGATE_FAULT) {
index bcefa9d..f9d3b91 100644 (file)
@@ -1425,9 +1425,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
         * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
         * analyze it here, flush TLB regardless of the specified address space.
         */
-       kvm_make_vcpus_request_mask(kvm,
-                                   KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
-                                   vcpu_mask, &hv_vcpu->tlb_flush);
+       kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH,
+                                   NULL, vcpu_mask, &hv_vcpu->tlb_flush);
 
 ret_success:
        /* We always do full TLB flush, set rep_done = rep_cnt. */
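Hyper-V TLB flush hypercalls now raise their own request instead of piggybacking on KVM_REQ_TLB_FLUSH, which lets the vCPU entry path map them to the new tlb_flush_guest hook (guest-created mappings only, no GPA->HPA flush). A sketch of the consumer side, assuming the usual request-servicing spot in vcpu_enter_guest(); the exact x86.c helper may differ:

    static void example_service_hv_tlb_flush(struct kvm_vcpu *vcpu)
    {
            if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
                    kvm_x86_ops.tlb_flush_guest(vcpu);
    }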
@@ -1800,8 +1799,8 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
        };
        int i, nent = ARRAY_SIZE(cpuid_entries);
 
-       if (kvm_x86_ops.nested_get_evmcs_version)
-               evmcs_ver = kvm_x86_ops.nested_get_evmcs_version(vcpu);
+       if (kvm_x86_ops.nested_ops->get_evmcs_version)
+               evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu);
 
        /* Skip NESTED_FEATURES if eVMCS is not supported */
        if (!evmcs_ver)
index e330e7d..54f7ea6 100644 (file)
@@ -159,6 +159,8 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
 {
        __kvm_migrate_apic_timer(vcpu);
        __kvm_migrate_pit_timer(vcpu);
+       if (kvm_x86_ops.migrate_timers)
+               kvm_x86_ops.migrate_timers(vcpu);
 }
 
 bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
index 62558b9..ff2d0e9 100644 (file)
@@ -116,8 +116,9 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
 static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
 {
        ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
-       if (tmask & vcpu->arch.cr0_guest_owned_bits)
-               kvm_x86_ops.decache_cr0_guest_bits(vcpu);
+       if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
+           !kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
+               kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR0);
        return vcpu->arch.cr0 & mask;
 }
 
@@ -129,8 +130,9 @@ static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
 static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
 {
        ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
-       if (tmask & vcpu->arch.cr4_guest_owned_bits)
-               kvm_x86_ops.decache_cr4_guest_bits(vcpu);
+       if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
+           !kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
+               kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR4);
        return vcpu->arch.cr4 & mask;
 }
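The decache_cr0/cr4_guest_bits hooks are gone: guest-owned CR0/CR4 bits are now pulled in through the generic ->cache_reg() path, gated on the register-availability bitmap via the new VCPU_EXREG_CR0/CR4 entries. For context, the availability check used above is the existing helper in this header, roughly:

    static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
                                                 enum kvm_reg reg)
    {
            return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
    }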
 
index 9af25c9..2a3b574 100644 (file)
@@ -110,11 +110,18 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
        return apic->vcpu->vcpu_id;
 }
 
-bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
+static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
 {
        return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_can_post_timer_interrupt);
+
+bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
+{
+       return kvm_x86_ops.set_hv_timer
+              && !(kvm_mwait_in_guest(vcpu->kvm) ||
+                   kvm_can_post_timer_interrupt(vcpu));
+}
+EXPORT_SYMBOL_GPL(kvm_can_use_hv_timer);
 
 static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
 {
@@ -1593,7 +1600,7 @@ static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
        }
 }
 
-static void apic_timer_expired(struct kvm_lapic *apic)
+static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
 {
        struct kvm_vcpu *vcpu = apic->vcpu;
        struct kvm_timer *ktimer = &apic->lapic_timer;
@@ -1604,6 +1611,12 @@ static void apic_timer_expired(struct kvm_lapic *apic)
        if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
                ktimer->expired_tscdeadline = ktimer->tscdeadline;
 
+       if (!from_timer_fn && vcpu->arch.apicv_active) {
+               WARN_ON(kvm_get_running_vcpu() != vcpu);
+               kvm_apic_inject_pending_timer_irqs(apic);
+               return;
+       }
+
        if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
                if (apic->lapic_timer.timer_advance_ns)
                        __kvm_wait_lapic_expire(vcpu);
@@ -1643,18 +1656,23 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
                expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
                hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD);
        } else
-               apic_timer_expired(apic);
+               apic_timer_expired(apic, false);
 
        local_irq_restore(flags);
 }
 
+static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict)
+{
+       return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count;
+}
+
 static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
 {
        ktime_t now, remaining;
        u64 ns_remaining_old, ns_remaining_new;
 
-       apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
-               * APIC_BUS_CYCLE_NS * apic->divide_count;
+       apic->lapic_timer.period =
+                       tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
        limit_periodic_timer_frequency(apic);
 
        now = ktime_get();
@@ -1672,14 +1690,15 @@ static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_diviso
        apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
 }
 
-static bool set_target_expiration(struct kvm_lapic *apic)
+static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
 {
        ktime_t now;
        u64 tscl = rdtsc();
+       s64 deadline;
 
        now = ktime_get();
-       apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
-               * APIC_BUS_CYCLE_NS * apic->divide_count;
+       apic->lapic_timer.period =
+                       tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
 
        if (!apic->lapic_timer.period) {
                apic->lapic_timer.tscdeadline = 0;
@@ -1687,10 +1706,32 @@ static bool set_target_expiration(struct kvm_lapic *apic)
        }
 
        limit_periodic_timer_frequency(apic);
+       deadline = apic->lapic_timer.period;
+
+       if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
+               if (unlikely(count_reg != APIC_TMICT)) {
+                       deadline = tmict_to_ns(apic,
+                                    kvm_lapic_get_reg(apic, count_reg));
+                       if (unlikely(deadline <= 0))
+                               deadline = apic->lapic_timer.period;
+                       else if (unlikely(deadline > apic->lapic_timer.period)) {
+                               pr_info_ratelimited(
+                                   "kvm: vcpu %i: requested lapic timer restore with "
+                                   "starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
+                                   "Using initial count to start timer.\n",
+                                   apic->vcpu->vcpu_id,
+                                   count_reg,
+                                   kvm_lapic_get_reg(apic, count_reg),
+                                   deadline, apic->lapic_timer.period);
+                               kvm_lapic_set_reg(apic, count_reg, 0);
+                               deadline = apic->lapic_timer.period;
+                       }
+               }
+       }
 
        apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
-               nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
-       apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period);
+               nsec_to_cycles(apic->vcpu, deadline);
+       apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline);
 
        return true;
 }
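
The restore path above boils down to simple scaling: tmict_to_ns() multiplies the count register by APIC_BUS_CYCLE_NS and the configured divider, and a saved current count that is positive and no larger than the period becomes the new deadline. A worked sketch of that arithmetic with made-up numbers (APIC_BUS_CYCLE_NS == 1 is assumed here, per lapic.h of this era):

/* Illustrative only; mirrors tmict_to_ns() with hypothetical values. */
#include <stdint.h>

static uint64_t example_tmict_to_ns(uint32_t count, uint32_t divide_count)
{
        return (uint64_t)count * 1 /* APIC_BUS_CYCLE_NS */ * divide_count;
}

/*
 * With divide_count == 16: TMICT = 1000000 -> 16 ms period, and a saved
 * TMCCT of 250000 -> 4 ms, which is used as the restore deadline because
 * it does not exceed the period.
 */
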
@@ -1723,7 +1764,7 @@ static void start_sw_period(struct kvm_lapic *apic)
 
        if (ktime_after(ktime_get(),
                        apic->lapic_timer.target_expiration)) {
-               apic_timer_expired(apic);
+               apic_timer_expired(apic, false);
 
                if (apic_lvtt_oneshot(apic))
                        return;
@@ -1760,7 +1801,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
        bool expired;
 
        WARN_ON(preemptible());
-       if (!kvm_x86_ops.set_hv_timer)
+       if (!kvm_can_use_hv_timer(vcpu))
                return false;
 
        if (!ktimer->tscdeadline)
@@ -1785,7 +1826,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
                if (atomic_read(&ktimer->pending)) {
                        cancel_hv_timer(apic);
                } else if (expired) {
-                       apic_timer_expired(apic);
+                       apic_timer_expired(apic, false);
                        cancel_hv_timer(apic);
                }
        }
@@ -1833,9 +1874,9 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
        /* If the preempt notifier has already run, it also called apic_timer_expired */
        if (!apic->lapic_timer.hv_timer_in_use)
                goto out;
-       WARN_ON(swait_active(&vcpu->wq));
+       WARN_ON(rcuwait_active(&vcpu->wait));
        cancel_hv_timer(apic);
-       apic_timer_expired(apic);
+       apic_timer_expired(apic, false);
 
        if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
                advance_periodic_target_expiration(apic);
@@ -1872,17 +1913,22 @@ void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
        restart_apic_timer(apic);
 }
 
-static void start_apic_timer(struct kvm_lapic *apic)
+static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg)
 {
        atomic_set(&apic->lapic_timer.pending, 0);
 
        if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
-           && !set_target_expiration(apic))
+           && !set_target_expiration(apic, count_reg))
                return;
 
        restart_apic_timer(apic);
 }
 
+static void start_apic_timer(struct kvm_lapic *apic)
+{
+       __start_apic_timer(apic, APIC_TMICT);
+}
+
 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
 {
        bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val);
@@ -2336,7 +2382,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
        struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
        struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
 
-       apic_timer_expired(apic);
+       apic_timer_expired(apic, true);
 
        if (lapic_is_periodic(apic)) {
                advance_periodic_target_expiration(apic);
@@ -2493,6 +2539,14 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
 int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 {
        memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+
+       /*
+        * Get calculated timer current count for remaining timer period (if
+        * any) and store it in the returned register set.
+        */
+       __kvm_lapic_set_reg(s->regs, APIC_TMCCT,
+                           __apic_read(vcpu->arch.apic, APIC_TMCCT));
+
        return kvm_apic_state_fixup(vcpu, s, false);
 }
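
Filling APIC_TMCCT on read pairs with the KVM_SET_LAPIC change below, so the remaining timer period survives an ordinary save/restore of LAPIC state. A minimal userspace sketch, assuming vCPU file descriptors that already exist (hypothetical helper, error handling omitted):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helper: carry LAPIC state, including TMCCT, across a restore. */
static void migrate_lapic(int src_vcpu_fd, int dst_vcpu_fd)
{
        struct kvm_lapic_state lapic;

        /* TMCCT in the returned registers now reflects the remaining period. */
        ioctl(src_vcpu_fd, KVM_GET_LAPIC, &lapic);

        /* KVM_SET_LAPIC restarts the timer from TMCCT rather than TMICT. */
        ioctl(dst_vcpu_fd, KVM_SET_LAPIC, &lapic);
}
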
 
@@ -2520,7 +2574,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
        apic_update_lvtt(apic);
        apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
        update_divide_count(apic);
-       start_apic_timer(apic);
+       __start_apic_timer(apic, APIC_TMCCT);
        kvm_apic_update_apicv(vcpu);
        apic->highest_isr_cache = -1;
        if (vcpu->arch.apicv_active) {
arch/x86/kvm/lapic.h
index a0ffb43..754f29b 100644 (file)
@@ -161,9 +161,14 @@ static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
        return *((u32 *) (apic->regs + reg_off));
 }
 
+static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
+{
+       *((u32 *) (regs + reg_off)) = val;
+}
+
 static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
 {
-       *((u32 *) (apic->regs + reg_off)) = val;
+       __kvm_lapic_set_reg(apic->regs, reg_off, val);
 }
 
 extern struct static_key kvm_no_apic_vcpu;
@@ -245,7 +250,7 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
-bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu);
+bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu);
 
 static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
 {
arch/x86/kvm/mmu/mmu.c
index dd900a6..d93cb3a 100644 (file)
@@ -78,6 +78,9 @@ module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
                &nx_huge_pages_recovery_ratio, 0644);
 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
 
+static bool __read_mostly force_flush_and_sync_on_reuse;
+module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
+
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
  * where the hardware walks 2 page tables:
@@ -620,7 +623,7 @@ static int is_large_pte(u64 pte)
 
 static int is_last_spte(u64 pte, int level)
 {
-       if (level == PT_PAGE_TABLE_LEVEL)
+       if (level == PG_LEVEL_4K)
                return 1;
        if (is_large_pte(pte))
                return 1;
@@ -1196,7 +1199,7 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
        struct kvm_lpage_info *linfo;
        int i;
 
-       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+       for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
                linfo = lpage_info_slot(gfn, slot, i);
                linfo->disallow_lpage += count;
                WARN_ON(linfo->disallow_lpage < 0);
@@ -1225,7 +1228,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
        slot = __gfn_to_memslot(slots, gfn);
 
        /* the non-leaf shadow pages are keeping readonly. */
-       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+       if (sp->role.level > PG_LEVEL_4K)
                return kvm_slot_page_track_add_page(kvm, slot, gfn,
                                                    KVM_PAGE_TRACK_WRITE);
 
@@ -1253,7 +1256,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);
-       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+       if (sp->role.level > PG_LEVEL_4K)
                return kvm_slot_page_track_remove_page(kvm, slot, gfn,
                                                       KVM_PAGE_TRACK_WRITE);
 
@@ -1398,7 +1401,7 @@ static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
        unsigned long idx;
 
        idx = gfn_to_index(gfn, slot->base_gfn, level);
-       return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
+       return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
 }
 
 static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
@@ -1529,8 +1532,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
 {
        if (is_large_pte(*sptep)) {
-               WARN_ON(page_header(__pa(sptep))->role.level ==
-                       PT_PAGE_TABLE_LEVEL);
+               WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K);
                drop_spte(kvm, sptep);
                --kvm->stat.lpages;
                return true;
@@ -1682,7 +1684,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 
        while (mask) {
                rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
-                                         PT_PAGE_TABLE_LEVEL, slot);
+                                         PG_LEVEL_4K, slot);
                __rmap_write_protect(kvm, rmap_head, false);
 
                /* clear the first set bit */
@@ -1708,7 +1710,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 
        while (mask) {
                rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
-                                         PT_PAGE_TABLE_LEVEL, slot);
+                                         PG_LEVEL_4K, slot);
                __rmap_clear_dirty(kvm, rmap_head);
 
                /* clear the first set bit */
@@ -1760,7 +1762,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
        int i;
        bool write_protected = false;
 
-       for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+       for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
                rmap_head = __gfn_to_rmap(gfn, i, slot);
                write_protected |= __rmap_write_protect(kvm, rmap_head, true);
        }
@@ -1948,8 +1950,8 @@ static int kvm_handle_hva_range(struct kvm *kvm,
                        gfn_start = hva_to_gfn_memslot(hva_start, memslot);
                        gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
-                       for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
-                                                PT_MAX_HUGEPAGE_LEVEL,
+                       for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
+                                                KVM_MAX_HUGEPAGE_LEVEL,
                                                 gfn_start, gfn_end - 1,
                                                 &iterator)
                                ret |= handler(kvm, iterator.rmap, memslot,
@@ -2153,10 +2155,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
-{
-}
-
 static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
                                 struct kvm_mmu_page *sp, u64 *spte,
                                 const void *pte)
@@ -2313,7 +2311,7 @@ static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
                return;
 
        if (local_flush)
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 }
 
 #ifdef CONFIG_KVM_MMU_AUDIT
@@ -2347,7 +2345,7 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
                if (!s->unsync)
                        continue;
 
-               WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+               WARN_ON(s->role.level != PG_LEVEL_4K);
                ret |= kvm_sync_page(vcpu, s, invalid_list);
        }
 
@@ -2376,7 +2374,7 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec,
                int level = sp->role.level;
 
                parents->idx[level-1] = idx;
-               if (level == PT_PAGE_TABLE_LEVEL)
+               if (level == PG_LEVEL_4K)
                        break;
 
                parents->parent[level-2] = sp;
@@ -2398,7 +2396,7 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec,
 
        sp = pvec->page[0].sp;
        level = sp->role.level;
-       WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+       WARN_ON(level == PG_LEVEL_4K);
 
        parents->parent[level-2] = sp;
 
@@ -2520,11 +2518,11 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                break;
 
                        WARN_ON(!list_empty(&invalid_list));
-                       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
                }
 
                if (sp->unsync_children)
-                       kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 
                __clear_sp_write_flooding_count(sp);
                trace_kvm_mmu_get_page(sp, false);
@@ -2546,11 +2544,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                 * be inconsistent with guest page table.
                 */
                account_shadowed(vcpu->kvm, sp);
-               if (level == PT_PAGE_TABLE_LEVEL &&
-                     rmap_write_protect(vcpu, gfn))
+               if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
                        kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
 
-               if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+               if (level > PG_LEVEL_4K && need_sync)
                        flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
        }
        clear_page(sp->spt);
@@ -2601,7 +2598,7 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
 
 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
 {
-       if (iterator->level < PT_PAGE_TABLE_LEVEL)
+       if (iterator->level < PG_LEVEL_4K)
                return false;
 
        iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
@@ -2722,7 +2719,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
        struct mmu_page_path parents;
        struct kvm_mmu_pages pages;
 
-       if (parent->role.level == PT_PAGE_TABLE_LEVEL)
+       if (parent->role.level == PG_LEVEL_4K)
                return 0;
 
        while (mmu_unsync_walk(parent, &pages)) {
@@ -2921,7 +2918,7 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
                if (sp->unsync)
                        continue;
 
-               WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+               WARN_ON(sp->role.level != PG_LEVEL_4K);
                kvm_unsync_page(vcpu, sp);
        }
 
@@ -3020,7 +3017,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        if (!speculative)
                spte |= spte_shadow_accessed_mask(spte);
 
-       if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+       if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
            is_nx_huge_page_enabled()) {
                pte_access &= ~ACC_EXEC_MASK;
        }
@@ -3033,7 +3030,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        if (pte_access & ACC_USER_MASK)
                spte |= shadow_user_mask;
 
-       if (level > PT_PAGE_TABLE_LEVEL)
+       if (level > PG_LEVEL_4K)
                spte |= PT_PAGE_SIZE_MASK;
        if (tdp_enabled)
                spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
@@ -3103,8 +3100,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
                 * the parent of the now unreachable PTE.
                 */
-               if (level > PT_PAGE_TABLE_LEVEL &&
-                   !is_large_pte(*sptep)) {
+               if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
                        struct kvm_mmu_page *child;
                        u64 pte = *sptep;
 
@@ -3125,7 +3121,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
                if (write_fault)
                        ret = RET_PF_EMULATE;
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
        }
 
        if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
@@ -3228,7 +3224,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
        if (sp_ad_disabled(sp))
                return;
 
-       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+       if (sp->role.level > PG_LEVEL_4K)
                return;
 
        __direct_pte_prefetch(vcpu, sp, sptep);
@@ -3241,12 +3237,8 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
        pte_t *pte;
        int level;
 
-       BUILD_BUG_ON(PT_PAGE_TABLE_LEVEL != (int)PG_LEVEL_4K ||
-                    PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M ||
-                    PT_PDPE_LEVEL != (int)PG_LEVEL_1G);
-
        if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
-               return PT_PAGE_TABLE_LEVEL;
+               return PG_LEVEL_4K;
 
        /*
         * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
@@ -3260,7 +3252,7 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
 
        pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
        if (unlikely(!pte))
-               return PT_PAGE_TABLE_LEVEL;
+               return PG_LEVEL_4K;
 
        return level;
 }
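
The BUILD_BUG_ON deleted above is what makes the mass rename in this file safe to do mechanically: KVM's private level constants were already numerically identical to the generic ones (and PT_MAX_HUGEPAGE_LEVEL likewise becomes KVM_MAX_HUGEPAGE_LEVEL), so every PT_*_LEVEL to PG_LEVEL_* change is a pure spelling change. The asserted mapping, restated as a sketch:

/* Equivalences the removed BUILD_BUG_ON checked; no behavioural change.  */
/*   PT_PAGE_TABLE_LEVEL == PG_LEVEL_4K   (4 KiB leaf pages)             */
/*   PT_DIRECTORY_LEVEL  == PG_LEVEL_2M   (2 MiB huge pages)             */
/*   PT_PDPE_LEVEL       == PG_LEVEL_1G   (1 GiB huge pages)             */
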
@@ -3274,28 +3266,28 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
        kvm_pfn_t mask;
        int level;
 
-       if (unlikely(max_level == PT_PAGE_TABLE_LEVEL))
-               return PT_PAGE_TABLE_LEVEL;
+       if (unlikely(max_level == PG_LEVEL_4K))
+               return PG_LEVEL_4K;
 
        if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
-               return PT_PAGE_TABLE_LEVEL;
+               return PG_LEVEL_4K;
 
        slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true);
        if (!slot)
-               return PT_PAGE_TABLE_LEVEL;
+               return PG_LEVEL_4K;
 
        max_level = min(max_level, max_page_level);
-       for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) {
+       for ( ; max_level > PG_LEVEL_4K; max_level--) {
                linfo = lpage_info_slot(gfn, slot, max_level);
                if (!linfo->disallow_lpage)
                        break;
        }
 
-       if (max_level == PT_PAGE_TABLE_LEVEL)
-               return PT_PAGE_TABLE_LEVEL;
+       if (max_level == PG_LEVEL_4K)
+               return PG_LEVEL_4K;
 
        level = host_pfn_mapping_level(vcpu, gfn, pfn, slot);
-       if (level == PT_PAGE_TABLE_LEVEL)
+       if (level == PG_LEVEL_4K)
                return level;
 
        level = min(level, max_level);
@@ -3317,7 +3309,7 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
        int level = *levelp;
        u64 spte = *it.sptep;
 
-       if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+       if (it.level == level && level > PG_LEVEL_4K &&
            is_nx_huge_page_enabled() &&
            is_shadow_present_pte(spte) &&
            !is_large_pte(spte)) {
@@ -3574,7 +3566,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                         *
                         * See the comments in kvm_arch_commit_memory_region().
                         */
-                       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+                       if (sp->role.level > PG_LEVEL_4K)
                                break;
                }
 
@@ -3666,7 +3658,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                                                           &invalid_list);
                        mmu->root_hpa = INVALID_PAGE;
                }
-               mmu->root_cr3 = 0;
+               mmu->root_pgd = 0;
        }
 
        kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -3686,58 +3678,64 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
        return ret;
 }
 
-static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
+static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
+                           u8 level, bool direct)
 {
        struct kvm_mmu_page *sp;
+
+       spin_lock(&vcpu->kvm->mmu_lock);
+
+       if (make_mmu_pages_available(vcpu)) {
+               spin_unlock(&vcpu->kvm->mmu_lock);
+               return INVALID_PAGE;
+       }
+       sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
+       ++sp->root_count;
+
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       return __pa(sp->spt);
+}
+
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
+{
+       u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
+       hpa_t root;
        unsigned i;
 
-       if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
-               spin_lock(&vcpu->kvm->mmu_lock);
-               if(make_mmu_pages_available(vcpu) < 0) {
-                       spin_unlock(&vcpu->kvm->mmu_lock);
+       if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+               root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+               if (!VALID_PAGE(root))
                        return -ENOSPC;
-               }
-               sp = kvm_mmu_get_page(vcpu, 0, 0,
-                               vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
-               ++sp->root_count;
-               spin_unlock(&vcpu->kvm->mmu_lock);
-               vcpu->arch.mmu->root_hpa = __pa(sp->spt);
-       } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
+               vcpu->arch.mmu->root_hpa = root;
+       } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
                for (i = 0; i < 4; ++i) {
-                       hpa_t root = vcpu->arch.mmu->pae_root[i];
+                       MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
 
-                       MMU_WARN_ON(VALID_PAGE(root));
-                       spin_lock(&vcpu->kvm->mmu_lock);
-                       if (make_mmu_pages_available(vcpu) < 0) {
-                               spin_unlock(&vcpu->kvm->mmu_lock);
+                       root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
+                                             i << 30, PT32_ROOT_LEVEL, true);
+                       if (!VALID_PAGE(root))
                                return -ENOSPC;
-                       }
-                       sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
-                                       i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
-                       root = __pa(sp->spt);
-                       ++sp->root_count;
-                       spin_unlock(&vcpu->kvm->mmu_lock);
                        vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
                }
                vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
        } else
                BUG();
 
-       /* root_cr3 is ignored for direct MMUs. */
-       vcpu->arch.mmu->root_cr3 = 0;
+       /* root_pgd is ignored for direct MMUs. */
+       vcpu->arch.mmu->root_pgd = 0;
 
        return 0;
 }
 
 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
-       struct kvm_mmu_page *sp;
        u64 pdptr, pm_mask;
-       gfn_t root_gfn, root_cr3;
+       gfn_t root_gfn, root_pgd;
+       hpa_t root;
        int i;
 
-       root_cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
-       root_gfn = root_cr3 >> PAGE_SHIFT;
+       root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
+       root_gfn = root_pgd >> PAGE_SHIFT;
 
        if (mmu_check_root(vcpu, root_gfn))
                return 1;
@@ -3747,22 +3745,14 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
         * write-protect the guests page table root.
         */
        if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
-               hpa_t root = vcpu->arch.mmu->root_hpa;
+               MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
-               MMU_WARN_ON(VALID_PAGE(root));
-
-               spin_lock(&vcpu->kvm->mmu_lock);
-               if (make_mmu_pages_available(vcpu) < 0) {
-                       spin_unlock(&vcpu->kvm->mmu_lock);
+               root = mmu_alloc_root(vcpu, root_gfn, 0,
+                                     vcpu->arch.mmu->shadow_root_level, false);
+               if (!VALID_PAGE(root))
                        return -ENOSPC;
-               }
-               sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-                               vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
-               root = __pa(sp->spt);
-               ++sp->root_count;
-               spin_unlock(&vcpu->kvm->mmu_lock);
                vcpu->arch.mmu->root_hpa = root;
-               goto set_root_cr3;
+               goto set_root_pgd;
        }
 
        /*
@@ -3775,9 +3765,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
        for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->arch.mmu->pae_root[i];
-
-               MMU_WARN_ON(VALID_PAGE(root));
+               MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
                if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
                        pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
                        if (!(pdptr & PT_PRESENT_MASK)) {
@@ -3788,17 +3776,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                        if (mmu_check_root(vcpu, root_gfn))
                                return 1;
                }
-               spin_lock(&vcpu->kvm->mmu_lock);
-               if (make_mmu_pages_available(vcpu) < 0) {
-                       spin_unlock(&vcpu->kvm->mmu_lock);
-                       return -ENOSPC;
-               }
-               sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
-                                     0, ACC_ALL);
-               root = __pa(sp->spt);
-               ++sp->root_count;
-               spin_unlock(&vcpu->kvm->mmu_lock);
 
+               root = mmu_alloc_root(vcpu, root_gfn, i << 30,
+                                     PT32_ROOT_LEVEL, false);
+               if (!VALID_PAGE(root))
+                       return -ENOSPC;
                vcpu->arch.mmu->pae_root[i] = root | pm_mask;
        }
        vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
@@ -3828,8 +3810,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
        }
 
-set_root_cr3:
-       vcpu->arch.mmu->root_cr3 = root_cr3;
+set_root_pgd:
+       vcpu->arch.mmu->root_pgd = root_pgd;
 
        return 0;
 }
@@ -4083,18 +4065,16 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                         gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
                         bool *writable)
 {
-       struct kvm_memory_slot *slot;
+       struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        bool async;
 
-       /*
-        * Don't expose private memslots to L2.
-        */
-       if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+       /* Don't expose private memslots to L2. */
+       if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
                *pfn = KVM_PFN_NOSLOT;
+               *writable = false;
                return false;
        }
 
-       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        async = false;
        *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
        if (!async)
@@ -4135,7 +4115,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                return r;
 
        if (lpage_disallowed)
-               max_level = PT_PAGE_TABLE_LEVEL;
+               max_level = PG_LEVEL_4K;
 
        if (fast_page_fault(vcpu, gpa, error_code))
                return RET_PF_RETRY;
@@ -4171,7 +4151,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
 
        /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
        return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
-                                PT_DIRECTORY_LEVEL, false);
+                                PG_LEVEL_2M, false);
 }
 
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
@@ -4217,8 +4197,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 {
        int max_level;
 
-       for (max_level = PT_MAX_HUGEPAGE_LEVEL;
-            max_level > PT_PAGE_TABLE_LEVEL;
+       for (max_level = KVM_MAX_HUGEPAGE_LEVEL;
+            max_level > PG_LEVEL_4K;
             max_level--) {
                int page_num = KVM_PAGES_PER_HPAGE(max_level);
                gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
@@ -4237,7 +4217,7 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
        context->page_fault = nonpaging_page_fault;
        context->gva_to_gpa = nonpaging_gva_to_gpa;
        context->sync_page = nonpaging_sync_page;
-       context->invlpg = nonpaging_invlpg;
+       context->invlpg = NULL;
        context->update_pte = nonpaging_update_pte;
        context->root_level = 0;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
@@ -4245,51 +4225,50 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
        context->nx = false;
 }
 
-static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t cr3,
+static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
                                  union kvm_mmu_page_role role)
 {
-       return (role.direct || cr3 == root->cr3) &&
+       return (role.direct || pgd == root->pgd) &&
               VALID_PAGE(root->hpa) && page_header(root->hpa) &&
               role.word == page_header(root->hpa)->role.word;
 }
 
 /*
- * Find out if a previously cached root matching the new CR3/role is available.
+ * Find out if a previously cached root matching the new pgd/role is available.
  * The current root is also inserted into the cache.
  * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
  * returned.
  * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
  * false is returned. This root should now be freed by the caller.
  */
-static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd,
                                  union kvm_mmu_page_role new_role)
 {
        uint i;
        struct kvm_mmu_root_info root;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
 
-       root.cr3 = mmu->root_cr3;
+       root.pgd = mmu->root_pgd;
        root.hpa = mmu->root_hpa;
 
-       if (is_root_usable(&root, new_cr3, new_role))
+       if (is_root_usable(&root, new_pgd, new_role))
                return true;
 
        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
                swap(root, mmu->prev_roots[i]);
 
-               if (is_root_usable(&root, new_cr3, new_role))
+               if (is_root_usable(&root, new_pgd, new_role))
                        break;
        }
 
        mmu->root_hpa = root.hpa;
-       mmu->root_cr3 = root.cr3;
+       mmu->root_pgd = root.pgd;
 
        return i < KVM_MMU_NUM_PREV_ROOTS;
 }
 
-static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
-                           union kvm_mmu_page_role new_role,
-                           bool skip_tlb_flush)
+static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
+                           union kvm_mmu_page_role new_role)
 {
        struct kvm_mmu *mmu = vcpu->arch.mmu;
 
@@ -4299,70 +4278,59 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
         * later if necessary.
         */
        if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
-           mmu->root_level >= PT64_ROOT_4LEVEL) {
-               if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
-                       return false;
-
-               if (cached_root_available(vcpu, new_cr3, new_role)) {
-                       /*
-                        * It is possible that the cached previous root page is
-                        * obsolete because of a change in the MMU generation
-                        * number. However, changing the generation number is
-                        * accompanied by KVM_REQ_MMU_RELOAD, which will free
-                        * the root set here and allocate a new one.
-                        */
-                       kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
-                       if (!skip_tlb_flush) {
-                               kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-                               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-                       }
-
-                       /*
-                        * The last MMIO access's GVA and GPA are cached in the
-                        * VCPU. When switching to a new CR3, that GVA->GPA
-                        * mapping may no longer be valid. So clear any cached
-                        * MMIO info even when we don't need to sync the shadow
-                        * page tables.
-                        */
-                       vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
-
-                       __clear_sp_write_flooding_count(
-                               page_header(mmu->root_hpa));
-
-                       return true;
-               }
-       }
+           mmu->root_level >= PT64_ROOT_4LEVEL)
+               return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) &&
+                      cached_root_available(vcpu, new_pgd, new_role);
 
        return false;
 }
 
-static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
                              union kvm_mmu_page_role new_role,
-                             bool skip_tlb_flush)
+                             bool skip_tlb_flush, bool skip_mmu_sync)
 {
-       if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
-               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
-                                  KVM_MMU_ROOT_CURRENT);
+       if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
+               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
+               return;
+       }
+
+       /*
+        * It's possible that the cached previous root page is obsolete because
+        * of a change in the MMU generation number. However, changing the
+        * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will
+        * free the root set here and allocate a new one.
+        */
+       kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
+
+       if (!skip_mmu_sync || force_flush_and_sync_on_reuse)
+               kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+       if (!skip_tlb_flush || force_flush_and_sync_on_reuse)
+               kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+
+       /*
+        * The last MMIO access's GVA and GPA are cached in the VCPU. When
+        * switching to a new CR3, that GVA->GPA mapping may no longer be
+        * valid. So clear any cached MMIO info even when we don't need to sync
+        * the shadow page tables.
+        */
+       vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+       __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa));
 }
 
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
+                    bool skip_mmu_sync)
 {
-       __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
-                         skip_tlb_flush);
+       __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu),
+                         skip_tlb_flush, skip_mmu_sync);
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
+EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
 
 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
 {
        return kvm_read_cr3(vcpu);
 }
 
-static void inject_page_fault(struct kvm_vcpu *vcpu,
-                             struct x86_exception *fault)
-{
-       vcpu->arch.mmu->inject_page_fault(vcpu, fault);
-}
-
 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
                           unsigned int access, int *nr_present)
 {
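
With separate skip_tlb_flush and skip_mmu_sync arguments, a caller that already knows the TLB semantics of a root switch (a CR3 write with the PCID no-flush bit, or the EPTP load shown further down) can suppress the flush and the sync independently, unless the flush_on_reuse module parameter added above forces both. A hypothetical caller, not taken from this patch:

/* Hypothetical CR3-write path using the kvm_mmu_new_pgd() added above. */
static void example_load_new_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
        /* CR3 bit 63 (X86_CR3_PCID_NOFLUSH) requests a no-flush switch. */
        bool skip = !!(cr3 & X86_CR3_PCID_NOFLUSH);

        kvm_mmu_new_pgd(vcpu, cr3 & ~X86_CR3_PCID_NOFLUSH,
                        /* skip_tlb_flush */ skip, /* skip_mmu_sync */ skip);
}
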
@@ -4391,11 +4359,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
        gpte &= level - mmu->last_nonleaf_level;
 
        /*
-        * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
-        * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
-        * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+        * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
+        * iff level <= PG_LEVEL_4K, which for our purpose means
+        * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
         */
-       gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+       gpte |= level - PG_LEVEL_4K - 1;
 
        return gpte & PT_PAGE_SIZE_MASK;
 }
@@ -4909,7 +4877,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
        union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
 
        role.base.ad_disabled = (shadow_accessed_mask == 0);
-       role.base.level = kvm_x86_ops.get_tdp_level(vcpu);
+       role.base.level = vcpu->arch.tdp_level;
        role.base.direct = true;
        role.base.gpte_is_8_bytes = true;
 
@@ -4928,9 +4896,9 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        context->mmu_role.as_u64 = new_role.as_u64;
        context->page_fault = kvm_tdp_page_fault;
        context->sync_page = nonpaging_sync_page;
-       context->invlpg = nonpaging_invlpg;
+       context->invlpg = NULL;
        context->update_pte = nonpaging_update_pte;
-       context->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu);
+       context->shadow_root_level = vcpu->arch.tdp_level;
        context->direct_map = true;
        context->get_guest_pgd = get_cr3;
        context->get_pdptr = kvm_pdptr_read;
@@ -5047,7 +5015,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
                kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
                                                   execonly, level);
 
-       __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
+       __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true);
 
        if (new_role.as_u64 == context->mmu_role.as_u64)
                return;
@@ -5096,6 +5064,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
        g_context->get_pdptr         = kvm_pdptr_read;
        g_context->inject_page_fault = kvm_inject_page_fault;
 
+       /*
+        * L2 page tables are never shadowed, so there is no need to sync
+        * SPTEs.
+        */
+       g_context->invlpg            = NULL;
+
        /*
         * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
         * L1's nested page tables (e.g. EPT12). The nested translation
@@ -5183,7 +5157,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
        if (r)
                goto out;
        kvm_mmu_load_pgd(vcpu);
-       kvm_x86_ops.tlb_flush(vcpu, true);
+       kvm_x86_ops.tlb_flush_current(vcpu);
 out:
        return r;
 }
@@ -5202,7 +5176,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu_page *sp, u64 *spte,
                                  const void *new)
 {
-       if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+       if (sp->role.level != PG_LEVEL_4K) {
                ++vcpu->kvm->stat.mmu_pde_zapped;
                return;
         }
@@ -5260,7 +5234,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
         * Skip write-flooding detected for the sp whose level is 1, because
         * it can become unsync, then the guest page is not write-protected.
         */
-       if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+       if (sp->role.level == PG_LEVEL_4K)
                return false;
 
        atomic_inc(&sp->write_flooding_count);
@@ -5497,37 +5471,54 @@ emulate:
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
-void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+                           gva_t gva, hpa_t root_hpa)
 {
-       struct kvm_mmu *mmu = vcpu->arch.mmu;
        int i;
 
-       /* INVLPG on a * non-canonical address is a NOP according to the SDM.  */
-       if (is_noncanonical_address(gva, vcpu))
+       /* It's actually a GPA for vcpu->arch.guest_mmu.  */
+       if (mmu != &vcpu->arch.guest_mmu) {
+               /* INVLPG on a non-canonical address is a NOP according to the SDM.  */
+               if (is_noncanonical_address(gva, vcpu))
+                       return;
+
+               kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+       }
+
+       if (!mmu->invlpg)
                return;
 
-       mmu->invlpg(vcpu, gva, mmu->root_hpa);
+       if (root_hpa == INVALID_PAGE) {
+               mmu->invlpg(vcpu, gva, mmu->root_hpa);
 
-       /*
-        * INVLPG is required to invalidate any global mappings for the VA,
-        * irrespective of PCID. Since it would take us roughly similar amount
-        * of work to determine whether any of the prev_root mappings of the VA
-        * is marked global, or to just sync it blindly, so we might as well
-        * just always sync it.
-        *
-        * Mappings not reachable via the current cr3 or the prev_roots will be
-        * synced when switching to that cr3, so nothing needs to be done here
-        * for them.
-        */
-       for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-               if (VALID_PAGE(mmu->prev_roots[i].hpa))
-                       mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+               /*
+                * INVLPG is required to invalidate any global mappings for the VA,
+                * irrespective of PCID.  Since it would take roughly the same amount
+                * of work to determine whether any of the prev_root mappings of the
+                * VA is marked global as to just sync it blindly, we might as well
+                * just always sync it.
+                *
+                * Mappings not reachable via the current cr3 or the prev_roots will be
+                * synced when switching to that cr3, so nothing needs to be done here
+                * for them.
+                */
+               for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+                       if (VALID_PAGE(mmu->prev_roots[i].hpa))
+                               mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+       } else {
+               mmu->invlpg(vcpu, gva, root_hpa);
+       }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva);
 
-       kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);
        ++vcpu->stat.invlpg;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
 
+
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 {
        struct kvm_mmu *mmu = vcpu->arch.mmu;
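
After this split, kvm_mmu_invlpg() keeps its old behaviour (invalidate the GVA in the current root and every cached previous root, then flush the hardware TLB entry), while kvm_mmu_invalidate_gva() additionally lets a caller target one specific root. A hypothetical use, not taken from this patch:

/* Hypothetical: scrub a single GVA from the currently loaded root only,
 * leaving the cached prev_roots untouched. */
static void example_invlpg_current_root(struct kvm_vcpu *vcpu, gva_t gva)
{
        kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva,
                               vcpu->arch.mmu->root_hpa);
}
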
@@ -5541,7 +5532,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 
        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
                if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
-                   pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
+                   pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
                        mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
                        tlb_flush = true;
                }
@@ -5574,9 +5565,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_page_level)
        if (tdp_enabled)
                max_page_level = tdp_page_level;
        else if (boot_cpu_has(X86_FEATURE_GBPAGES))
-               max_page_level = PT_PDPE_LEVEL;
+               max_page_level = PG_LEVEL_1G;
        else
-               max_page_level = PT_DIRECTORY_LEVEL;
+               max_page_level = PG_LEVEL_2M;
 }
 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
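
kvm_configure_mmu() is the hook through which a vendor module reports both whether TDP is enabled and the largest page size its TDP implementation supports; the GBPAGES/2M fallbacks only apply to shadow paging. A hypothetical call, not taken from this patch:

/* Hypothetical vendor-module setup: TDP in use, 1 GiB mappings supported. */
static void example_hardware_setup(bool tdp_supported)
{
        kvm_configure_mmu(tdp_supported, PG_LEVEL_1G);
}
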
 
@@ -5632,24 +5623,24 @@ static __always_inline bool
 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
                      slot_level_handler fn, bool lock_flush_tlb)
 {
-       return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-                                PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+       return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
+                                KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
 static __always_inline bool
 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        slot_level_handler fn, bool lock_flush_tlb)
 {
-       return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
-                                PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+       return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1,
+                                KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
 static __always_inline bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
                 slot_level_handler fn, bool lock_flush_tlb)
 {
-       return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-                                PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+       return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
+                                PG_LEVEL_4K, lock_flush_tlb);
 }
 
 static void free_mmu_pages(struct kvm_mmu *mmu)
@@ -5672,7 +5663,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
         * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
         * skip allocating the PDP table.
         */
-       if (tdp_enabled && kvm_x86_ops.get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
+       if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL)
                return 0;
 
        page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
@@ -5695,13 +5686,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
        vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 
        vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
-       vcpu->arch.root_mmu.root_cr3 = 0;
+       vcpu->arch.root_mmu.root_pgd = 0;
        vcpu->arch.root_mmu.translate_gpa = translate_gpa;
        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
                vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
 
        vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
-       vcpu->arch.guest_mmu.root_cr3 = 0;
+       vcpu->arch.guest_mmu.root_pgd = 0;
        vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
                vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
@@ -5859,7 +5850,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
                                continue;
 
                        slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
-                                               PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
+                                               PG_LEVEL_4K,
+                                               KVM_MAX_HUGEPAGE_LEVEL,
                                                start, end - 1, true);
                }
        }
@@ -5881,7 +5873,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 
        spin_lock(&kvm->mmu_lock);
        flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
-                               start_level, PT_MAX_HUGEPAGE_LEVEL, false);
+                               start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
        spin_unlock(&kvm->mmu_lock);
 
        /*
arch/x86/kvm/mmu/page_track.c
index ddc1ec3..a7bcde3 100644 (file)
@@ -61,7 +61,7 @@ static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
 {
        int index, val;
 
-       index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+       index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
 
        val = slot->arch.gfn_track[mode][index];
 
@@ -151,7 +151,7 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
        if (!slot)
                return false;
 
-       index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+       index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
        return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
 }
 
arch/x86/kvm/mmu/paging_tmpl.h
index 9bdf9b7..38c5764 100644 (file)
@@ -75,7 +75,7 @@
 #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
 
 #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
-#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
+#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K)
 
 /*
  * The guest_walker structure emulates the behavior of the hardware page
@@ -198,7 +198,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
            !(gpte & PT_GUEST_ACCESSED_MASK))
                goto no_present;
 
-       if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+       if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K))
                goto no_present;
 
        return false;
@@ -436,7 +436,7 @@ retry_walk:
        gfn = gpte_to_gfn_lvl(pte, walker->level);
        gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
 
-       if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
+       if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
                gfn += pse36_gfn_delta(pte);
 
        real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
@@ -552,7 +552,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
         * we call mmu_set_spte() with host_writable = true because
         * pte_prefetch_gfn_to_pfn always gets a writable pfn.
         */
-       mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
+       mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn,
                     true, true);
 
        kvm_release_pfn_clean(pfn);
@@ -575,7 +575,7 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
        u64 mask;
        int r, index;
 
-       if (level == PT_PAGE_TABLE_LEVEL) {
+       if (level == PG_LEVEL_4K) {
                mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
                base_gpa = pte_gpa & ~mask;
                index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
@@ -600,7 +600,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 
        sp = page_header(__pa(sptep));
 
-       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+       if (sp->role.level > PG_LEVEL_4K)
                return;
 
        if (sp->role.direct)
@@ -812,7 +812,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
        if (!r) {
                pgprintk("%s: guest page fault\n", __func__);
                if (!prefault)
-                       inject_page_fault(vcpu, &walker.fault);
+                       kvm_inject_emulated_page_fault(vcpu, &walker.fault);
 
                return RET_PF_RETRY;
        }
@@ -828,7 +828,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
              &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
 
        if (lpage_disallowed || is_self_change_mapping)
-               max_level = PT_PAGE_TABLE_LEVEL;
+               max_level = PG_LEVEL_4K;
        else
                max_level = walker.level;
 
@@ -884,7 +884,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
 {
        int offset = 0;
 
-       WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+       WARN_ON(sp->role.level != PG_LEVEL_4K);
 
        if (PTTYPE == 32)
                offset = sp->role.quadrant << PT64_LEVEL_BITS;
@@ -1070,7 +1070,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
 
                set_spte_ret |= set_spte(vcpu, &sp->spt[i],
-                                        pte_access, PT_PAGE_TABLE_LEVEL,
+                                        pte_access, PG_LEVEL_4K,
                                         gfn, spte_to_pfn(sp->spt[i]),
                                         true, false, host_writable);
        }
arch/x86/kvm/mmu_audit.c
index ca39f62..9d2844f 100644 (file)
@@ -100,7 +100,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
        sp = page_header(__pa(sptep));
 
        if (sp->unsync) {
-               if (level != PT_PAGE_TABLE_LEVEL) {
+               if (level != PG_LEVEL_4K) {
                        audit_printk(vcpu->kvm, "unsync sp: %p "
                                     "level = %d\n", sp, level);
                        return;
@@ -176,7 +176,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        int i;
 
-       if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+       if (sp->role.level != PG_LEVEL_4K)
                return;
 
        for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
@@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, sp->gfn);
-       rmap_head = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
+       rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
 
        for_each_rmap_spte(rmap_head, &iter, sptep) {
                if (is_writable_pte(*sptep))
arch/x86/kvm/svm/nested.c
index 90a1ca9..a89a166 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 
 #include <asm/msr-index.h>
+#include <asm/debugreg.h>
 
 #include "kvm_emulate.h"
 #include "trace.h"
@@ -85,7 +86,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
        vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
        vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
        vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
-       vcpu->arch.mmu->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu);
+       vcpu->arch.mmu->shadow_root_level = vcpu->arch.tdp_level;
        reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 }
@@ -207,6 +208,10 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
        if ((vmcb->save.efer & EFER_SVME) == 0)
                return false;
 
+       if (((vmcb->save.cr0 & X86_CR0_CD) == 0) &&
+           (vmcb->save.cr0 & X86_CR0_NW))
+               return false;
+
        if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
                return false;
 
@@ -267,7 +272,7 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
        svm->vmcb->save.rsp = nested_vmcb->save.rsp;
        svm->vmcb->save.rip = nested_vmcb->save.rip;
        svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
-       svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
+       svm->vcpu.arch.dr6  = nested_vmcb->save.dr6;
        svm->vmcb->save.cpl = nested_vmcb->save.cpl;
 
        svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
@@ -279,7 +284,7 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
        svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
        svm->nested.intercept            = nested_vmcb->control.intercept;
 
-       svm_flush_tlb(&svm->vcpu, true);
+       svm_flush_tlb(&svm->vcpu);
        svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
        if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
                svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -341,8 +346,12 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
        struct kvm_host_map map;
        u64 vmcb_gpa;
 
-       vmcb_gpa = svm->vmcb->save.rax;
+       if (is_smm(&svm->vcpu)) {
+               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+               return 1;
+       }
 
+       vmcb_gpa = svm->vmcb->save.rax;
        ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
        if (ret == -EINVAL) {
                kvm_inject_gp(&svm->vcpu, 0);
@@ -405,6 +414,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 
        copy_vmcb_control_area(hsave, vmcb);
 
+       svm->nested.nested_run_pending = 1;
        enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
 
        if (!nested_svm_vmrun_msrpm(svm)) {
@@ -463,6 +473,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        leave_guest_mode(&svm->vcpu);
        svm->nested.vmcb = 0;
 
+       /* in case we halted in L2 */
+       svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
        /* Give the current vmcb to the guest */
        disable_gif(svm);
 
@@ -482,7 +495,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        nested_vmcb->save.rsp    = vmcb->save.rsp;
        nested_vmcb->save.rax    = vmcb->save.rax;
        nested_vmcb->save.dr7    = vmcb->save.dr7;
-       nested_vmcb->save.dr6    = vmcb->save.dr6;
+       nested_vmcb->save.dr6    = svm->vcpu.arch.dr6;
        nested_vmcb->save.cpl    = vmcb->save.cpl;
 
        nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
@@ -606,26 +619,45 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 /* DB exceptions for our internal use must not cause vmexit */
 static int nested_svm_intercept_db(struct vcpu_svm *svm)
 {
-       unsigned long dr6;
+       unsigned long dr6 = svm->vmcb->save.dr6;
+
+       /* Always catch it and pass it to userspace if debugging.  */
+       if (svm->vcpu.guest_debug &
+           (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+               return NESTED_EXIT_HOST;
 
        /* if we're not singlestepping, it's not ours */
        if (!svm->nmi_singlestep)
-               return NESTED_EXIT_DONE;
+               goto reflected_db;
 
        /* if it's not a singlestep exception, it's not ours */
-       if (kvm_get_dr(&svm->vcpu, 6, &dr6))
-               return NESTED_EXIT_DONE;
        if (!(dr6 & DR6_BS))
-               return NESTED_EXIT_DONE;
+               goto reflected_db;
 
        /* if the guest is singlestepping, it should get the vmexit */
        if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
                disable_nmi_singlestep(svm);
-               return NESTED_EXIT_DONE;
+               goto reflected_db;
        }
 
        /* it's ours, the nested hypervisor must not see this one */
        return NESTED_EXIT_HOST;
+
+reflected_db:
+       /*
+        * Synchronize guest DR6 here just like in kvm_deliver_exception_payload;
+        * it will be moved into the nested VMCB by nested_svm_vmexit.  Once
+        * exception handling is moved to svm_check_nested_events, all of this
+        * will go away and we can simply return NESTED_EXIT_HOST
+        * unconditionally.  db_interception will queue the exception, which
+        * will be processed by svm_check_nested_events if a nested vmexit is
+        * required, and we will just use kvm_deliver_exception_payload to copy
+        * the payload to DR6 before vmexit.
+        */
+       WARN_ON(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT);
+       svm->vcpu.arch.dr6 &= ~(DR_TRAP_BITS | DR6_RTM);
+       svm->vcpu.arch.dr6 |= dr6 & ~DR6_FIXED_1;
+       return NESTED_EXIT_DONE;
 }
 
 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
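
The DR6 update in the reflected_db path above is easier to follow with the masks written out. A standalone sketch of the same merge; the numeric constants are assumptions based on the architectural DR6 layout, not values taken from this diff:

#include <stdint.h>
#include <stdio.h>

#define DR6_FIXED_1_SKETCH	0xfffe0ff0u	/* bits that always read as 1 (assumed) */
#define DR6_RTM_SKETCH		0x00010000u	/* active-low RTM bit (assumed)         */
#define DR_TRAP_BITS_SKETCH	0x0000000fu	/* B0-B3 (assumed)                      */

/* Mirrors the update in nested_svm_intercept_db(): drop stale trap bits from
 * the guest-visible DR6, then merge in the newly reported payload bits. */
static uint32_t merge_guest_dr6(uint32_t guest_dr6, uint32_t hw_dr6)
{
	guest_dr6 &= ~(DR_TRAP_BITS_SKETCH | DR6_RTM_SKETCH);
	guest_dr6 |= hw_dr6 & ~DR6_FIXED_1_SKETCH;
	return guest_dr6;
}

int main(void)
{
	/* Start from a "clean" DR6 and report a single-step trap (BS, bit 14). */
	uint32_t clean = DR6_FIXED_1_SKETCH | DR6_RTM_SKETCH;	/* 0xffff0ff0 */

	printf("%#x\n", merge_guest_dr6(clean, clean | (1u << 14)));
	return 0;
}
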
@@ -682,6 +714,9 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
                if (svm->nested.intercept_exceptions & excp_bits) {
                        if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
                                vmexit = nested_svm_intercept_db(svm);
+                       else if (exit_code == SVM_EXIT_EXCP_BASE + BP_VECTOR &&
+                                svm->vcpu.guest_debug & KVM_GUESTDBG_USE_SW_BP)
+                               vmexit = NESTED_EXIT_HOST;
                        else
                                vmexit = NESTED_EXIT_DONE;
                }
@@ -764,31 +799,65 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
        return vmexit;
 }
 
-static void nested_svm_intr(struct vcpu_svm *svm)
+static void nested_svm_smi(struct vcpu_svm *svm)
 {
-       svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
+       svm->vmcb->control.exit_code = SVM_EXIT_SMI;
        svm->vmcb->control.exit_info_1 = 0;
        svm->vmcb->control.exit_info_2 = 0;
 
-       /* nested_svm_vmexit this gets called afterwards from handle_exit */
-       svm->nested.exit_required = true;
-       trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+       nested_svm_vmexit(svm);
+}
+
+static void nested_svm_nmi(struct vcpu_svm *svm)
+{
+       svm->vmcb->control.exit_code = SVM_EXIT_NMI;
+       svm->vmcb->control.exit_info_1 = 0;
+       svm->vmcb->control.exit_info_2 = 0;
+
+       nested_svm_vmexit(svm);
 }
 
-static bool nested_exit_on_intr(struct vcpu_svm *svm)
+static void nested_svm_intr(struct vcpu_svm *svm)
 {
-       return (svm->nested.intercept & 1ULL);
+       trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+
+       svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
+       svm->vmcb->control.exit_info_1 = 0;
+       svm->vmcb->control.exit_info_2 = 0;
+
+       nested_svm_vmexit(svm);
 }
 
-int svm_check_nested_events(struct kvm_vcpu *vcpu)
+static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        bool block_nested_events =
-               kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required;
+               kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required ||
+               svm->nested.nested_run_pending;
+
+       if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
+               if (block_nested_events)
+                       return -EBUSY;
+               if (!nested_exit_on_smi(svm))
+                       return 0;
+               nested_svm_smi(svm);
+               return 0;
+       }
 
-       if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) {
+       if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
                if (block_nested_events)
                        return -EBUSY;
+               if (!nested_exit_on_nmi(svm))
+                       return 0;
+               nested_svm_nmi(svm);
+               return 0;
+       }
+
+       if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
+               if (block_nested_events)
+                       return -EBUSY;
+               if (!nested_exit_on_intr(svm))
+                       return 0;
                nested_svm_intr(svm);
                return 0;
        }
@@ -821,3 +890,7 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
 
        return NESTED_EXIT_CONTINUE;
 }
+
+struct kvm_x86_nested_ops svm_nested_ops = {
+       .check_events = svm_check_nested_events,
+};
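
svm_nested_ops is the SVM instance of the kvm_x86_nested_ops table introduced by this series; the generic x86 code is expected to call check_events() through it rather than through a dedicated kvm_x86_ops hook. The actual call site lives in x86.c elsewhere in this merge, so the standalone mock below only illustrates the ops-table shape; every name in it is invented:

#include <stdio.h>

/*
 * Mock of the ops-table pattern: vendor code fills in a check_events
 * callback, generic code calls through the pointer.  The "0 on success,
 * negative when events must wait" convention mirrors the -EBUSY returns
 * visible in svm_check_nested_events() above.
 */
struct sketch_vcpu { int events_blocked; };

struct sketch_nested_ops {
	int (*check_events)(struct sketch_vcpu *vcpu);
};

static int sketch_svm_check_nested_events(struct sketch_vcpu *vcpu)
{
	return vcpu->events_blocked ? -16 /* -EBUSY */ : 0;
}

static const struct sketch_nested_ops sketch_svm_nested_ops = {
	.check_events = sketch_svm_check_nested_events,
};

int main(void)
{
	struct sketch_vcpu vcpu = { .events_blocked = 0 };

	printf("%d\n", sketch_svm_nested_ops.check_events(&vcpu));
	return 0;
}
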
index 38f6aee..4e9cd2a 100644 (file)
@@ -33,6 +33,7 @@
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
 #include <asm/irq_remapping.h>
+#include <asm/mce.h>
 #include <asm/spec-ctrl.h>
 #include <asm/cpu_device_id.h>
 
@@ -318,9 +319,6 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
                if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
                        return 0;
        } else {
-               if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
-                       pr_err("%s: ip 0x%lx next 0x%llx\n",
-                              __func__, kvm_rip_read(vcpu), svm->next_rip);
                kvm_rip_write(vcpu, svm->next_rip);
        }
        svm_set_interrupt_shadow(vcpu, 0);
@@ -890,7 +888,7 @@ static __init int svm_hardware_setup(void)
        if (npt_enabled && !npt)
                npt_enabled = false;
 
-       kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL);
+       kvm_configure_mmu(npt_enabled, PG_LEVEL_1G);
        pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
 
        if (nrips) {
@@ -953,16 +951,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
        seg->base = 0;
 }
 
-static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (is_guest_mode(vcpu))
-               return svm->nested.hsave->control.tsc_offset;
-
-       return vcpu->arch.tsc_offset;
-}
-
 static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1364,12 +1352,13 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
        }
 }
 
-static inline void svm_enable_vintr(struct vcpu_svm *svm)
+static void svm_set_vintr(struct vcpu_svm *svm)
 {
        struct vmcb_control_area *control;
 
        /* The following fields are ignored when AVIC is enabled */
        WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
+       set_intercept(svm, INTERCEPT_VINTR);
 
        /*
         * This is just a dummy VINTR to actually cause a vmexit to happen.
@@ -1383,13 +1372,6 @@ static inline void svm_enable_vintr(struct vcpu_svm *svm)
        mark_dirty(svm->vmcb, VMCB_INTR);
 }
 
-static void svm_set_vintr(struct vcpu_svm *svm)
-{
-       set_intercept(svm, INTERCEPT_VINTR);
-       if (is_intercept(svm, INTERCEPT_VINTR))
-               svm_enable_vintr(svm);
-}
-
 static void svm_clear_vintr(struct vcpu_svm *svm)
 {
        clr_intercept(svm, INTERCEPT_VINTR);
@@ -1533,14 +1515,6 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
        mark_dirty(svm->vmcb, VMCB_DT);
 }
 
-static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
-}
-
 static void update_cr0_intercept(struct vcpu_svm *svm)
 {
        ulong gcr0 = svm->vcpu.arch.cr0;
@@ -1603,7 +1577,7 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                return 1;
 
        if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
-               svm_flush_tlb(vcpu, true);
+               svm_flush_tlb(vcpu);
 
        vcpu->arch.cr4 = cr4;
        if (!npt_enabled)
@@ -1672,17 +1646,14 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
        mark_dirty(svm->vmcb, VMCB_ASID);
 }
 
-static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
-{
-       return to_svm(vcpu)->vmcb->save.dr6;
-}
-
-static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
+static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *vmcb = svm->vmcb;
 
-       svm->vmcb->save.dr6 = value;
-       mark_dirty(svm->vmcb, VMCB_DR);
+       if (unlikely(value != vmcb->save.dr6)) {
+               vmcb->save.dr6 = value;
+               mark_dirty(vmcb, VMCB_DR);
+       }
 }
 
 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
@@ -1693,9 +1664,12 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
        get_debugreg(vcpu->arch.db[1], 1);
        get_debugreg(vcpu->arch.db[2], 2);
        get_debugreg(vcpu->arch.db[3], 3);
-       vcpu->arch.dr6 = svm_get_dr6(vcpu);
+       /*
+        * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here,
+        * because db_interception might need it.  We can do it before vmentry.
+        */
+       vcpu->arch.dr6 = svm->vmcb->save.dr6;
        vcpu->arch.dr7 = svm->vmcb->save.dr7;
-
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
        set_dr_intercepts(svm);
 }
@@ -1739,7 +1713,8 @@ static int db_interception(struct vcpu_svm *svm)
        if (!(svm->vcpu.guest_debug &
              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
                !svm->nmi_singlestep) {
-               kvm_queue_exception(&svm->vcpu, DB_VECTOR);
+               u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1;
+               kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
                return 1;
        }
 
@@ -1841,6 +1816,25 @@ static bool is_erratum_383(void)
        return true;
 }
 
+/*
+ * Trigger a machine check on the host. We assume all the MSRs are already set
+ * up by the CPU and that we are still running on the CPU the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE)
+       struct pt_regs regs = {
+               .cs = 3, /* Fake ring 3 no matter what the guest ran on */
+               .flags = X86_EFLAGS_IF,
+       };
+
+       do_machine_check(&regs, 0);
+#endif
+}
+
 static void svm_handle_mce(struct vcpu_svm *svm)
 {
        if (is_erratum_383()) {
@@ -1859,11 +1853,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
         * On an #MC intercept the MCE handler is not called automatically in
         * the host. So do it by hand here.
         */
-       asm volatile (
-               "int $0x12\n");
-       /* not sure if we ever come back to this point */
-
-       return;
+       kvm_machine_check();
 }
 
 static int mc_interception(struct vcpu_svm *svm)
@@ -2670,8 +2660,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
         */
        svm_toggle_avic_for_irq_window(&svm->vcpu, true);
 
-       svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
-       mark_dirty(svm->vmcb, VMCB_INTR);
        ++svm->vcpu.stat.irq_window_exits;
        return 1;
 }
@@ -2897,8 +2885,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
        *info2 = control->exit_info_2;
 }
 
-static int handle_exit(struct kvm_vcpu *vcpu,
-       enum exit_fastpath_completion exit_fastpath)
+static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        struct kvm_run *kvm_run = vcpu->run;
@@ -2956,10 +2943,10 @@ static int handle_exit(struct kvm_vcpu *vcpu,
                       __func__, svm->vmcb->control.exit_int_info,
                       exit_code);
 
-       if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
-               kvm_skip_emulated_instruction(vcpu);
+       if (exit_fastpath != EXIT_FASTPATH_NONE)
                return 1;
-       } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
+
+       if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
            || !svm_exit_handlers[exit_code]) {
                vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
                dump_vmcb(vcpu);
@@ -3048,18 +3035,37 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
                set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
-static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
+bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb = svm->vmcb;
-       int ret;
-       ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-             !(svm->vcpu.arch.hflags & HF_NMI_MASK);
-       ret = ret && gif_set(svm) && nested_svm_nmi(svm);
+       bool ret;
+
+       if (!gif_set(svm))
+               return true;
+
+       if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
+               return false;
+
+       ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
+             (svm->vcpu.arch.hflags & HF_NMI_MASK);
 
        return ret;
 }
 
+static bool svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       if (svm->nested.nested_run_pending)
+               return false;
+
+       /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
+       if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
+               return false;
+
+       return !svm_nmi_blocked(vcpu);
+}
+
 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -3080,19 +3086,46 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
        }
 }
 
-static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
+bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb = svm->vmcb;
 
-       if (!gif_set(svm) ||
-            (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
-               return 0;
+       if (!gif_set(svm))
+               return true;
 
-       if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK))
-               return !!(svm->vcpu.arch.hflags & HF_HIF_MASK);
-       else
-               return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
+       if (is_guest_mode(vcpu)) {
+               /* As long as interrupts are being delivered...  */
+               if ((svm->vcpu.arch.hflags & HF_VINTR_MASK)
+                   ? !(svm->vcpu.arch.hflags & HF_HIF_MASK)
+                   : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+                       return true;
+
+               /* ... vmexits aren't blocked by the interrupt shadow  */
+               if (nested_exit_on_intr(svm))
+                       return false;
+       } else {
+               if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+                       return true;
+       }
+
+       return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
+}
+
+static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       if (svm->nested.nested_run_pending)
+               return false;
+
+       /*
+        * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+        * e.g. if the IRQ arrived asynchronously after checking nested events.
+        */
+       if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
+               return false;
+
+       return !svm_interrupt_blocked(vcpu);
 }
 
 static void enable_irq_window(struct kvm_vcpu *vcpu)
@@ -3133,9 +3166,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
                return; /* STGI will cause a vm exit */
        }
 
-       if (svm->nested.exit_required)
-               return; /* we're not going to run the guest yet */
-
        /*
         * Something prevents NMI from been injected. Single step over possible
         * problem (IRET or exception injection or interrupt shadow)
@@ -3155,10 +3185,17 @@ static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
        return 0;
 }
 
-void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+void svm_flush_tlb(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
+       /*
+        * Flush only the current ASID even if the TLB flush was invoked via
+        * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
+        * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
+        * unconditionally does a TLB flush on both nested VM-Enter and nested
+        * VM-Exit (via kvm_mmu_reset_context()).
+        */
        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
        else
@@ -3278,10 +3315,21 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
        svm_complete_interrupts(svm);
 }
 
+static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+       if (!is_guest_mode(vcpu) &&
+           to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+           to_svm(vcpu)->vmcb->control.exit_info_1)
+               return handle_fastpath_set_msr_irqoff(vcpu);
+
+       return EXIT_FASTPATH_NONE;
+}
+
 void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
 
-static void svm_vcpu_run(struct kvm_vcpu *vcpu)
+static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
+       fastpath_t exit_fastpath;
        struct vcpu_svm *svm = to_svm(vcpu);
 
        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
@@ -3293,7 +3341,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
         * again.
         */
        if (unlikely(svm->nested.exit_required))
-               return;
+               return EXIT_FASTPATH_NONE;
 
        /*
         * Disable singlestep if we're injecting an interrupt/exception.
@@ -3317,6 +3365,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
        svm->vmcb->save.cr2 = vcpu->arch.cr2;
 
+       /*
+        * Run with a clean DR6 (no trap bits set) unless the guest owns the
+        * debug registers, so that the exact cause of a #DB can be determined.
+        */
+       if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+               svm_set_dr6(svm, vcpu->arch.dr6);
+       else
+               svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM);
+
        clgi();
        kvm_load_guest_xsave_state(vcpu);
 
@@ -3377,6 +3434,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
        stgi();
 
        /* Any pending NMI will happen here */
+       exit_fastpath = svm_exit_handlers_fastpath(vcpu);
 
        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                kvm_after_interrupt(&svm->vcpu);
@@ -3384,6 +3442,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
        sync_cr8_to_lapic(vcpu);
 
        svm->next_rip = 0;
+       svm->nested.nested_run_pending = 0;
 
        svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
 
@@ -3405,6 +3464,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
                svm_handle_mce(svm);
 
        mark_all_clean(svm->vmcb);
+       return exit_fastpath;
 }
 
 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
@@ -3706,13 +3766,8 @@ out:
        return ret;
 }
 
-static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu,
-       enum exit_fastpath_completion *exit_fastpath)
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
-       if (!is_guest_mode(vcpu) &&
-           to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
-           to_svm(vcpu)->vmcb->control.exit_info_1)
-               *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
 }
 
 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
@@ -3727,23 +3782,28 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
        vcpu->arch.mcg_cap &= 0x1ff;
 }
 
-static int svm_smi_allowed(struct kvm_vcpu *vcpu)
+bool svm_smi_blocked(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
        /* Per APM Vol.2 15.22.2 "Response to SMI" */
        if (!gif_set(svm))
-               return 0;
+               return true;
 
-       if (is_guest_mode(&svm->vcpu) &&
-           svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
-               /* TODO: Might need to set exit_info_1 and exit_info_2 here */
-               svm->vmcb->control.exit_code = SVM_EXIT_SMI;
-               svm->nested.exit_required = true;
-               return 0;
-       }
+       return is_smm(vcpu);
+}
 
-       return 1;
+static bool svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       if (svm->nested.nested_run_pending)
+               return false;
+
+       /* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
+       if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
+               return false;
+
+       return !svm_smi_blocked(vcpu);
 }
 
 static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
@@ -3808,6 +3868,13 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
        bool smap = cr4 & X86_CR4_SMAP;
        bool is_user = svm_get_cpl(vcpu) == 3;
 
+       /*
+        * If RIP is invalid, go ahead with emulation, which will cause an
+        * internal error exit.
+        */
+       if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
+               return true;
+
        /*
         * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
         *
@@ -3866,9 +3933,9 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
        /*
         * TODO: Last condition latch INIT signals on vCPU when
         * vCPU is in guest-mode and vmcb12 defines intercept on INIT.
-        * To properly emulate the INIT intercept, SVM should implement
-        * kvm_x86_ops.check_nested_events() and call nested_svm_vmexit()
-        * there if an INIT signal is pending.
+        * To properly emulate the INIT intercept,
+        * svm_check_nested_events() should call nested_svm_vmexit()
+        * if an INIT signal is pending.
         */
        return !gif_set(svm) ||
                   (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
@@ -3922,8 +3989,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .set_segment = svm_set_segment,
        .get_cpl = svm_get_cpl,
        .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
-       .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
-       .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
        .set_cr0 = svm_set_cr0,
        .set_cr4 = svm_set_cr4,
        .set_efer = svm_set_efer,
@@ -3931,16 +3996,16 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .set_idt = svm_set_idt,
        .get_gdt = svm_get_gdt,
        .set_gdt = svm_set_gdt,
-       .get_dr6 = svm_get_dr6,
-       .set_dr6 = svm_set_dr6,
        .set_dr7 = svm_set_dr7,
        .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
        .cache_reg = svm_cache_reg,
        .get_rflags = svm_get_rflags,
        .set_rflags = svm_set_rflags,
 
-       .tlb_flush = svm_flush_tlb,
+       .tlb_flush_all = svm_flush_tlb,
+       .tlb_flush_current = svm_flush_tlb,
        .tlb_flush_gva = svm_flush_tlb_gva,
+       .tlb_flush_guest = svm_flush_tlb,
 
        .run = svm_vcpu_run,
        .handle_exit = handle_exit,
@@ -3981,7 +4046,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 
        .has_wbinvd_exit = svm_has_wbinvd_exit,
 
-       .read_l1_tsc_offset = svm_read_l1_tsc_offset,
        .write_l1_tsc_offset = svm_write_l1_tsc_offset,
 
        .load_mmu_pgd = svm_load_mmu_pgd,
@@ -3994,6 +4058,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .sched_in = svm_sched_in,
 
        .pmu_ops = &amd_pmu_ops,
+       .nested_ops = &svm_nested_ops,
+
        .deliver_posted_interrupt = svm_deliver_avic_intr,
        .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
        .update_pi_irte = svm_update_pi_irte,
@@ -4008,14 +4074,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .mem_enc_reg_region = svm_register_enc_region,
        .mem_enc_unreg_region = svm_unregister_enc_region,
 
-       .nested_enable_evmcs = NULL,
-       .nested_get_evmcs_version = NULL,
-
        .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
 
        .apic_init_signal_blocked = svm_apic_init_signal_blocked,
-
-       .check_nested_events = svm_check_nested_events,
 };
 
 static struct kvm_x86_init_ops svm_init_ops __initdata = {
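
svm_vcpu_run() now returns a fastpath_t and handle_exit() returns 1 immediately for any completed fastpath (here, the WRMSR fastpath checked in svm_exit_handlers_fastpath()). A standalone mock of that flow; the names below are invented and the real vcpu_enter_guest() wiring in x86.c is assumed, not shown in this diff:

#include <stdio.h>

typedef enum { FASTPATH_NONE_SKETCH, FASTPATH_HANDLED_SKETCH } fastpath_sketch_t;

static fastpath_sketch_t sketch_vcpu_run(void)
{
	return FASTPATH_HANDLED_SKETCH;	/* e.g. the WRMSR fastpath fired */
}

static int sketch_handle_exit(fastpath_sketch_t fastpath)
{
	if (fastpath != FASTPATH_NONE_SKETCH)
		return 1;	/* nothing left to decode, resume the guest */
	/* ... otherwise the full exit-reason dispatch would run here ... */
	return 1;
}

int main(void)
{
	printf("%d\n", sketch_handle_exit(sketch_vcpu_run()));
	return 0;
}
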
index df3474f..5cc559a 100644 (file)
@@ -97,6 +97,10 @@ struct nested_state {
        /* A VMEXIT is required but not yet emulated */
        bool exit_required;
 
+       /* A VMRUN is in progress but has not yet completed, so
+        * we cannot inject a nested vmexit yet.  */
+       bool nested_run_pending;
+
        /* cache for intercepts of the guest */
        u32 intercept_cr;
        u32 intercept_dr;
@@ -360,8 +364,11 @@ u32 svm_msrpm_offset(u32 msr);
 void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
+void svm_flush_tlb(struct kvm_vcpu *vcpu);
 void disable_nmi_singlestep(struct vcpu_svm *svm);
+bool svm_smi_blocked(struct kvm_vcpu *vcpu);
+bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
+bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
 
 /* nested.c */
 
@@ -369,24 +376,24 @@ void disable_nmi_singlestep(struct vcpu_svm *svm);
 #define NESTED_EXIT_DONE       1       /* Exit caused nested vmexit  */
 #define NESTED_EXIT_CONTINUE   2       /* Further checks needed      */
 
-/* This function returns true if it is save to enable the nmi window */
-static inline bool nested_svm_nmi(struct vcpu_svm *svm)
+static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
 {
-       if (!is_guest_mode(&svm->vcpu))
-               return true;
-
-       if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
-               return true;
+       return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
+}
 
-       svm->vmcb->control.exit_code = SVM_EXIT_NMI;
-       svm->nested.exit_required = true;
+static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
+{
+       return (svm->nested.intercept & (1ULL << INTERCEPT_SMI));
+}
 
-       return false;
+static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
+{
+       return (svm->nested.intercept & (1ULL << INTERCEPT_INTR));
 }
 
-static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
+static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
 {
-       return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
+       return (svm->nested.intercept & (1ULL << INTERCEPT_NMI));
 }
 
 void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
@@ -398,9 +405,10 @@ int nested_svm_exit_handled(struct vcpu_svm *svm);
 int nested_svm_check_permissions(struct vcpu_svm *svm);
 int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
                               bool has_error_code, u32 error_code);
-int svm_check_nested_events(struct kvm_vcpu *vcpu);
 int nested_svm_exit_special(struct vcpu_svm *svm);
 
+extern struct kvm_x86_nested_ops svm_nested_ops;
+
 /* avic.c */
 
 #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK   (0xFF)
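
The nested_exit_on_{smi,nmi,intr}() helpers above are plain bit tests against the intercept mask L1 programmed into its VMCB. A standalone illustration of the pattern; the INTERCEPT_* bit positions are assumed from the SVM intercept-vector layout and are not defined in this diff:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed bit positions within the 64-bit intercept vector. */
enum { SKETCH_INTERCEPT_INTR = 0, SKETCH_INTERCEPT_NMI = 1, SKETCH_INTERCEPT_SMI = 2 };

static bool sketch_nested_exit_on(uint64_t intercept, int bit)
{
	return intercept & (1ULL << bit);
}

int main(void)
{
	/* L1 asked to intercept physical INTR and NMI, but not SMI. */
	uint64_t intercept = (1ULL << SKETCH_INTERCEPT_INTR) |
			     (1ULL << SKETCH_INTERCEPT_NMI);

	printf("intr=%d nmi=%d smi=%d\n",
	       sketch_nested_exit_on(intercept, SKETCH_INTERCEPT_INTR),
	       sketch_nested_exit_on(intercept, SKETCH_INTERCEPT_NMI),
	       sketch_nested_exit_on(intercept, SKETCH_INTERCEPT_SMI));
	return 0;
}
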
index 249062f..54a10c9 100644 (file)
@@ -225,6 +225,14 @@ TRACE_EVENT(kvm_apic,
 #define KVM_ISA_VMX   1
 #define KVM_ISA_SVM   2
 
+#define kvm_print_exit_reason(exit_reason, isa)                                \
+       (isa == KVM_ISA_VMX) ?                                          \
+       __print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) :      \
+       __print_symbolic(exit_reason, SVM_EXIT_REASONS),                \
+       (isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "",       \
+       (isa == KVM_ISA_VMX) ?                                          \
+       __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : ""
+
 /*
  * Tracepoint for kvm guest exit:
  */
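
kvm_print_exit_reason() treats the low 16 bits of a VMX exit reason as the basic reason and any remaining bits as modifier flags, while SVM exit codes are printed whole. A standalone illustration of that split; the example value is hypothetical:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t exit_reason = 0x80000021;		/* hypothetical VMX value        */
	uint16_t basic = exit_reason & 0xffff;		/* looked up in VMX_EXIT_REASONS */
	uint32_t flags = exit_reason & ~0xffffu;	/* printed as flag bits, if set  */

	printf("basic=%#x flags=%#x\n", basic, flags);
	return 0;
}
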
@@ -250,12 +258,10 @@ TRACE_EVENT(kvm_exit,
                                           &__entry->info2);
        ),
 
-       TP_printk("vcpu %u reason %s rip 0x%lx info %llx %llx",
+       TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx",
                  __entry->vcpu_id,
-                (__entry->isa == KVM_ISA_VMX) ?
-                __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) :
-                __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS),
-                __entry->guest_rip, __entry->info1, __entry->info2)
+                 kvm_print_exit_reason(__entry->exit_reason, __entry->isa),
+                 __entry->guest_rip, __entry->info1, __entry->info2)
 );
 
 /*
@@ -588,12 +594,10 @@ TRACE_EVENT(kvm_nested_vmexit,
                __entry->exit_int_info_err      = exit_int_info_err;
                __entry->isa                    = isa;
        ),
-       TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
+       TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx "
                  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
                  __entry->rip,
-                (__entry->isa == KVM_ISA_VMX) ?
-                __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
-                __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
+                 kvm_print_exit_reason(__entry->exit_code, __entry->isa),
                  __entry->exit_info1, __entry->exit_info2,
                  __entry->exit_int_info, __entry->exit_int_info_err)
 );
@@ -626,13 +630,11 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
                __entry->isa                    = isa;
        ),
 
-       TP_printk("reason: %s ext_inf1: 0x%016llx "
+       TP_printk("reason: %s%s%s ext_inf1: 0x%016llx "
                  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
-                (__entry->isa == KVM_ISA_VMX) ?
-                __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
-                __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
-               __entry->exit_info1, __entry->exit_info2,
-               __entry->exit_int_info, __entry->exit_int_info_err)
+                 kvm_print_exit_reason(__entry->exit_code, __entry->isa),
+                 __entry->exit_info1, __entry->exit_info2,
+                 __entry->exit_int_info, __entry->exit_int_info_err)
 );
 
 /*
index 3038134..e5325bd 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/smp.h>
 
 #include "../hyperv.h"
+#include "../cpuid.h"
 #include "evmcs.h"
 #include "vmcs.h"
 #include "vmx.h"
@@ -160,14 +161,6 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
                     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
        EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
                     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
-       EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
-                    HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
-       EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
-                    HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
-       EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
-                    HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
-       EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
-                    HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
 
        /* 32 bit rw */
        EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
@@ -334,17 +327,18 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
 
 uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       /*
-        * vmcs_version represents the range of supported Enlightened VMCS
-        * versions: lower 8 bits is the minimal version, higher 8 bits is the
-        * maximum supported version. KVM supports versions from 1 to
-        * KVM_EVMCS_VERSION.
-        */
-       if (vmx->nested.enlightened_vmcs_enabled)
-               return (KVM_EVMCS_VERSION << 8) | 1;
-
-       return 0;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       /*
+        * vmcs_version represents the range of supported Enlightened VMCS
+        * versions: the lower 8 bits hold the minimal version, the higher 8
+        * bits the maximum supported version. KVM supports versions from 1 to
+        * KVM_EVMCS_VERSION.
+        */
+       if (kvm_cpu_cap_get(X86_FEATURE_VMX) &&
+           vmx->nested.enlightened_vmcs_enabled)
+               return (KVM_EVMCS_VERSION << 8) | 1;
+
+       return 0;
 }
 
 void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
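
The returned vmcs_version packs a [min, max] range into a single 16-bit value, as the comment above describes. A tiny standalone decode of that packing; the concrete KVM_EVMCS_VERSION is not visible in this hunk, so the value 1 below is only an example:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* (KVM_EVMCS_VERSION << 8) | 1, with KVM_EVMCS_VERSION assumed to be 1 */
	uint16_t vmcs_version = (1 << 8) | 1;
	uint8_t min_ver = vmcs_version & 0xff;	/* lowest supported version  */
	uint8_t max_ver = vmcs_version >> 8;	/* highest supported version */

	printf("eVMCS versions %u..%u\n", min_ver, max_ver);
	return 0;
}
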
index e44f33c..51ebb60 100644 (file)
@@ -303,11 +303,11 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
        cpu = get_cpu();
        prev = vmx->loaded_vmcs;
        vmx->loaded_vmcs = vmcs;
-       vmx_vcpu_load_vmcs(vcpu, cpu);
+       vmx_vcpu_load_vmcs(vcpu, cpu, prev);
        vmx_sync_vmcs_host_state(vmx, prev);
        put_cpu();
 
-       vmx_segment_cache_clear(vmx);
+       vmx_register_cache_reset(vcpu);
 }
 
 /*
@@ -328,19 +328,19 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 exit_reason;
+       u32 vm_exit_reason;
        unsigned long exit_qualification = vcpu->arch.exit_qualification;
 
        if (vmx->nested.pml_full) {
-               exit_reason = EXIT_REASON_PML_FULL;
+               vm_exit_reason = EXIT_REASON_PML_FULL;
                vmx->nested.pml_full = false;
                exit_qualification &= INTR_INFO_UNBLOCK_NMI;
        } else if (fault->error_code & PFERR_RSVD_MASK)
-               exit_reason = EXIT_REASON_EPT_MISCONFIG;
+               vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
        else
-               exit_reason = EXIT_REASON_EPT_VIOLATION;
+               vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
 
-       nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
+       nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
        vmcs12->guest_physical_address = fault->address;
 }
 
@@ -698,11 +698,6 @@ static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
                VM_EXIT_ACK_INTR_ON_EXIT;
 }
 
-static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
-{
-       return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
-}
-
 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
                                          struct vmcs12 *vmcs12)
 {
@@ -927,6 +922,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
        }
        return 0;
 fail:
+       /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
        return i + 1;
 }
 
@@ -1073,6 +1069,48 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
        return (val & invalid_mask) == 0;
 }
 
+/*
+ * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
+ * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
+ * enable VPID for L2 (implying it expects a TLB flush on VMX transitions).
+ * Here's why.
+ *
+ * If EPT is enabled by L0 a sync is never needed:
+ * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there
+ *   cannot be unsync'd SPTEs for either L1 or L2.
+ *
+ * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter,
+ *   as VM-Enter isn't required to invalidate guest-physical mappings
+ *   (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush
+ *   stale guest-physical mappings for L2 from the TLB.  And as above, L0 isn't
+ *   shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit.
+ *
+ * If EPT is disabled by L0:
+ * - if VPID is enabled by L1 (for L2), the situation is similar to when L1
+ *   enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't
+ *   required to invalidate linear mappings (EPT is disabled so there are
+ *   no combined or guest-physical mappings), i.e. L1 can't rely on the
+ *   (virtual) CPU to flush stale linear mappings for either L2 or itself (L1).
+ *
+ * - however if VPID is disabled by L1, then a sync is needed as L1 expects all
+ *   linear mappings (EPT is disabled so there are no combined or guest-physical
+ *   mappings) to be invalidated on both VM-Enter and VM-Exit.
+ *
+ * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which
+ * additionally checks that L2 has been assigned a VPID (when EPT is disabled).
+ * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect
+ * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2
+ * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has
+ * been assigned a VPID), but L0 doesn't need to do an MMU sync because L1
+ * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't
+ * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush
+ * stale TLB entries, at which point L0 will sync L2's MMU.
+ */
+static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
+{
+       return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu));
+}
+
 /*
  * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
  * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
@@ -1080,28 +1118,33 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
  * @entry_failure_code.
  */
 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
-                              u32 *entry_failure_code)
+                              enum vm_entry_failure_code *entry_failure_code)
 {
-       if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
-               if (CC(!nested_cr3_valid(vcpu, cr3))) {
-                       *entry_failure_code = ENTRY_FAIL_DEFAULT;
-                       return -EINVAL;
-               }
+       if (CC(!nested_cr3_valid(vcpu, cr3))) {
+               *entry_failure_code = ENTRY_FAIL_DEFAULT;
+               return -EINVAL;
+       }
 
-               /*
-                * If PAE paging and EPT are both on, CR3 is not used by the CPU and
-                * must not be dereferenced.
-                */
-               if (is_pae_paging(vcpu) && !nested_ept) {
-                       if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
-                               *entry_failure_code = ENTRY_FAIL_PDPTE;
-                               return -EINVAL;
-                       }
+       /*
+        * If PAE paging and EPT are both on, CR3 is not used by the CPU and
+        * must not be dereferenced.
+        */
+       if (!nested_ept && is_pae_paging(vcpu) &&
+           (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
+               if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
+                       *entry_failure_code = ENTRY_FAIL_PDPTE;
+                       return -EINVAL;
                }
        }
 
+       /*
+        * Unconditionally skip the TLB flush on fast CR3 switch, all TLB
+        * flushes are handled by nested_vmx_transition_tlb_flush().  See
+        * nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
+        */
        if (!nested_ept)
-               kvm_mmu_new_cr3(vcpu, cr3, false);
+               kvm_mmu_new_pgd(vcpu, cr3, true,
+                               !nested_vmx_transition_mmu_sync(vcpu));
 
        vcpu->arch.cr3 = cr3;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
@@ -1132,11 +1175,48 @@ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
               (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
 }
 
-static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
+                                           struct vmcs12 *vmcs12,
+                                           bool is_vmenter)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
+       /*
+        * If VPID is disabled, linear and combined mappings are flushed on
+        * VM-Enter/VM-Exit, and guest-physical mappings are valid only for
+        * their associated EPTP.
+        */
+       if (!enable_vpid)
+               return;
+
+       /*
+        * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
+        * for *all* contexts to be flushed on VM-Enter/VM-Exit.
+        *
+        * If VPID is enabled and used by vmcs12, but L2 does not have a unique
+        * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
+        * a VPID for L2, flush the current context as the effective ASID is
+        * common to both L1 and L2.
+        *
+        * Defer the flush so that it runs after vmcs02.EPTP has been set by
+        * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
+        * redundant flushes further down the nested pipeline.
+        *
+        * If a TLB flush isn't required due to any of the above, and vpid12 is
+        * changing, then the new "virtual" VPID (vpid12) will reuse the same
+        * "real" VPID (vpid02), so vpid02 needs to be sync'd.  There is no direct
+        * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
+        * all nested vCPUs.
+        */
+       if (!nested_cpu_has_vpid(vmcs12)) {
+               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+       } else if (!nested_has_guest_tlb_tag(vcpu)) {
+               kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+       } else if (is_vmenter &&
+                  vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+               vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+               vpid_sync_context(nested_get_vpid02(vcpu));
+       }
 }
 
 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
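
The two comment blocks above (nested_vmx_transition_mmu_sync() and nested_vmx_transition_tlb_flush()) encode a small decision table. A compact standalone restatement, purely to make the cases explicit; the action names are invented and only the conditions mirror the helpers above:

#include <stdbool.h>
#include <stdio.h>

enum tlb_action_sketch {
	FLUSH_NONE_SKETCH,		/* rely on hardware / existing tags */
	FLUSH_ALL_CONTEXTS_SKETCH,	/* KVM_REQ_TLB_FLUSH                */
	FLUSH_CURRENT_SKETCH,		/* KVM_REQ_TLB_FLUSH_CURRENT        */
	SYNC_VPID02_SKETCH,		/* vpid_sync_context(vpid02)        */
};

static enum tlb_action_sketch transition_tlb_action(bool enable_vpid,
						    bool vmcs12_has_vpid,
						    bool l2_has_guest_tlb_tag,
						    bool is_vmenter,
						    bool vpid12_changed)
{
	if (!enable_vpid)
		return FLUSH_NONE_SKETCH;		/* hardware flushes on every transition */
	if (!vmcs12_has_vpid)
		return FLUSH_ALL_CONTEXTS_SKETCH;	/* L1 expects a full flush              */
	if (!l2_has_guest_tlb_tag)
		return FLUSH_CURRENT_SKETCH;		/* L1 and L2 share the effective ASID   */
	if (is_vmenter && vpid12_changed)
		return SYNC_VPID02_SKETCH;		/* new vpid12 reuses the same vpid02    */
	return FLUSH_NONE_SKETCH;
}

int main(void)
{
	printf("%d\n", transition_tlb_action(true, false, false, true, false));
	return 0;
}
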
@@ -1700,10 +1780,6 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
         * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
         * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
         * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
-        * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
-        * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
-        * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
-        * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
         * vmcs12->page_fault_error_code_mask =
         *              evmcs->page_fault_error_code_mask;
         * vmcs12->page_fault_error_code_match =
@@ -1777,10 +1853,6 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
         * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
         * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
         * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
-        * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
-        * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
-        * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
-        * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
         * evmcs->tpr_threshold = vmcs12->tpr_threshold;
         * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
         * evmcs->exception_bitmap = vmcs12->exception_bitmap;
@@ -2041,7 +2113,8 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
        preemption_timeout *= 1000000;
        do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
        hrtimer_start(&vmx->nested.preemption_timer,
-                     ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
+                     ktime_add_ns(ktime_get(), preemption_timeout),
+                     HRTIMER_MODE_ABS_PINNED);
 }
 
 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
@@ -2398,7 +2471,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
  * is assigned to entry_failure_code on failure.
  */
 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-                         u32 *entry_failure_code)
+                         enum vm_entry_failure_code *entry_failure_code)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
@@ -2447,32 +2520,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        if (kvm_has_tsc_control)
                decache_tsc_multiplier(vmx);
 
-       if (enable_vpid) {
-               /*
-                * There is no direct mapping between vpid02 and vpid12, the
-                * vpid02 is per-vCPU for L0 and reused while the value of
-                * vpid12 is changed w/ one invvpid during nested vmentry.
-                * The vpid12 is allocated by L1 for L2, so it will not
-                * influence global bitmap(for vpid01 and vpid02 allocation)
-                * even if spawn a lot of nested vCPUs.
-                */
-               if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
-                       if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
-                               vmx->nested.last_vpid = vmcs12->virtual_processor_id;
-                               __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
-                       }
-               } else {
-                       /*
-                        * If L1 use EPT, then L0 needs to execute INVEPT on
-                        * EPTP02 instead of EPTP01. Therefore, delay TLB
-                        * flush until vmcs02->eptp is fully updated by
-                        * KVM_REQ_LOAD_MMU_PGD. Note that this assumes
-                        * KVM_REQ_TLB_FLUSH is evaluated after
-                        * KVM_REQ_LOAD_MMU_PGD in vcpu_enter_guest().
-                        */
-                       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-               }
-       }
+       nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
 
        if (nested_cpu_has_ept(vmcs12))
                nested_ept_init_mmu_context(vcpu);
@@ -2883,11 +2931,11 @@ static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
 
 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
                                        struct vmcs12 *vmcs12,
-                                       u32 *exit_qual)
+                                       enum vm_entry_failure_code *entry_failure_code)
 {
        bool ia32e;
 
-       *exit_qual = ENTRY_FAIL_DEFAULT;
+       *entry_failure_code = ENTRY_FAIL_DEFAULT;
 
        if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
            CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
@@ -2902,7 +2950,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
                return -EINVAL;
 
        if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
-               *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
+               *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
                return -EINVAL;
        }
 
@@ -3194,9 +3242,12 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       enum vm_entry_failure_code entry_failure_code;
        bool evaluate_pending_interrupts;
-       u32 exit_reason = EXIT_REASON_INVALID_STATE;
-       u32 exit_qual;
+       u32 exit_reason, failed_index;
+
+       if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+               kvm_vcpu_flush_tlb_current(vcpu);
 
        evaluate_pending_interrupts = exec_controls_get(vmx) &
                (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
@@ -3241,24 +3292,33 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
                        return NVMX_VMENTRY_VMFAIL;
                }
 
-               if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
+               if (nested_vmx_check_guest_state(vcpu, vmcs12,
+                                                &entry_failure_code)) {
+                       exit_reason = EXIT_REASON_INVALID_STATE;
+                       vmcs12->exit_qualification = entry_failure_code;
                        goto vmentry_fail_vmexit;
+               }
        }
 
        enter_guest_mode(vcpu);
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
                vcpu->arch.tsc_offset += vmcs12->tsc_offset;
 
-       if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
+       if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) {
+               exit_reason = EXIT_REASON_INVALID_STATE;
+               vmcs12->exit_qualification = entry_failure_code;
                goto vmentry_fail_vmexit_guest_mode;
+       }
 
        if (from_vmentry) {
-               exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
-               exit_qual = nested_vmx_load_msr(vcpu,
-                                               vmcs12->vm_entry_msr_load_addr,
-                                               vmcs12->vm_entry_msr_load_count);
-               if (exit_qual)
+               failed_index = nested_vmx_load_msr(vcpu,
+                                                  vmcs12->vm_entry_msr_load_addr,
+                                                  vmcs12->vm_entry_msr_load_count);
+               if (failed_index) {
+                       exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
+                       vmcs12->exit_qualification = failed_index;
                        goto vmentry_fail_vmexit_guest_mode;
+               }
        } else {
                /*
                 * The MMU is not initialized to point at the right entities yet and
@@ -3322,7 +3382,6 @@ vmentry_fail_vmexit:
 
        load_vmcs12_host_state(vcpu, vmcs12);
        vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
-       vmcs12->exit_qualification = exit_qual;
        if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
                vmx->nested.need_vmcs12_to_shadow_sync = true;
        return NVMX_VMENTRY_VMEXIT;
@@ -3632,6 +3691,12 @@ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
                            vcpu->arch.exception.payload);
 }
 
+static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
+{
+       return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+              to_vmx(vcpu)->nested.preemption_timer_expired;
+}
+
 static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3661,11 +3726,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
        /*
         * Process any exceptions that are not debug traps before MTF.
         */
-       if (vcpu->arch.exception.pending &&
-           !vmx_pending_dbg_trap(vcpu) &&
-           nested_vmx_check_exception(vcpu, &exit_qual)) {
+       if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
                if (block_nested_events)
                        return -EBUSY;
+               if (!nested_vmx_check_exception(vcpu, &exit_qual))
+                       goto no_vmexit;
                nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
                return 0;
        }
@@ -3678,25 +3743,34 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       if (vcpu->arch.exception.pending &&
-           nested_vmx_check_exception(vcpu, &exit_qual)) {
+       if (vcpu->arch.exception.pending) {
                if (block_nested_events)
                        return -EBUSY;
+               if (!nested_vmx_check_exception(vcpu, &exit_qual))
+                       goto no_vmexit;
                nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
                return 0;
        }
 
-       if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
-           vmx->nested.preemption_timer_expired) {
+       if (nested_vmx_preemption_timer_pending(vcpu)) {
                if (block_nested_events)
                        return -EBUSY;
                nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
                return 0;
        }
 
-       if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
+       if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+               if (block_nested_events)
+                       return -EBUSY;
+               goto no_vmexit;
+       }
+
+       if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
                if (block_nested_events)
                        return -EBUSY;
+               if (!nested_exit_on_nmi(vcpu))
+                       goto no_vmexit;
+
                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
                                  NMI_VECTOR | INTR_TYPE_NMI_INTR |
                                  INTR_INFO_VALID_MASK, 0);
@@ -3709,13 +3783,16 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) {
+       if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
                if (block_nested_events)
                        return -EBUSY;
+               if (!nested_exit_on_intr(vcpu))
+                       goto no_vmexit;
                nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
                return 0;
        }
 
+no_vmexit:
        vmx_complete_nested_posted_interrupt(vcpu);
        return 0;
 }
@@ -3842,12 +3919,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
 
        cpu = get_cpu();
        vmx->loaded_vmcs = &vmx->nested.vmcs02;
-       vmx_vcpu_load(&vmx->vcpu, cpu);
+       vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
 
        sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
 
        vmx->loaded_vmcs = &vmx->vmcs01;
-       vmx_vcpu_load(&vmx->vcpu, cpu);
+       vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
        put_cpu();
 }
 
@@ -3876,10 +3953,6 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
        vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
 
-       vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
-       vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
-       vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
-
        vmcs12->guest_interruptibility_info =
                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 
@@ -3939,11 +4012,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  * which already writes to vmcs12 directly.
  */
 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-                          u32 exit_reason, u32 exit_intr_info,
+                          u32 vm_exit_reason, u32 exit_intr_info,
                           unsigned long exit_qualification)
 {
        /* update exit information fields: */
-       vmcs12->vm_exit_reason = exit_reason;
+       vmcs12->vm_exit_reason = vm_exit_reason;
        vmcs12->exit_qualification = exit_qualification;
        vmcs12->vm_exit_intr_info = exit_intr_info;
 
@@ -3998,8 +4071,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                                   struct vmcs12 *vmcs12)
 {
+       enum vm_entry_failure_code ignored;
        struct kvm_segment seg;
-       u32 entry_failure_code;
 
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
                vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -4034,30 +4107,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
         * Only PDPTE load can fail as the value of cr3 was checked on entry and
         * couldn't have changed.
         */
-       if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+       if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
 
        if (!enable_ept)
                vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
 
-       /*
-        * If vmcs01 doesn't use VPID, CPU flushes TLB on every
-        * VMEntry/VMExit. Thus, no need to flush TLB.
-        *
-        * If vmcs12 doesn't use VPID, L1 expects TLB to be
-        * flushed on every VMEntry/VMExit.
-        *
-        * Otherwise, we can preserve TLB entries as long as we are
-        * able to tag L1 TLB entries differently than L2 TLB entries.
-        *
-        * If vmcs12 uses EPT, we need to execute this flush on EPTP01
-        * and therefore we request the TLB flush to happen only after VMCS EPTP
-        * has been set by KVM_REQ_LOAD_MMU_PGD.
-        */
-       if (enable_vpid &&
-           (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-       }
+       nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
 
        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
@@ -4204,7 +4260,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
         * VMFail, like everything else we just need to ensure our
         * software model is up-to-date.
         */
-       if (enable_ept)
+       if (enable_ept && is_pae_paging(vcpu))
                ept_save_pdptrs(vcpu);
 
        kvm_mmu_reset_context(vcpu);
@@ -4272,7 +4328,7 @@ vmabort:
  * and modify vmcs12 to make it see what it would expect to see there if
  * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
  */
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
                       u32 exit_intr_info, unsigned long exit_qualification)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4281,6 +4337,10 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
+       /* Service the TLB flush request for L2 before switching to L1. */
+       if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+               kvm_vcpu_flush_tlb_current(vcpu);
+
        leave_guest_mode(vcpu);
 
        if (nested_cpu_has_preemption_timer(vmcs12))
@@ -4292,9 +4352,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        if (likely(!vmx->fail)) {
                sync_vmcs02_to_vmcs12(vcpu, vmcs12);
 
-               if (exit_reason != -1)
-                       prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
-                                      exit_qualification);
+               if (vm_exit_reason != -1)
+                       prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
+                                      exit_intr_info, exit_qualification);
 
                /*
                 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
@@ -4344,20 +4404,20 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
        vmx->nested.pi_desc = NULL;
 
-       /*
-        * We are now running in L2, mmu_notifier will force to reload the
-        * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
-        */
-       kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+       if (vmx->nested.reload_vmcs01_apic_access_page) {
+               vmx->nested.reload_vmcs01_apic_access_page = false;
+               kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+       }
 
-       if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
+       if ((vm_exit_reason != -1) &&
+           (enable_shadow_vmcs || vmx->nested.hv_evmcs))
                vmx->nested.need_vmcs12_to_shadow_sync = true;
 
        /* in case we halted in L2 */
        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
        if (likely(!vmx->fail)) {
-               if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+               if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
                    nested_exit_intr_ack_set(vcpu)) {
                        int irq = kvm_cpu_get_interrupt(vcpu);
                        WARN_ON(irq < 0);
@@ -4365,7 +4425,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                                INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
                }
 
-               if (exit_reason != -1)
+               if (vm_exit_reason != -1)
                        trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
                                                       vmcs12->exit_qualification,
                                                       vmcs12->idt_vectoring_info_field,
@@ -4554,13 +4614,13 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
        gva_t gva;
        struct x86_exception e;
 
-       if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+       if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
                                vmcs_read32(VMX_INSTRUCTION_INFO), false,
                                sizeof(*vmpointer), &gva))
                return 1;
 
        if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
-               kvm_inject_page_fault(vcpu, &e);
+               kvm_inject_emulated_page_fault(vcpu, &e);
                return 1;
        }
 
@@ -4614,7 +4674,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
                goto out_shadow_vmcs;
 
        hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
-                    HRTIMER_MODE_REL_PINNED);
+                    HRTIMER_MODE_ABS_PINNED);
        vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
 
        vmx->nested.vpid02 = allocate_vpid();
@@ -4819,7 +4879,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 {
        struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
                                                    : get_vmcs12(vcpu);
-       unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
        u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct x86_exception e;
@@ -4869,7 +4929,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                        return 1;
                /* _system ok, nested_vmx_check_permission has verified cpl=0 */
                if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) {
-                       kvm_inject_page_fault(vcpu, &e);
+                       kvm_inject_emulated_page_fault(vcpu, &e);
                        return 1;
                }
        }
@@ -4905,7 +4965,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 {
        struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
                                                    : get_vmcs12(vcpu);
-       unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
        u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct x86_exception e;
@@ -4943,7 +5003,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                                        instr_info, false, len, &gva))
                        return 1;
                if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) {
-                       kvm_inject_page_fault(vcpu, &e);
+                       kvm_inject_emulated_page_fault(vcpu, &e);
                        return 1;
                }
        }
@@ -5090,7 +5150,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 /* Emulate the VMPTRST instruction */
 static int handle_vmptrst(struct kvm_vcpu *vcpu)
 {
-       unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
+       unsigned long exit_qual = vmx_get_exit_qual(vcpu);
        u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
        gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
        struct x86_exception e;
@@ -5108,23 +5168,33 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
        /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
        if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
                                        sizeof(gpa_t), &e)) {
-               kvm_inject_page_fault(vcpu, &e);
+               kvm_inject_emulated_page_fault(vcpu, &e);
                return 1;
        }
        return nested_vmx_succeed(vcpu);
 }
 
+#define EPTP_PA_MASK   GENMASK_ULL(51, 12)
+
+static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
+{
+       return VALID_PAGE(root_hpa) &&
+               ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
+}
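
EPTP_PA_MASK is GENMASK_ULL(51, 12), i.e. bits 51:12 of the EPT pointer, which hold the physical address of the EPT root, so the helper above matches roots while ignoring the low control bits (memory type, page-walk length, accessed/dirty enable). A small userspace check of that masking; the EPTP values below are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/* Same bits as EPTP_PA_MASK above: 51:12 of the EPT pointer. */
#define PA_MASK ((((uint64_t)1 << 52) - 1) & ~(((uint64_t)1 << 12) - 1))

int main(void)
{
        /* Two hypothetical EPTPs: same root PA, different low control bits. */
        uint64_t eptp_a = 0x123456000ULL | 0x5e;        /* WB, 4-level, A/D on  */
        uint64_t eptp_b = 0x123456000ULL | 0x1e;        /* WB, 4-level, A/D off */

        printf("same root: %d\n",
               (eptp_a & PA_MASK) == (eptp_b & PA_MASK));      /* prints 1 */
        return 0;
}
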
+
 /* Emulate the INVEPT instruction */
 static int handle_invept(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 vmx_instruction_info, types;
-       unsigned long type;
+       unsigned long type, roots_to_free;
+       struct kvm_mmu *mmu;
        gva_t gva;
        struct x86_exception e;
        struct {
                u64 eptp, gpa;
        } operand;
+       int i;
 
        if (!(vmx->nested.msrs.secondary_ctls_high &
              SECONDARY_EXEC_ENABLE_EPT) ||
@@ -5148,27 +5218,49 @@ static int handle_invept(struct kvm_vcpu *vcpu)
        /* According to the Intel VMX instruction reference, the memory
         * operand is read even if it isn't needed (e.g., for type==global)
         */
-       if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+       if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
                        vmx_instruction_info, false, sizeof(operand), &gva))
                return 1;
        if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
-               kvm_inject_page_fault(vcpu, &e);
+               kvm_inject_emulated_page_fault(vcpu, &e);
                return 1;
        }
 
-       switch (type) {
-       case VMX_EPT_EXTENT_GLOBAL:
-       case VMX_EPT_EXTENT_CONTEXT:
        /*
-        * TODO: Sync the necessary shadow EPT roots here, rather than
-        * at the next emulated VM-entry.
+        * Nested EPT roots are always held through guest_mmu,
+        * not root_mmu.
         */
+       mmu = &vcpu->arch.guest_mmu;
+
+       switch (type) {
+       case VMX_EPT_EXTENT_CONTEXT:
+               if (!nested_vmx_check_eptp(vcpu, operand.eptp))
+                       return nested_vmx_failValid(vcpu,
+                               VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+               roots_to_free = 0;
+               if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
+                                           operand.eptp))
+                       roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+               for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+                       if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
+                                                   mmu->prev_roots[i].pgd,
+                                                   operand.eptp))
+                               roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+               }
+               break;
+       case VMX_EPT_EXTENT_GLOBAL:
+               roots_to_free = KVM_MMU_ROOTS_ALL;
                break;
        default:
                BUG();
                break;
        }
 
+       if (roots_to_free)
+               kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+
        return nested_vmx_succeed(vcpu);
 }
 
@@ -5208,11 +5300,11 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
        /* according to the intel vmx instruction reference, the memory
         * operand is read even if it isn't needed (e.g., for type==global)
         */
-       if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+       if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
                        vmx_instruction_info, false, sizeof(operand), &gva))
                return 1;
        if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
-               kvm_inject_page_fault(vcpu, &e);
+               kvm_inject_emulated_page_fault(vcpu, &e);
                return 1;
        }
        if (operand.vpid >> 16)
@@ -5226,27 +5318,37 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                    is_noncanonical_address(operand.gla, vcpu))
                        return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               if (cpu_has_vmx_invvpid_individual_addr()) {
-                       __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
-                               vpid02, operand.gla);
-               } else
-                       __vmx_flush_tlb(vcpu, vpid02, false);
+               vpid_sync_vcpu_addr(vpid02, operand.gla);
                break;
        case VMX_VPID_EXTENT_SINGLE_CONTEXT:
        case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
                if (!operand.vpid)
                        return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               __vmx_flush_tlb(vcpu, vpid02, false);
+               vpid_sync_context(vpid02);
                break;
        case VMX_VPID_EXTENT_ALL_CONTEXT:
-               __vmx_flush_tlb(vcpu, vpid02, false);
+               vpid_sync_context(vpid02);
                break;
        default:
                WARN_ON_ONCE(1);
                return kvm_skip_emulated_instruction(vcpu);
        }
 
+       /*
+        * Sync the shadow page tables if EPT is disabled, L1 is invalidating
+        * linear mappings for L2 (tagged with L2's VPID).  Free all roots as
+        * VPIDs are not tracked in the MMU role.
+        *
+        * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
+        * an MMU when EPT is disabled.
+        *
+        * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
+        */
+       if (!enable_ept)
+               kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu,
+                                  KVM_MMU_ROOTS_ALL);
+
        return nested_vmx_succeed(vcpu);
 }
 
@@ -5327,8 +5429,8 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
 
 fail:
        nested_vmx_vmexit(vcpu, vmx->exit_reason,
-                         vmcs_read32(VM_EXIT_INTR_INFO),
-                         vmcs_readl(EXIT_QUALIFICATION));
+                         vmx_get_intr_info(vcpu),
+                         vmx_get_exit_qual(vcpu));
        return 1;
 }
 
@@ -5379,7 +5481,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
                return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
 
-       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       exit_qualification = vmx_get_exit_qual(vcpu);
 
        port = exit_qualification >> 16;
        size = (exit_qualification & 7) + 1;
@@ -5433,7 +5535,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
        struct vmcs12 *vmcs12)
 {
-       unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
        int cr = exit_qualification & 15;
        int reg;
        unsigned long val;
@@ -5449,15 +5551,6 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
                                return true;
                        break;
                case 3:
-                       if ((vmcs12->cr3_target_count >= 1 &&
-                                       vmcs12->cr3_target_value0 == val) ||
-                               (vmcs12->cr3_target_count >= 2 &&
-                                       vmcs12->cr3_target_value1 == val) ||
-                               (vmcs12->cr3_target_count >= 3 &&
-                                       vmcs12->cr3_target_value2 == val) ||
-                               (vmcs12->cr3_target_count >= 4 &&
-                                       vmcs12->cr3_target_value3 == val))
-                               return false;
                        if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
                                return true;
                        break;
@@ -5551,49 +5644,85 @@ static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
 }
 
 /*
- * Return true if we should exit from L2 to L1 to handle an exit, or false if we
- * should handle it ourselves in L0 (and then continue L2). Only call this
- * when in is_guest_mode (L2).
+ * Return true if L0 wants to handle an exit from L2 regardless of whether or not
+ * L1 wants the exit.  Only call this when in is_guest_mode (L2).
  */
-bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
 {
-       u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
-       WARN_ON_ONCE(vmx->nested.nested_run_pending);
-
-       if (unlikely(vmx->fail)) {
-               trace_kvm_nested_vmenter_failed(
-                       "hardware VM-instruction error: ",
-                       vmcs_read32(VM_INSTRUCTION_ERROR));
-               return true;
-       }
-
-       trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
-                               vmcs_readl(EXIT_QUALIFICATION),
-                               vmx->idt_vectoring_info,
-                               intr_info,
-                               vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
-                               KVM_ISA_VMX);
+       u32 intr_info;
 
        switch (exit_reason) {
        case EXIT_REASON_EXCEPTION_NMI:
+               intr_info = vmx_get_intr_info(vcpu);
                if (is_nmi(intr_info))
-                       return false;
+                       return true;
                else if (is_page_fault(intr_info))
-                       return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
+                       return vcpu->arch.apf.host_apf_reason || !enable_ept;
                else if (is_debug(intr_info) &&
                         vcpu->guest_debug &
                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
-                       return false;
+                       return true;
                else if (is_breakpoint(intr_info) &&
                         vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
-                       return false;
+                       return true;
+               return false;
+       case EXIT_REASON_EXTERNAL_INTERRUPT:
+               return true;
+       case EXIT_REASON_MCE_DURING_VMENTRY:
+               return true;
+       case EXIT_REASON_EPT_VIOLATION:
+               /*
+                * L0 always deals with the EPT violation. If nested EPT is
+                * used, and the nested mmu code discovers that the address is
+                * missing in the guest EPT table (EPT12), the EPT violation
+                * will be injected with nested_ept_inject_page_fault()
+                */
+               return true;
+       case EXIT_REASON_EPT_MISCONFIG:
+               /*
+                * L2 never directly uses L1's EPT, but rather L0's own EPT
+                * table (shadow on EPT) or a merged EPT table that L0 built
+                * (EPT on EPT). So any problem with the structure of the
+                * table is L0's fault.
+                */
+               return true;
+       case EXIT_REASON_PREEMPTION_TIMER:
+               return true;
+       case EXIT_REASON_PML_FULL:
+               /* We emulate PML support to L1. */
+               return true;
+       case EXIT_REASON_VMFUNC:
+               /* VM functions are emulated through L2->L0 vmexits. */
+               return true;
+       case EXIT_REASON_ENCLS:
+               /* SGX is never exposed to L1 */
+               return true;
+       default:
+               break;
+       }
+       return false;
+}
+
+/*
+ * Return true if L1 wants to intercept an exit from L2.  Only call this when in
+ * is_guest_mode (L2).
+ */
+static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       u32 intr_info;
+
+       switch (exit_reason) {
+       case EXIT_REASON_EXCEPTION_NMI:
+               intr_info = vmx_get_intr_info(vcpu);
+               if (is_nmi(intr_info))
+                       return true;
+               else if (is_page_fault(intr_info))
+                       return true;
                return vmcs12->exception_bitmap &
                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
        case EXIT_REASON_EXTERNAL_INTERRUPT:
-               return false;
+               return nested_exit_on_intr(vcpu);
        case EXIT_REASON_TRIPLE_FAULT:
                return true;
        case EXIT_REASON_INTERRUPT_WINDOW:
@@ -5658,7 +5787,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
                        nested_cpu_has2(vmcs12,
                                SECONDARY_EXEC_PAUSE_LOOP_EXITING);
        case EXIT_REASON_MCE_DURING_VMENTRY:
-               return false;
+               return true;
        case EXIT_REASON_TPR_BELOW_THRESHOLD:
                return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
        case EXIT_REASON_APIC_ACCESS:
@@ -5670,22 +5799,6 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
                 * delivery" only come from vmcs12.
                 */
                return true;
-       case EXIT_REASON_EPT_VIOLATION:
-               /*
-                * L0 always deals with the EPT violation. If nested EPT is
-                * used, and the nested mmu code discovers that the address is
-                * missing in the guest EPT table (EPT12), the EPT violation
-                * will be injected with nested_ept_inject_page_fault()
-                */
-               return false;
-       case EXIT_REASON_EPT_MISCONFIG:
-               /*
-                * L2 never uses directly L1's EPT, but rather L0's own EPT
-                * table (shadow on EPT) or a merged EPT table that L0 built
-                * (EPT on EPT). So any problems with the structure of the
-                * table is L0's fault.
-                */
-               return false;
        case EXIT_REASON_INVPCID:
                return
                        nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
@@ -5702,17 +5815,6 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
                 * the XSS exit bitmap in vmcs12.
                 */
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
-       case EXIT_REASON_PREEMPTION_TIMER:
-               return false;
-       case EXIT_REASON_PML_FULL:
-               /* We emulate PML support to L1. */
-               return false;
-       case EXIT_REASON_VMFUNC:
-               /* VM functions are emulated through L2->L0 vmexits. */
-               return false;
-       case EXIT_REASON_ENCLS:
-               /* SGX is never exposed to L1 */
-               return false;
        case EXIT_REASON_UMWAIT:
        case EXIT_REASON_TPAUSE:
                return nested_cpu_has2(vmcs12,
@@ -5722,6 +5824,67 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
        }
 }
 
+/*
+ * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
+ * reflected into L1.
+ */
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 exit_reason = vmx->exit_reason;
+       unsigned long exit_qual;
+       u32 exit_intr_info;
+
+       WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
+       /*
+        * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
+        * has already loaded L2's state.
+        */
+       if (unlikely(vmx->fail)) {
+               trace_kvm_nested_vmenter_failed(
+                       "hardware VM-instruction error: ",
+                       vmcs_read32(VM_INSTRUCTION_ERROR));
+               exit_intr_info = 0;
+               exit_qual = 0;
+               goto reflect_vmexit;
+       }
+
+       exit_intr_info = vmx_get_intr_info(vcpu);
+       exit_qual = vmx_get_exit_qual(vcpu);
+
+       trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual,
+                               vmx->idt_vectoring_info, exit_intr_info,
+                               vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+                               KVM_ISA_VMX);
+
+       /* If L0 (KVM) wants the exit, it trumps L1's desires. */
+       if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
+               return false;
+
+       /* If L1 doesn't want the exit, handle it in L0. */
+       if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
+               return false;
+
+       /*
+        * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits.  For
+        * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
+        * need to be synthesized by querying the in-kernel LAPIC, but external
+        * interrupts are never reflected to L1 so it's a non-issue.
+        */
+       if ((exit_intr_info &
+            (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+           (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+               vmcs12->vm_exit_intr_error_code =
+                       vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+       }
+
+reflect_vmexit:
+       nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
+       return true;
+}
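
Note the ordering above: L0's claim on the exit is evaluated before L1's, so exits KVM must service itself (EPT violations and misconfigs, the preemption timer, machine checks during VM-entry, and so on) never reach L1 even if L1's controls would request them. A tiny standalone model of that precedence, purely illustrative and not kernel code:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model of the reflection decision: L0 is consulted first. */
static bool reflect_to_l1(bool l0_wants, bool l1_wants)
{
        if (l0_wants)
                return false;   /* L0 (KVM) handles it; L1 never sees it */
        if (!l1_wants)
                return false;   /* nobody intercepts it; handled in L0   */
        return true;            /* forward the exit to L1                */
}

int main(void)
{
        printf("%d %d %d\n",
               reflect_to_l1(true, true),       /* 0: L0 trumps L1 */
               reflect_to_l1(false, false),     /* 0               */
               reflect_to_l1(false, true));     /* 1               */
        return 0;
}
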
 
 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
                                struct kvm_nested_state __user *user_kvm_nested_state,
@@ -5844,7 +6007,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12;
-       u32 exit_qual;
+       enum vm_entry_failure_code ignored;
        struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
                &user_kvm_nested_state->data.vmx[0];
        int ret;
@@ -5985,7 +6148,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 
        if (nested_vmx_check_controls(vcpu, vmcs12) ||
            nested_vmx_check_host_state(vcpu, vmcs12) ||
-           nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
+           nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
                goto error_guest_mode;
 
        vmx->nested.dirty_vmcs12 = true;
@@ -6031,7 +6194,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
         * reason is that if one of these bits is necessary, it will appear
         * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
         * fields of vmcs01 and vmcs02, will turn these bits off - and
-        * nested_vmx_exit_reflected() will not pass related exits to L1.
+        * nested_vmx_l1_wants_exit() will not pass related exits to L1.
         * These rules have exceptions below.
         */
 
@@ -6259,8 +6422,7 @@ void nested_vmx_hardware_unsetup(void)
        }
 }
 
-__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops,
-                                    int (*exit_handlers[])(struct kvm_vcpu *))
+__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 {
        int i;
 
@@ -6296,12 +6458,15 @@ __init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops,
        exit_handlers[EXIT_REASON_INVVPID]      = handle_invvpid;
        exit_handlers[EXIT_REASON_VMFUNC]       = handle_vmfunc;
 
-       ops->check_nested_events = vmx_check_nested_events;
-       ops->get_nested_state = vmx_get_nested_state;
-       ops->set_nested_state = vmx_set_nested_state;
-       ops->get_vmcs12_pages = nested_get_vmcs12_pages;
-       ops->nested_enable_evmcs = nested_enable_evmcs;
-       ops->nested_get_evmcs_version = nested_get_evmcs_version;
-
        return 0;
 }
+
+struct kvm_x86_nested_ops vmx_nested_ops = {
+       .check_events = vmx_check_nested_events,
+       .hv_timer_pending = nested_vmx_preemption_timer_pending,
+       .get_state = vmx_get_nested_state,
+       .set_state = vmx_set_nested_state,
+       .get_vmcs12_pages = nested_get_vmcs12_pages,
+       .enable_evmcs = nested_enable_evmcs,
+       .get_evmcs_version = nested_get_evmcs_version,
+};
index ac56aef..758bccc 100644 (file)
@@ -19,14 +19,13 @@ enum nvmx_vmentry_status {
 void vmx_leave_nested(struct kvm_vcpu *vcpu);
 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps);
 void nested_vmx_hardware_unsetup(void);
-__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops,
-                                    int (*exit_handlers[])(struct kvm_vcpu *));
+__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
 void nested_vmx_set_vmcs_shadowing_bitmap(void);
 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
                                                     bool from_vmentry);
-bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu);
+void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
                       u32 exit_intr_info, unsigned long exit_qualification);
 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
@@ -62,6 +61,13 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
                vmx->nested.hv_evmcs;
 }
 
+static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
+}
+
 static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu)
 {
        /* return the page table to be shadowed - in our case, EPT12 */
@@ -73,34 +79,6 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
        return nested_ept_get_eptp(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
 }
 
-/*
- * Reflect a VM Exit into L1.
- */
-static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
-                                           u32 exit_reason)
-{
-       u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-       /*
-        * At this point, the exit interruption info in exit_intr_info
-        * is only valid for EXCEPTION_NMI exits.  For EXTERNAL_INTERRUPT
-        * we need to query the in-kernel LAPIC.
-        */
-       WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
-       if ((exit_intr_info &
-            (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
-           (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
-               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
-               vmcs12->vm_exit_intr_error_code =
-                       vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
-       }
-
-       nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
-                         vmcs_readl(EXIT_QUALIFICATION));
-       return 1;
-}
-
 /*
  * Return the cr0 value that a nested guest would read. This is a combination
  * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
@@ -246,6 +224,11 @@ static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12)
            VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
 }
 
+static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
+{
+       return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
+}
+
 /*
  * In nested virtualization, check if L1 asked to exit on external interrupts.
  * For most existing hypervisors, this will always return true.
@@ -299,4 +282,6 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
 #define nested_guest_cr4_valid nested_cr4_valid
 #define nested_host_cr4_valid  nested_cr4_valid
 
+extern struct kvm_x86_nested_ops vmx_nested_ops;
+
 #endif /* __KVM_X86_VMX_NESTED_H */
index 19717d0..5f1ac00 100644 (file)
@@ -268,42 +268,38 @@ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
        vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
 }
 
-static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
-{
-       if (vpid == 0)
-               return true;
-
-       if (cpu_has_vmx_invvpid_individual_addr()) {
-               __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
-               return true;
-       }
-
-       return false;
-}
-
 static inline void vpid_sync_vcpu_single(int vpid)
 {
        if (vpid == 0)
                return;
 
-       if (cpu_has_vmx_invvpid_single())
-               __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
+       __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
 }
 
 static inline void vpid_sync_vcpu_global(void)
 {
-       if (cpu_has_vmx_invvpid_global())
-               __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+       __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
 }
 
 static inline void vpid_sync_context(int vpid)
 {
        if (cpu_has_vmx_invvpid_single())
                vpid_sync_vcpu_single(vpid);
-       else
+       else if (vpid != 0)
                vpid_sync_vcpu_global();
 }
 
+static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr)
+{
+       if (vpid == 0)
+               return;
+
+       if (cpu_has_vmx_invvpid_individual_addr())
+               __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
+       else
+               vpid_sync_context(vpid);
+}
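
The reworked helpers form a fallback chain: individual-address INVVPID when the CPU supports it, else single-context, else a global flush, and no flush at all for vpid 0, whose mappings are already flushed on every VM-Enter/VM-Exit. A standalone sketch of which extent ends up being used; the boolean capability flags stand in for the kernel's cpu_has_vmx_invvpid_*() checks:

#include <stdbool.h>
#include <stdio.h>

enum extent { NONE, INDIVIDUAL_ADDR, SINGLE_CONTEXT, ALL_CONTEXT };

/* Hypothetical model of vpid_sync_vcpu_addr() plus vpid_sync_context(). */
static enum extent sync_addr(int vpid, bool has_individual, bool has_single)
{
        if (vpid == 0)
                return NONE;            /* vpid 0 is flushed by VM-Enter/Exit */
        if (has_individual)
                return INDIVIDUAL_ADDR;
        if (has_single)
                return SINGLE_CONTEXT;
        return ALL_CONTEXT;             /* vpid_sync_vcpu_global() fallback */
}

int main(void)
{
        printf("%d %d %d\n",
               sync_addr(5, true, true),        /* 1: individual address */
               sync_addr(5, false, true),       /* 2: single context     */
               sync_addr(5, false, false));     /* 3: all contexts       */
        return 0;
}
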
+
 static inline void ept_sync_global(void)
 {
        __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
index 53dfb40..c8e51c0 100644 (file)
@@ -115,10 +115,6 @@ const unsigned short vmcs_field_to_offset_table[] = {
        FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
        FIELD(CR0_READ_SHADOW, cr0_read_shadow),
        FIELD(CR4_READ_SHADOW, cr4_read_shadow),
-       FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
-       FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
-       FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
-       FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
        FIELD(EXIT_QUALIFICATION, exit_qualification),
        FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
        FIELD(GUEST_CR0, guest_cr0),
index d0c6df3..80232da 100644 (file)
@@ -80,10 +80,7 @@ struct __packed vmcs12 {
        natural_width cr4_guest_host_mask;
        natural_width cr0_read_shadow;
        natural_width cr4_read_shadow;
-       natural_width cr3_target_value0;
-       natural_width cr3_target_value1;
-       natural_width cr3_target_value2;
-       natural_width cr3_target_value3;
+       natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. */
        natural_width exit_qualification;
        natural_width guest_linear_address;
        natural_width guest_cr0;
@@ -263,10 +260,7 @@ static inline void vmx_check_vmcs12_offsets(void)
        CHECK_OFFSET(cr4_guest_host_mask, 352);
        CHECK_OFFSET(cr0_read_shadow, 360);
        CHECK_OFFSET(cr4_read_shadow, 368);
-       CHECK_OFFSET(cr3_target_value0, 376);
-       CHECK_OFFSET(cr3_target_value1, 384);
-       CHECK_OFFSET(cr3_target_value2, 392);
-       CHECK_OFFSET(cr3_target_value3, 400);
+       CHECK_OFFSET(dead_space, 376);
        CHECK_OFFSET(exit_qualification, 408);
        CHECK_OFFSET(guest_linear_address, 416);
        CHECK_OFFSET(guest_cr0, 424);
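
Replacing the four cr3_target_value fields with dead_space[4] keeps the vmcs12 layout stable, which matters because the structure is copied to and from userspace: assuming natural_width is 64 bits wide as vmcs12.h defines it, the retired fields still span 376 + 4 * 8 = 408, exactly where the unchanged CHECK_OFFSET(exit_qualification, 408) expects the next field. A quick standalone check of that arithmetic under the same assumptions:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal stand-in for the relevant slice of the packed struct vmcs12. */
typedef uint64_t natural_width;

struct __attribute__((packed)) vmcs12_slice {
        char            pad[376];               /* fields up to offset 376 */
        natural_width   dead_space[4];          /* old cr3_target_value0-3 */
        natural_width   exit_qualification;     /* must stay at offset 408 */
};

int main(void)
{
        printf("dead_space at %zu, exit_qualification at %zu\n",
               offsetof(struct vmcs12_slice, dead_space),          /* 376 */
               offsetof(struct vmcs12_slice, exit_qualification)); /* 408 */
        return 0;
}
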
index 51d1a82..e0a182c 100644 (file)
@@ -166,13 +166,13 @@ SYM_FUNC_START(__vmx_vcpu_run)
        mov WORD_SIZE(%_ASM_SP), %_ASM_AX
 
        /* Save all guest registers, including RAX from the stack */
-       __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX)
-       mov %_ASM_CX,   VCPU_RCX(%_ASM_AX)
-       mov %_ASM_DX,   VCPU_RDX(%_ASM_AX)
-       mov %_ASM_BX,   VCPU_RBX(%_ASM_AX)
-       mov %_ASM_BP,   VCPU_RBP(%_ASM_AX)
-       mov %_ASM_SI,   VCPU_RSI(%_ASM_AX)
-       mov %_ASM_DI,   VCPU_RDI(%_ASM_AX)
+       pop           VCPU_RAX(%_ASM_AX)
+       mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
+       mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+       mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+       mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+       mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
+       mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
 #ifdef CONFIG_X86_64
        mov %r8,  VCPU_R8 (%_ASM_AX)
        mov %r9,  VCPU_R9 (%_ASM_AX)
index c2c6335..6a03c27 100644 (file)
@@ -437,6 +437,11 @@ static const struct kvm_vmx_segment_field {
        VMX_SEGMENT_FIELD(LDTR),
 };
 
+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+       vmx->segment_cache.bitmask = 0;
+}
+
 static unsigned long host_idt_base;
 
 /*
@@ -1306,10 +1311,12 @@ after_clear_sn:
                pi_set_on(pi_desc);
 }
 
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+                       struct loaded_vmcs *buddy)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
+       struct vmcs *prev;
 
        if (!already_loaded) {
                loaded_vmcs_clear(vmx->loaded_vmcs);
@@ -1328,16 +1335,28 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
                local_irq_enable();
        }
 
-       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+       prev = per_cpu(current_vmcs, cpu);
+       if (prev != vmx->loaded_vmcs->vmcs) {
                per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
                vmcs_load(vmx->loaded_vmcs->vmcs);
-               indirect_branch_prediction_barrier();
+
+               /*
+                * No indirect branch prediction barrier needed when switching
+                * the active VMCS within a guest, e.g. on nested VM-Enter.
+                * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
+                */
+               if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
+                       indirect_branch_prediction_barrier();
        }
 
        if (!already_loaded) {
                void *gdt = get_current_gdt_ro();
                unsigned long sysenter_esp;
 
+               /*
+                * Flush all EPTP/VPID contexts, the new pCPU may have stale
+                * TLB entries from its previous association with the vCPU.
+                */
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 
                /*
@@ -1364,15 +1383,14 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
  */
-void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       vmx_vcpu_load_vmcs(vcpu, cpu);
+       vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
 
        vmx_vcpu_pi_load(vcpu, cpu);
 
-       vmx->host_pkru = read_pkru();
        vmx->host_debugctlmsr = get_debugctlmsr();
 }
 
@@ -1547,7 +1565,7 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
 
 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
-       unsigned long rip;
+       unsigned long rip, orig_rip;
 
        /*
         * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
@@ -1559,8 +1577,17 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
         */
        if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
            to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
-               rip = kvm_rip_read(vcpu);
-               rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+               orig_rip = kvm_rip_read(vcpu);
+               rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+#ifdef CONFIG_X86_64
+               /*
+                * We need to mask out the high 32 bits of RIP if not in 64-bit
+                * mode, but just finding out that we are in 64-bit mode is
+                * quite expensive.  Only do it if there was a carry.
+                */
+               if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
+                       rip = (u32)rip;
+#endif
                kvm_rip_write(vcpu, rip);
        } else {
                if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
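
The carry test above is compact but worth unpacking: outside 64-bit mode the old RIP fits in 32 bits, so adding the instruction length can only disturb the upper bits by carrying out of bit 31, which clears bit 31 and sets bit 32; bits 31 and 32 of rip ^ orig_rip are then both set and the shifted XOR is exactly 3. The test can also fire for a genuine 64-bit RIP crossing a 4 GiB boundary, which is why is_64_bit_mode() is still checked before truncating. A small userspace check with made-up values:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the test above: did rip = orig_rip + len carry out of bit 31? */
static int carried_past_bit31(uint64_t orig_rip, uint64_t rip)
{
        return ((rip ^ orig_rip) >> 31) == 3;
}

int main(void)
{
        /* Hypothetical 32-bit guest: RIP near the 4 GiB boundary, +16 bytes. */
        uint64_t orig_rip = 0xfffffff8ULL;
        uint64_t rip = orig_rip + 0x10;         /* 0x100000008 */

        printf("carry: %d, truncated rip: 0x%x\n",
               carried_past_bit31(orig_rip, rip),      /* 1   */
               (unsigned)(uint32_t)rip);               /* 0x8 */
        return 0;
}
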
@@ -1713,17 +1740,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
                vmx_update_msr_bitmap(&vmx->vcpu);
 }
 
-static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
-{
-       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
-       if (is_guest_mode(vcpu) &&
-           (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
-               return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
-
-       return vcpu->arch.tsc_offset;
-}
-
 static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -1927,6 +1943,16 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        return 0;
 }
 
+static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
+                                                   u64 data)
+{
+#ifdef CONFIG_X86_64
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+               return (u32)data;
+#endif
+       return (unsigned long)data;
+}
+
 /*
  * Writes msr value into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -1964,13 +1990,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vmcs_write32(GUEST_SYSENTER_CS, data);
                break;
        case MSR_IA32_SYSENTER_EIP:
-               if (is_guest_mode(vcpu))
+               if (is_guest_mode(vcpu)) {
+                       data = nested_vmx_truncate_sysenter_addr(vcpu, data);
                        get_vmcs12(vcpu)->guest_sysenter_eip = data;
+               }
                vmcs_writel(GUEST_SYSENTER_EIP, data);
                break;
        case MSR_IA32_SYSENTER_ESP:
-               if (is_guest_mode(vcpu))
+               if (is_guest_mode(vcpu)) {
+                       data = nested_vmx_truncate_sysenter_addr(vcpu, data);
                        get_vmcs12(vcpu)->guest_sysenter_esp = data;
+               }
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
        case MSR_IA32_DEBUGCTLMSR:
@@ -2188,6 +2218,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 {
+       unsigned long guest_owned_bits;
+
        kvm_register_mark_available(vcpu, reg);
 
        switch (reg) {
@@ -2201,10 +2233,22 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
                if (enable_ept)
                        ept_save_pdptrs(vcpu);
                break;
+       case VCPU_EXREG_CR0:
+               guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+
+               vcpu->arch.cr0 &= ~guest_owned_bits;
+               vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
+               break;
        case VCPU_EXREG_CR3:
                if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
                        vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
                break;
+       case VCPU_EXREG_CR4:
+               guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+
+               vcpu->arch.cr4 &= ~guest_owned_bits;
+               vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
+               break;
        default:
                WARN_ON_ONCE(1);
                break;
@@ -2838,34 +2882,64 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 #endif
 
-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
 {
-       int vpid = to_vmx(vcpu)->vpid;
-
-       if (!vpid_sync_vcpu_addr(vpid, addr))
-               vpid_sync_context(vpid);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        /*
-        * If VPIDs are not supported or enabled, then the above is a no-op.
-        * But we don't really need a TLB flush in that case anyway, because
-        * each VM entry/exit includes an implicit flush when VPID is 0.
+        * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
+        * the CPU is not required to invalidate guest-physical mappings on
+        * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
+        * associated with the root EPT structure and not any particular VPID
+        * (INVVPID also isn't required to invalidate guest-physical mappings).
         */
+       if (enable_ept) {
+               ept_sync_global();
+       } else if (enable_vpid) {
+               if (cpu_has_vmx_invvpid_global()) {
+                       vpid_sync_vcpu_global();
+               } else {
+                       vpid_sync_vcpu_single(vmx->vpid);
+                       vpid_sync_vcpu_single(vmx->nested.vpid02);
+               }
+       }
 }
 
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
 {
-       ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+       u64 root_hpa = vcpu->arch.mmu->root_hpa;
+
+       /* No flush required if the current context is invalid. */
+       if (!VALID_PAGE(root_hpa))
+               return;
 
-       vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
-       vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
+       if (enable_ept)
+               ept_sync_context(construct_eptp(vcpu, root_hpa));
+       else if (!is_guest_mode(vcpu))
+               vpid_sync_context(to_vmx(vcpu)->vpid);
+       else
+               vpid_sync_context(nested_get_vpid02(vcpu));
 }
 
-static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
 {
-       ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+       /*
+        * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
+        * vmx_flush_tlb_guest() for an explanation of why this is ok.
+        */
+       vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
+}
 
-       vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
-       vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
+static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+       /*
+        * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
+        * or a vpid couldn't be allocated for this vCPU.  VM-Enter and VM-Exit
+        * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+        * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
+        * i.e. no explicit INVVPID is necessary.
+        */
+       vpid_sync_context(to_vmx(vcpu)->vpid);
 }
 
 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -2887,12 +2961,13 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
 
-       if (is_pae_paging(vcpu)) {
-               mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
-               mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
-               mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
-               mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
-       }
+       if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
+               return;
+
+       mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+       mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+       mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+       mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
 
        kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
 }
@@ -2956,20 +3031,27 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        vmcs_writel(CR0_READ_SHADOW, cr0);
        vmcs_writel(GUEST_CR0, hw_cr0);
        vcpu->arch.cr0 = cr0;
+       kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
 
        /* depends on vcpu->arch.cr0 to be set to a new value */
        vmx->emulation_required = emulation_required(vcpu);
 }
 
-static int get_ept_level(struct kvm_vcpu *vcpu)
+static int vmx_get_tdp_level(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
-               return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
        if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
                return 5;
        return 4;
 }
 
+static int get_ept_level(struct kvm_vcpu *vcpu)
+{
+       if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
+               return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
+
+       return vmx_get_tdp_level(vcpu);
+}
+
 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
 {
        u64 eptp = VMX_EPTP_MT_WB;
@@ -2984,16 +3066,15 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
        return eptp;
 }
 
-void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd)
 {
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
        unsigned long guest_cr3;
        u64 eptp;
 
-       guest_cr3 = cr3;
        if (enable_ept) {
-               eptp = construct_eptp(vcpu, cr3);
+               eptp = construct_eptp(vcpu, pgd);
                vmcs_write64(EPT_POINTER, eptp);
 
                if (kvm_x86_ops.tlb_remote_flush) {
@@ -3014,6 +3095,8 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
                else /* vmcs01.GUEST_CR3 is already up-to-date. */
                        update_guest_cr3 = false;
                ept_load_pdptrs(vcpu);
+       } else {
+               guest_cr3 = pgd;
        }
 
        if (update_guest_cr3)
@@ -3064,6 +3147,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                return 1;
 
        vcpu->arch.cr4 = cr4;
+       kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
 
        if (!enable_unrestricted_guest) {
                if (enable_ept) {
@@ -3852,7 +3936,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
        if (pi_test_and_set_on(&vmx->pi_desc))
                return 0;
 
-       if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
+       if (vcpu != kvm_get_running_vcpu() &&
+           !kvm_vcpu_trigger_posted_interrupt(vcpu, false))
                kvm_vcpu_kick(vcpu);
 
        return 0;
@@ -4454,31 +4539,54 @@ void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
        }
 }
 
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
+bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
 {
-       if (to_vmx(vcpu)->nested.nested_run_pending)
-               return 0;
+       if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
+               return false;
 
-       if (!enable_vnmi &&
-           to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
-               return 0;
+       if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
+               return true;
 
-       return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-                 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
-                  | GUEST_INTR_STATE_NMI));
+       return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+               (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
+                GUEST_INTR_STATE_NMI));
 }
 
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
+static bool vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
        if (to_vmx(vcpu)->nested.nested_run_pending)
                return false;
 
+       /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
+       if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
+               return false;
+
+       return !vmx_nmi_blocked(vcpu);
+}
+
+bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
        if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
-               return true;
+               return false;
+
+       return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
+              (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+               (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
 
-       return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-               !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-                       (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+static bool vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+       if (to_vmx(vcpu)->nested.nested_run_pending)
+               return false;
+
+       /*
+        * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+        * e.g. if the IRQ arrived asynchronously after checking nested events.
+        */
+       if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+               return false;
+
+       return !vmx_interrupt_blocked(vcpu);
 }
 
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -4617,7 +4725,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
        u32 vect_info;
 
        vect_info = vmx->idt_vectoring_info;
-       intr_info = vmx->exit_intr_info;
+       intr_info = vmx_get_intr_info(vcpu);
 
        if (is_machine_check(intr_info) || is_nmi(intr_info))
                return 1; /* handled by handle_exception_nmi_irqoff() */
@@ -4661,7 +4769,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
        }
 
        if (is_page_fault(intr_info)) {
-               cr2 = vmcs_readl(EXIT_QUALIFICATION);
+               cr2 = vmx_get_exit_qual(vcpu);
                /* EPT won't cause page fault directly */
                WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
                return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
@@ -4674,18 +4782,16 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 
        switch (ex_no) {
        case DB_VECTOR:
-               dr6 = vmcs_readl(EXIT_QUALIFICATION);
+               dr6 = vmx_get_exit_qual(vcpu);
                if (!(vcpu->guest_debug &
                      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
-                       vcpu->arch.dr6 &= ~DR_TRAP_BITS;
-                       vcpu->arch.dr6 |= dr6 | DR6_RTM;
                        if (is_icebp(intr_info))
                                WARN_ON(!skip_emulated_instruction(vcpu));
 
-                       kvm_queue_exception(vcpu, DB_VECTOR);
+                       kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
                        return 1;
                }
-               kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+               kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
                kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
                /* fall through */
        case BP_VECTOR:
@@ -4743,7 +4849,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
        int size, in, string;
        unsigned port;
 
-       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       exit_qualification = vmx_get_exit_qual(vcpu);
        string = (exit_qualification & 16) != 0;
 
        ++vcpu->stat.io_exits;
@@ -4834,7 +4940,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
        int err;
        int ret;
 
-       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       exit_qualification = vmx_get_exit_qual(vcpu);
        cr = exit_qualification & 15;
        reg = (exit_qualification >> 8) & 15;
        switch ((exit_qualification >> 4) & 3) {
@@ -4911,7 +5017,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
        unsigned long exit_qualification;
        int dr, dr7, reg;
 
-       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       exit_qualification = vmx_get_exit_qual(vcpu);
        dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
 
        /* First, if DR does not exist, trigger UD */
@@ -4929,16 +5035,14 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                 * guest debugging itself.
                 */
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
-                       vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
+                       vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1;
                        vcpu->run->debug.arch.dr7 = dr7;
                        vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
                        vcpu->run->debug.arch.exception = DB_VECTOR;
                        vcpu->run->exit_reason = KVM_EXIT_DEBUG;
                        return 0;
                } else {
-                       vcpu->arch.dr6 &= ~DR_TRAP_BITS;
-                       vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
-                       kvm_queue_exception(vcpu, DB_VECTOR);
+                       kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
                        return 1;
                }
        }
@@ -4969,15 +5073,6 @@ static int handle_dr(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
-static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.dr6;
-}
-
-static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
-{
-}
-
 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 {
        get_debugreg(vcpu->arch.db[0], 0);
@@ -5024,7 +5119,7 @@ static int handle_invd(struct kvm_vcpu *vcpu)
 
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
-       unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
 
        kvm_mmu_invlpg(vcpu, exit_qualification);
        return kvm_skip_emulated_instruction(vcpu);
@@ -5056,7 +5151,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
        if (likely(fasteoi)) {
-               unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+               unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
                int access_type, offset;
 
                access_type = exit_qualification & APIC_ACCESS_TYPE;
@@ -5077,7 +5172,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
 
 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
 {
-       unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
        int vector = exit_qualification & 0xff;
 
        /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
@@ -5087,7 +5182,7 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
 
 static int handle_apic_write(struct kvm_vcpu *vcpu)
 {
-       unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
        u32 offset = exit_qualification & 0xfff;
 
        /* APIC-write VM exit is trap-like and thus no need to adjust IP */
@@ -5108,7 +5203,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
        idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
        type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
 
-       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       exit_qualification = vmx_get_exit_qual(vcpu);
 
        reason = (u32)exit_qualification >> 30;
        if (reason == TASK_SWITCH_GATE && idt_v) {
@@ -5158,7 +5253,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
        gpa_t gpa;
        u64 error_code;
 
-       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       exit_qualification = vmx_get_exit_qual(vcpu);
 
        /*
         * EPT violation happened while executing iret from NMI,
@@ -5230,18 +5325,11 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
        bool intr_window_requested;
        unsigned count = 130;
 
-       /*
-        * We should never reach the point where we are emulating L2
-        * due to invalid guest state as that means we incorrectly
-        * allowed a nested VMEntry with an invalid vmcs12.
-        */
-       WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
-
        intr_window_requested = exec_controls_get(vmx) &
                                CPU_BASED_INTR_WINDOW_EXITING;
 
        while (vmx->emulation_required && count-- != 0) {
-               if (intr_window_requested && vmx_interrupt_allowed(vcpu))
+               if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
                        return handle_interrupt_window(&vmx->vcpu);
 
                if (kvm_test_request(KVM_REQ_EVENT, vcpu))
@@ -5418,13 +5506,13 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
        /* According to the Intel instruction reference, the memory operand
         * is read even if it isn't needed (e.g., for type==all)
         */
-       if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+       if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
                                vmx_instruction_info, false,
                                sizeof(operand), &gva))
                return 1;
 
        if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
-               kvm_inject_page_fault(vcpu, &e);
+               kvm_inject_emulated_page_fault(vcpu, &e);
                return 1;
        }
 
@@ -5453,11 +5541,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
 
                if (kvm_get_active_pcid(vcpu) == operand.pcid) {
                        kvm_mmu_sync_roots(vcpu);
-                       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
                }
 
                for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
+                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
                            == operand.pcid)
                                roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 
@@ -5494,7 +5582,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
 
        trace_kvm_pml_full(vcpu->vcpu_id);
 
-       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       exit_qualification = vmx_get_exit_qual(vcpu);
 
        /*
         * PML buffer FULL happened while executing iret from NMI,
@@ -5513,14 +5601,22 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        if (!vmx->req_immediate_exit &&
-           !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
+           !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
                kvm_lapic_expired_hv_timer(vcpu);
+               return EXIT_FASTPATH_REENTER_GUEST;
+       }
+
+       return EXIT_FASTPATH_NONE;
+}
 
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+       handle_fastpath_preemption_timer(vcpu);
        return 1;
 }
 
@@ -5608,8 +5704,8 @@ static const int kvm_vmx_max_exit_handlers =
 
 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 {
-       *info1 = vmcs_readl(EXIT_QUALIFICATION);
-       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+       *info1 = vmx_get_exit_qual(vcpu);
+       *info2 = vmx_get_intr_info(vcpu);
 }
 
 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
@@ -5691,7 +5787,6 @@ void dump_vmcs(void)
        u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
        unsigned long cr4;
        u64 efer;
-       int i, n;
 
        if (!dump_invalid_vmcs) {
                pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
@@ -5828,14 +5923,6 @@ void dump_vmcs(void)
                pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
        if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
                pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
-       n = vmcs_read32(CR3_TARGET_COUNT);
-       for (i = 0; i + 1 < n; i += 4)
-               pr_err("CR3 target%u=%016lx target%u=%016lx\n",
-                      i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
-                      i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
-       if (i < n)
-               pr_err("CR3 target%u=%016lx\n",
-                      i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
        if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
                pr_err("PLE Gap=%08x Window=%08x\n",
                       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
@@ -5848,15 +5935,12 @@ void dump_vmcs(void)
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
  */
-static int vmx_handle_exit(struct kvm_vcpu *vcpu,
-       enum exit_fastpath_completion exit_fastpath)
+static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exit_reason = vmx->exit_reason;
        u32 vectoring_info = vmx->idt_vectoring_info;
 
-       trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
-
        /*
         * Flush logged GPAs PML buffer, this will make dirty_bitmap more
         * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
@@ -5867,6 +5951,14 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
        if (enable_pml)
                vmx_flush_pml_buffer(vcpu);
 
+       /*
+        * We should never reach this point with a pending nested VM-Enter, and
+        * more specifically emulation of L2 due to invalid guest state (see
+        * below) should never happen as that means we incorrectly allowed a
+        * nested VM-Enter with an invalid vmcs12.
+        */
+       WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
        /* If guest state is invalid, start emulating */
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
@@ -5885,8 +5977,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
                 */
                nested_mark_vmcs12_pages_dirty(vcpu);
 
-               if (nested_vmx_exit_reflected(vcpu, exit_reason))
-                       return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+               if (nested_vmx_reflect_vmexit(vcpu))
+                       return 1;
        }
 
        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
@@ -5933,7 +6025,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
 
        if (unlikely(!enable_vnmi &&
                     vmx->loaded_vmcs->soft_vnmi_blocked)) {
-               if (vmx_interrupt_allowed(vcpu)) {
+               if (!vmx_interrupt_blocked(vcpu)) {
                        vmx->loaded_vmcs->soft_vnmi_blocked = 0;
                } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
                           vcpu->arch.nmi_pending) {
@@ -5950,10 +6042,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
                }
        }
 
-       if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
-               kvm_skip_emulated_instruction(vcpu);
+       if (exit_fastpath != EXIT_FASTPATH_NONE)
                return 1;
-       }
 
        if (exit_reason >= kvm_vmx_max_exit_handlers)
                goto unexpected_vmexit;
@@ -6107,7 +6197,15 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
                if (flexpriority_enabled) {
                        sec_exec_control |=
                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-                       vmx_flush_tlb(vcpu, true);
+                       kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+                       /*
+                        * Flush the TLB; reloading the APIC access page will
+                        * only do so if its physical address has changed, but
+                        * the guest may have inserted a non-APIC mapping into
+                        * the TLB while the APIC access page was disabled.
+                        */
+                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
                }
                break;
        case LAPIC_MODE_X2APIC:
@@ -6121,12 +6219,32 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
        vmx_update_msr_bitmap(vcpu);
 }
 
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
+static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
 {
-       if (!is_guest_mode(vcpu)) {
-               vmcs_write64(APIC_ACCESS_ADDR, hpa);
-               vmx_flush_tlb(vcpu, true);
+       struct page *page;
+
+       /* Defer reload until vmcs01 is the current VMCS. */
+       if (is_guest_mode(vcpu)) {
+               to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
+               return;
        }
+
+       if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
+           SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+               return;
+
+       page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+       if (is_error_page(page))
+               return;
+
+       vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+       vmx_flush_tlb_current(vcpu);
+
+       /*
+        * Do not pin apic access page in memory, the MMU notifier
+        * will call us again if it is migrated or swapped out.
+        */
+       put_page(page);
 }
 
 static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
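vmx_set_apic_access_page_addr() now defers the reload while L2 is active (vmcs02 is current) by setting reload_vmcs01_apic_access_page, and otherwise resolves the page itself, writes APIC_ACCESS_ADDR, flushes, and leaves the page unpinned. Below is a toy sketch of the defer-until-the-right-context pattern only, with made-up names and the page lookup reduced to a plain address.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of "apply now if we are in the right context, else defer". */
struct toy_vmx {
	bool in_guest_mode;       /* vmcs02 is loaded, can't touch vmcs01 */
	bool reload_deferred;     /* remembered for the next vmcs01 switch */
	unsigned long apic_addr;  /* stand-in for the APIC_ACCESS_ADDR field */
};

static void toy_set_apic_access_page(struct toy_vmx *v, unsigned long hpa)
{
	if (v->in_guest_mode) {
		v->reload_deferred = true;   /* handled after the nested exit */
		return;
	}
	v->apic_addr = hpa;
}

/* Called when vmcs01 becomes current again, i.e. after a nested VM-Exit. */
static void toy_nested_vmexit(struct toy_vmx *v, unsigned long hpa)
{
	v->in_guest_mode = false;
	if (v->reload_deferred) {
		v->reload_deferred = false;
		toy_set_apic_access_page(v, hpa);
	}
}

int main(void)
{
	struct toy_vmx v = { .in_guest_mode = true };

	toy_set_apic_access_page(&v, 0x1000);    /* deferred */
	toy_nested_vmexit(&v, 0x1000);           /* applied now */
	printf("apic_addr=%#lx deferred=%d\n", v.apic_addr, v.reload_deferred);
	return 0;
}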
@@ -6244,16 +6362,16 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 
 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
 {
-       vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+       u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
 
        /* if exit due to PF check for async PF */
-       if (is_page_fault(vmx->exit_intr_info)) {
+       if (is_page_fault(intr_info)) {
                vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
        /* Handle machine checks before interrupts are enabled */
-       } else if (is_machine_check(vmx->exit_intr_info)) {
+       } else if (is_machine_check(intr_info)) {
                kvm_machine_check();
        /* We need to handle NMIs before interrupts are enabled */
-       } else if (is_nmi(vmx->exit_intr_info)) {
+       } else if (is_nmi(intr_info)) {
                kvm_before_interrupt(&vmx->vcpu);
                asm("int $2");
                kvm_after_interrupt(&vmx->vcpu);
@@ -6268,9 +6386,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
        unsigned long tmp;
 #endif
        gate_desc *desc;
-       u32 intr_info;
+       u32 intr_info = vmx_get_intr_info(vcpu);
 
-       intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
        if (WARN_ONCE(!is_external_intr(intr_info),
            "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
                return;
@@ -6283,13 +6400,13 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 
        asm volatile(
 #ifdef CONFIG_X86_64
-               "mov %%" _ASM_SP ", %[sp]\n\t"
-               "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
-               "push $%c[ss]\n\t"
+               "mov %%rsp, %[sp]\n\t"
+               "and $-16, %%rsp\n\t"
+               "push %[ss]\n\t"
                "push %[sp]\n\t"
 #endif
                "pushf\n\t"
-               __ASM_SIZE(push) " $%c[cs]\n\t"
+               "push %[cs]\n\t"
                CALL_NOSPEC
                :
 #ifdef CONFIG_X86_64
@@ -6298,7 +6415,9 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
                ASM_CALL_CONSTRAINT
                :
                [thunk_target]"r"(entry),
+#ifdef CONFIG_X86_64
                [ss]"i"(__KERNEL_DS),
+#endif
                [cs]"i"(__KERNEL_CS)
        );
 
@@ -6306,8 +6425,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 }
 STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
 
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
-       enum exit_fastpath_completion *exit_fastpath)
+static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -6315,9 +6433,6 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
                handle_external_interrupt_irqoff(vcpu);
        else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
                handle_exception_nmi_irqoff(vmx);
-       else if (!is_guest_mode(vcpu) &&
-               vmx->exit_reason == EXIT_REASON_MSR_WRITE)
-               *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
 }
 
 static bool vmx_has_emulated_msr(int index)
@@ -6351,11 +6466,8 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
        if (enable_vnmi) {
                if (vmx->loaded_vmcs->nmi_known_unmasked)
                        return;
-               /*
-                * Can't use vmx->exit_intr_info since we're not sure what
-                * the exit reason is.
-                */
-               exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+               exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
                unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
                vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
                /*
@@ -6522,13 +6634,27 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
        }
 }
 
+static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+       switch (to_vmx(vcpu)->exit_reason) {
+       case EXIT_REASON_MSR_WRITE:
+               return handle_fastpath_set_msr_irqoff(vcpu);
+       case EXIT_REASON_PREEMPTION_TIMER:
+               return handle_fastpath_preemption_timer(vcpu);
+       default:
+               return EXIT_FASTPATH_NONE;
+       }
+}
+
 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
 
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
+       fastpath_t exit_fastpath;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long cr3, cr4;
 
+reenter_guest:
        /* Record the guest's net vcpu time for enforced NMI injections. */
        if (unlikely(!enable_vnmi &&
                     vmx->loaded_vmcs->soft_vnmi_blocked))
@@ -6537,7 +6663,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
        /* Don't enter VMX if guest state is invalid, let the exit handler
           start emulation until we arrive back to a valid state */
        if (vmx->emulation_required)
-               return;
+               return EXIT_FASTPATH_NONE;
 
        if (vmx->ple_window_dirty) {
                vmx->ple_window_dirty = false;
@@ -6577,11 +6703,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        kvm_load_guest_xsave_state(vcpu);
 
-       if (static_cpu_has(X86_FEATURE_PKU) &&
-           kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
-           vcpu->arch.pkru != vmx->host_pkru)
-               __write_pkru(vcpu->arch.pkru);
-
        pt_guest_enter(vmx);
 
        if (vcpu_to_pmu(vcpu)->version)
@@ -6662,44 +6783,54 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
        loadsegment(es, __USER_DS);
 #endif
 
-       vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
-                                 | (1 << VCPU_EXREG_RFLAGS)
-                                 | (1 << VCPU_EXREG_PDPTR)
-                                 | (1 << VCPU_EXREG_SEGMENTS)
-                                 | (1 << VCPU_EXREG_CR3));
-       vcpu->arch.regs_dirty = 0;
+       vmx_register_cache_reset(vcpu);
 
        pt_guest_exit(vmx);
 
-       /*
-        * eager fpu is enabled if PKEY is supported and CR4 is switched
-        * back on host, so it is safe to read guest PKRU from current
-        * XSAVE.
-        */
-       if (static_cpu_has(X86_FEATURE_PKU) &&
-           kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
-               vcpu->arch.pkru = rdpkru();
-               if (vcpu->arch.pkru != vmx->host_pkru)
-                       __write_pkru(vmx->host_pkru);
-       }
-
        kvm_load_host_xsave_state(vcpu);
 
        vmx->nested.nested_run_pending = 0;
        vmx->idt_vectoring_info = 0;
 
-       vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
-       if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+       if (unlikely(vmx->fail)) {
+               vmx->exit_reason = 0xdead;
+               return EXIT_FASTPATH_NONE;
+       }
+
+       vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+       if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY))
                kvm_machine_check();
 
-       if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
-               return;
+       trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
+
+       if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+               return EXIT_FASTPATH_NONE;
 
        vmx->loaded_vmcs->launched = 1;
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
        vmx_recover_nmi_blocking(vmx);
        vmx_complete_interrupts(vmx);
+
+       if (is_guest_mode(vcpu))
+               return EXIT_FASTPATH_NONE;
+
+       exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
+       if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) {
+               if (!kvm_vcpu_exit_request(vcpu)) {
+                       /*
+                        * FIXME: this goto should be a loop in vcpu_enter_guest,
+                        * but it would incur the cost of a retpoline for now.
+                        * Revisit once static calls are available.
+                        */
+                       if (vcpu->arch.apicv_active)
+                               vmx_sync_pir_to_irr(vcpu);
+                       goto reenter_guest;
+               }
+               exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
+       }
+
+       return exit_fastpath;
 }
 
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
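vmx_vcpu_run() now returns a fastpath_t: a handful of exits (MSR write, preemption timer) are completed with interrupts still disabled, and EXIT_FASTPATH_REENTER_GUEST jumps straight back to the reenter_guest label as long as kvm_vcpu_exit_request() reports nothing pending. A stand-alone sketch of that control flow follows; all names and the exit-reason encoding are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

enum fastpath { FASTPATH_NONE, FASTPATH_EXIT_HANDLED, FASTPATH_REENTER_GUEST };

/* Stand-ins for "run the guest once" and "must we return to the outer loop?" */
static int run_guest_once(int *budget) { return --(*budget); }
static bool exit_request_pending(int budget) { return budget <= 0; }

/* Pretend only exit reason 1 (a timer-style exit) qualifies for the fastpath. */
static enum fastpath handle_exit_fastpath(int exit_reason)
{
	return exit_reason == 1 ? FASTPATH_REENTER_GUEST : FASTPATH_NONE;
}

static enum fastpath toy_vcpu_run(int budget)
{
	enum fastpath ret;
	int exit_reason;

reenter_guest:
	exit_reason = run_guest_once(&budget) > 0 ? 1 : 0;

	ret = handle_exit_fastpath(exit_reason);
	if (ret == FASTPATH_REENTER_GUEST) {
		if (!exit_request_pending(budget))
			goto reenter_guest;     /* stay in the inner loop */
		ret = FASTPATH_EXIT_HANDLED;    /* handled, but must go out */
	}
	return ret;
}

int main(void)
{
	printf("final fastpath state = %d\n", toy_vcpu_run(3));
	return 0;
}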
@@ -7284,10 +7415,6 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
        u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
        struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
 
-       if (kvm_mwait_in_guest(vcpu->kvm) ||
-               kvm_can_post_timer_interrupt(vcpu))
-               return -EOPNOTSUPP;
-
        vmx = to_vmx(vcpu);
        tscl = rdtsc();
        guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
@@ -7630,12 +7757,12 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
                        ~FEAT_CTL_LMCE_ENABLED;
 }
 
-static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
+static bool vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
        /* we need a nested vmexit to enter SMM, postpone if run is pending */
        if (to_vmx(vcpu)->nested.nested_run_pending)
-               return 0;
-       return 1;
+               return false;
+       return !is_smm(vcpu);
 }
 
 static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
@@ -7687,6 +7814,16 @@ static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
        return to_vmx(vcpu)->nested.vmxon;
 }
 
+static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
+{
+       if (is_guest_mode(vcpu)) {
+               struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
+
+               if (hrtimer_try_to_cancel(timer) == 1)
+                       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+       }
+}
+
 static void hardware_unsetup(void)
 {
        if (nested)
@@ -7731,8 +7868,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .set_segment = vmx_set_segment,
        .get_cpl = vmx_get_cpl,
        .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
-       .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
-       .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
        .set_cr0 = vmx_set_cr0,
        .set_cr4 = vmx_set_cr4,
        .set_efer = vmx_set_efer,
@@ -7740,16 +7875,16 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .set_idt = vmx_set_idt,
        .get_gdt = vmx_get_gdt,
        .set_gdt = vmx_set_gdt,
-       .get_dr6 = vmx_get_dr6,
-       .set_dr6 = vmx_set_dr6,
        .set_dr7 = vmx_set_dr7,
        .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
        .cache_reg = vmx_cache_reg,
        .get_rflags = vmx_get_rflags,
        .set_rflags = vmx_set_rflags,
 
-       .tlb_flush = vmx_flush_tlb,
+       .tlb_flush_all = vmx_flush_tlb_all,
+       .tlb_flush_current = vmx_flush_tlb_current,
        .tlb_flush_gva = vmx_flush_tlb_gva,
+       .tlb_flush_guest = vmx_flush_tlb_guest,
 
        .run = vmx_vcpu_run,
        .handle_exit = vmx_handle_exit,
@@ -7784,7 +7919,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
        .set_tss_addr = vmx_set_tss_addr,
        .set_identity_map_addr = vmx_set_identity_map_addr,
-       .get_tdp_level = get_ept_level,
+       .get_tdp_level = vmx_get_tdp_level,
        .get_mt_mask = vmx_get_mt_mask,
 
        .get_exit_info = vmx_get_exit_info,
@@ -7793,7 +7928,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
-       .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
        .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
 
        .load_mmu_pgd = vmx_load_mmu_pgd,
@@ -7815,6 +7949,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .post_block = vmx_post_block,
 
        .pmu_ops = &intel_pmu_ops,
+       .nested_ops = &vmx_nested_ops,
 
        .update_pi_irte = vmx_update_pi_irte,
 
@@ -7830,14 +7965,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .pre_leave_smm = vmx_pre_leave_smm,
        .enable_smi_window = enable_smi_window,
 
-       .check_nested_events = NULL,
-       .get_nested_state = NULL,
-       .set_nested_state = NULL,
-       .get_vmcs12_pages = NULL,
-       .nested_enable_evmcs = NULL,
-       .nested_get_evmcs_version = NULL,
        .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
        .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+       .migrate_timers = vmx_migrate_timers,
 };
 
 static __init int hardware_setup(void)
@@ -7936,11 +8066,11 @@ static __init int hardware_setup(void)
        if (!enable_ept)
                ept_lpage_level = 0;
        else if (cpu_has_vmx_ept_1g_page())
-               ept_lpage_level = PT_PDPE_LEVEL;
+               ept_lpage_level = PG_LEVEL_1G;
        else if (cpu_has_vmx_ept_2m_page())
-               ept_lpage_level = PT_DIRECTORY_LEVEL;
+               ept_lpage_level = PG_LEVEL_2M;
        else
-               ept_lpage_level = PT_PAGE_TABLE_LEVEL;
+               ept_lpage_level = PG_LEVEL_4K;
        kvm_configure_mmu(enable_ept, ept_lpage_level);
 
        /*
@@ -8000,8 +8130,7 @@ static __init int hardware_setup(void)
                nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
                                           vmx_capability.ept);
 
-               r = nested_vmx_hardware_setup(&vmx_x86_ops,
-                                             kvm_vmx_exit_handlers);
+               r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
                if (r)
                        return r;
        }
index aab9df5..298ddef 100644 (file)
@@ -8,6 +8,7 @@
 #include <asm/intel_pt.h>
 
 #include "capabilities.h"
+#include "kvm_cache_regs.h"
 #include "ops.h"
 #include "vmcs.h"
 
@@ -136,6 +137,7 @@ struct nested_vmx {
        bool vmcs02_initialized;
 
        bool change_vmcs01_virtual_apic_mode;
+       bool reload_vmcs01_apic_access_page;
 
        /*
         * Enlightened VMCS has been enabled. It does not mean that L1 has to
@@ -208,6 +210,7 @@ struct vcpu_vmx {
         */
        bool                  guest_state_loaded;
 
+       unsigned long         exit_qualification;
        u32                   exit_intr_info;
        u32                   idt_vectoring_info;
        ulong                 rflags;
@@ -317,8 +320,8 @@ struct kvm_vmx {
 };
 
 bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu);
-void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+                       struct loaded_vmcs *buddy);
 int allocate_vpid(void);
 void free_vpid(int vpid);
 void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
@@ -341,6 +344,8 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
 void update_exception_bitmap(struct kvm_vcpu *vcpu);
 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
+bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
+bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
@@ -441,9 +446,18 @@ BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL)
 BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL)
 BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL)
 
-static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
 {
-       vmx->segment_cache.bitmask = 0;
+       vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+                                 | (1 << VCPU_EXREG_RFLAGS)
+                                 | (1 << VCPU_EXREG_PDPTR)
+                                 | (1 << VCPU_EXREG_SEGMENTS)
+                                 | (1 << VCPU_EXREG_CR0)
+                                 | (1 << VCPU_EXREG_CR3)
+                                 | (1 << VCPU_EXREG_CR4)
+                                 | (1 << VCPU_EXREG_EXIT_INFO_1)
+                                 | (1 << VCPU_EXREG_EXIT_INFO_2));
+       vcpu->arch.regs_dirty = 0;
 }
 
 static inline u32 vmx_vmentry_ctrl(void)
@@ -486,6 +500,28 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
        return &(to_vmx(vcpu)->pi_desc);
 }
 
+static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) {
+               kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+               vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       }
+       return vmx->exit_qualification;
+}
+
+static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) {
+               kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+               vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+       }
+       return vmx->exit_intr_info;
+}
+
 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
 void free_vmcs(struct vmcs *vmcs);
 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
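vmx_get_exit_qual() and vmx_get_intr_info() above read their VMCS field at most once per exit: the first caller does the vmcs_read and marks the value available, later callers hit the cached copy, and vmx_register_cache_reset() drops the availability bits after the next VM-Exit. The following stand-alone sketch shows the same read-once caching pattern with invented names standing in for the regs_avail machinery.

#include <stdint.h>
#include <stdio.h>

#define CACHE_EXIT_QUAL  (1u << 0)

static unsigned int reads;             /* counts simulated VMCS reads */

static uint64_t pretend_vmcs_read(int field)
{
	reads++;
	return 0x1000u + field;        /* arbitrary stand-in for hardware data */
}

struct exit_cache {
	uint32_t avail;                /* bitmap of fields already read */
	uint64_t exit_qual;
};

static uint64_t get_exit_qual(struct exit_cache *c)
{
	if (!(c->avail & CACHE_EXIT_QUAL)) {
		c->exit_qual = pretend_vmcs_read(0);
		c->avail |= CACHE_EXIT_QUAL;
	}
	return c->exit_qual;
}

static void cache_reset(struct exit_cache *c)   /* done once per VM-Exit */
{
	c->avail = 0;
}

int main(void)
{
	struct exit_cache c = { 0 };

	get_exit_qual(&c);
	get_exit_qual(&c);             /* second call is served from the cache */
	cache_reset(&c);
	get_exit_qual(&c);             /* re-read after the reset */
	printf("simulated VMCS reads: %u\n", reads);   /* prints 2 */
	return 0;
}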
@@ -500,24 +536,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow)
 
 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
 
-static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
-                               bool invalidate_gpa)
-{
-       if (enable_ept && (invalidate_gpa || !enable_vpid)) {
-               if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
-                       return;
-               ept_sync_context(construct_eptp(vcpu,
-                                               vcpu->arch.mmu->root_hpa));
-       } else {
-               vpid_sync_context(vpid);
-       }
-}
-
-static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
-{
-       __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
-}
-
 static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
 {
        vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
index d786c7d..471fccf 100644 (file)
@@ -97,9 +97,6 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 
 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
 
-#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
-#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
-
 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
 
@@ -194,45 +191,46 @@ u64 __read_mostly supported_xss;
 EXPORT_SYMBOL_GPL(supported_xss);
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-       { "pf_fixed", VCPU_STAT(pf_fixed) },
-       { "pf_guest", VCPU_STAT(pf_guest) },
-       { "tlb_flush", VCPU_STAT(tlb_flush) },
-       { "invlpg", VCPU_STAT(invlpg) },
-       { "exits", VCPU_STAT(exits) },
-       { "io_exits", VCPU_STAT(io_exits) },
-       { "mmio_exits", VCPU_STAT(mmio_exits) },
-       { "signal_exits", VCPU_STAT(signal_exits) },
-       { "irq_window", VCPU_STAT(irq_window_exits) },
-       { "nmi_window", VCPU_STAT(nmi_window_exits) },
-       { "halt_exits", VCPU_STAT(halt_exits) },
-       { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
-       { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
-       { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
-       { "halt_wakeup", VCPU_STAT(halt_wakeup) },
-       { "hypercalls", VCPU_STAT(hypercalls) },
-       { "request_irq", VCPU_STAT(request_irq_exits) },
-       { "irq_exits", VCPU_STAT(irq_exits) },
-       { "host_state_reload", VCPU_STAT(host_state_reload) },
-       { "fpu_reload", VCPU_STAT(fpu_reload) },
-       { "insn_emulation", VCPU_STAT(insn_emulation) },
-       { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
-       { "irq_injections", VCPU_STAT(irq_injections) },
-       { "nmi_injections", VCPU_STAT(nmi_injections) },
-       { "req_event", VCPU_STAT(req_event) },
-       { "l1d_flush", VCPU_STAT(l1d_flush) },
-       { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
-       { "mmu_pte_write", VM_STAT(mmu_pte_write) },
-       { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
-       { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
-       { "mmu_flooded", VM_STAT(mmu_flooded) },
-       { "mmu_recycled", VM_STAT(mmu_recycled) },
-       { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
-       { "mmu_unsync", VM_STAT(mmu_unsync) },
-       { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
-       { "largepages", VM_STAT(lpages, .mode = 0444) },
-       { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
-       { "max_mmu_page_hash_collisions",
-               VM_STAT(max_mmu_page_hash_collisions) },
+       VCPU_STAT("pf_fixed", pf_fixed),
+       VCPU_STAT("pf_guest", pf_guest),
+       VCPU_STAT("tlb_flush", tlb_flush),
+       VCPU_STAT("invlpg", invlpg),
+       VCPU_STAT("exits", exits),
+       VCPU_STAT("io_exits", io_exits),
+       VCPU_STAT("mmio_exits", mmio_exits),
+       VCPU_STAT("signal_exits", signal_exits),
+       VCPU_STAT("irq_window", irq_window_exits),
+       VCPU_STAT("nmi_window", nmi_window_exits),
+       VCPU_STAT("halt_exits", halt_exits),
+       VCPU_STAT("halt_successful_poll", halt_successful_poll),
+       VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+       VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+       VCPU_STAT("halt_wakeup", halt_wakeup),
+       VCPU_STAT("hypercalls", hypercalls),
+       VCPU_STAT("request_irq", request_irq_exits),
+       VCPU_STAT("irq_exits", irq_exits),
+       VCPU_STAT("host_state_reload", host_state_reload),
+       VCPU_STAT("fpu_reload", fpu_reload),
+       VCPU_STAT("insn_emulation", insn_emulation),
+       VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
+       VCPU_STAT("irq_injections", irq_injections),
+       VCPU_STAT("nmi_injections", nmi_injections),
+       VCPU_STAT("req_event", req_event),
+       VCPU_STAT("l1d_flush", l1d_flush),
+       VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+       VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+       VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
+       VM_STAT("mmu_pte_write", mmu_pte_write),
+       VM_STAT("mmu_pte_updated", mmu_pte_updated),
+       VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
+       VM_STAT("mmu_flooded", mmu_flooded),
+       VM_STAT("mmu_recycled", mmu_recycled),
+       VM_STAT("mmu_cache_miss", mmu_cache_miss),
+       VM_STAT("mmu_unsync", mmu_unsync),
+       VM_STAT("remote_tlb_flush", remote_tlb_flush),
+       VM_STAT("largepages", lpages, .mode = 0444),
+       VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
+       VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
        { NULL }
 };
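The debugfs table now folds the stat name into the VM_STAT()/VCPU_STAT() macros instead of open-coding braces around each entry; the actual macro definitions live outside this hunk. Below is a generic stand-alone illustration of the same name-plus-offsetof initializer trick, with invented types that are not the KVM structures.

#include <stddef.h>
#include <stdio.h>

struct toy_stats { unsigned long exits, irq_exits; };
struct toy_debugfs_item { const char *name; size_t offset; };

/* Fold the display name into the initializer, as the new macros do. */
#define TOY_STAT(n, field)  { n, offsetof(struct toy_stats, field) }

static const struct toy_debugfs_item toy_entries[] = {
	TOY_STAT("exits", exits),
	TOY_STAT("irq_exits", irq_exits),
	{ NULL, 0 }
};

int main(void)
{
	const struct toy_debugfs_item *p;

	for (p = toy_entries; p->name; p++)
		printf("%-10s offset=%zu\n", p->name, p->offset);
	return 0;
}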
 
@@ -261,7 +259,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
        int i;
-       for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+       for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
                vcpu->arch.apf.gfns[i] = ~0;
 }
 
@@ -572,11 +570,12 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
-static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
-                                 unsigned long payload)
+void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
+                          unsigned long payload)
 {
        kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
 }
+EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
 
 static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
                                    u32 error_code, unsigned long payload)
@@ -611,15 +610,28 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
-static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+                                   struct x86_exception *fault)
 {
-       if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
-               vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
-       else
-               vcpu->arch.mmu->inject_page_fault(vcpu, fault);
+       struct kvm_mmu *fault_mmu;
+       WARN_ON_ONCE(fault->vector != PF_VECTOR);
+
+       fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
+                                              vcpu->arch.walk_mmu;
+
+       /*
+        * Invalidate the TLB entry for the faulting address, if it exists,
+        * else the access will fault indefinitely (and to emulate hardware).
+        */
+       if ((fault->error_code & PFERR_PRESENT_MASK) &&
+           !(fault->error_code & PFERR_RSVD_MASK))
+               kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
+                                      fault_mmu->root_hpa);
 
+       fault_mmu->inject_page_fault(vcpu, fault);
        return fault->nested_page_fault;
 }
+EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
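kvm_inject_emulated_page_fault() above picks the MMU that performed the failing walk (walk_mmu unless it was a nested page fault) and pre-invalidates the faulting GVA, but only when the error code says the translation was present and not a reserved-bit fault. A tiny stand-alone sketch of that error-code test, using the standard x86 #PF error-code bit positions:

#include <stdbool.h>
#include <stdio.h>

#define PFERR_PRESENT (1u << 0)
#define PFERR_RSVD    (1u << 3)

/*
 * Should an emulated #PF also invalidate the stale TLB entry for the
 * faulting address?  Only when the walk hit a present, non-reserved
 * translation; otherwise the guest could keep hitting a cached mapping
 * and fault forever.
 */
static bool needs_gva_invalidation(unsigned int error_code)
{
	return (error_code & PFERR_PRESENT) && !(error_code & PFERR_RSVD);
}

int main(void)
{
	printf("%d %d %d\n",
	       needs_gva_invalidation(PFERR_PRESENT),                 /* 1 */
	       needs_gva_invalidation(0),                              /* 0 */
	       needs_gva_invalidation(PFERR_PRESENT | PFERR_RSVD));    /* 0 */
	return 0;
}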
@@ -836,11 +848,25 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
                    vcpu->arch.ia32_xss != host_xss)
                        wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
        }
+
+       if (static_cpu_has(X86_FEATURE_PKU) &&
+           (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
+            (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
+           vcpu->arch.pkru != vcpu->arch.host_pkru)
+               __write_pkru(vcpu->arch.pkru);
 }
 EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
 
 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
 {
+       if (static_cpu_has(X86_FEATURE_PKU) &&
+           (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
+            (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
+               vcpu->arch.pkru = rdpkru();
+               if (vcpu->arch.pkru != vcpu->arch.host_pkru)
+                       __write_pkru(vcpu->arch.host_pkru);
+       }
+
        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
 
                if (vcpu->arch.xcr0 != host_xcr0)
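PKRU switching moves out of vmx_vcpu_run() into the shared kvm_load_guest_xsave_state()/kvm_load_host_xsave_state() helpers, and now also fires when the guest exposes PKRU through XCR0 rather than only through CR4.PKE. The stand-alone sketch below shows just the swap-only-if-different pattern, with the feature checks omitted and the register modeled as a plain variable.

#include <stdio.h>

static unsigned int current_pkru;                 /* stand-in for the register */

static unsigned int read_pkru_reg(void)    { return current_pkru; }
static void write_pkru_reg(unsigned int v) { current_pkru = v; }

struct toy_ctx { unsigned int guest_pkru, host_pkru; };

/* Before entering the guest: install the guest's PKRU if it differs. */
static void load_guest_pkru(struct toy_ctx *c)
{
	c->host_pkru = read_pkru_reg();
	if (c->guest_pkru != c->host_pkru)
		write_pkru_reg(c->guest_pkru);
}

/* After the exit: capture what the guest left behind, restore the host value. */
static void load_host_pkru(struct toy_ctx *c)
{
	c->guest_pkru = read_pkru_reg();
	if (c->guest_pkru != c->host_pkru)
		write_pkru_reg(c->host_pkru);
}

int main(void)
{
	struct toy_ctx c = { .guest_pkru = 0x4 };

	current_pkru = 0x1;        /* host value before entry */
	load_guest_pkru(&c);
	load_host_pkru(&c);
	printf("pkru=%#x guest_saved=%#x\n", current_pkru, c.guest_pkru);
	return 0;
}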
@@ -993,7 +1019,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
                if (!skip_tlb_flush) {
                        kvm_mmu_sync_roots(vcpu);
-                       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
                }
                return 0;
        }
@@ -1005,7 +1031,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
                return 1;
 
-       kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
+       kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
        vcpu->arch.cr3 = cr3;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
@@ -1045,12 +1071,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
        }
 }
 
-static void kvm_update_dr6(struct kvm_vcpu *vcpu)
-{
-       if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
-               kvm_x86_ops.set_dr6(vcpu, vcpu->arch.dr6);
-}
-
 static void kvm_update_dr7(struct kvm_vcpu *vcpu)
 {
        unsigned long dr7;
@@ -1090,7 +1110,6 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
                if (val & 0xffffffff00000000ULL)
                        return -1; /* #GP */
                vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
-               kvm_update_dr6(vcpu);
                break;
        case 5:
                /* fall through */
@@ -1126,10 +1145,7 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
        case 4:
                /* fall through */
        case 6:
-               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
-                       *val = vcpu->arch.dr6;
-               else
-                       *val = kvm_x86_ops.get_dr6(vcpu);
+               *val = vcpu->arch.dr6;
                break;
        case 5:
                /* fall through */
@@ -1559,6 +1575,13 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
 
+bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
+{
+       return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
+               need_resched() || signal_pending(current);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
+
 /*
  * The fast path for frequent and performance sensitive wrmsr emulation,
  * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
@@ -1587,27 +1610,44 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
        return 1;
 }
 
-enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
+static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
+{
+       if (!kvm_can_use_hv_timer(vcpu))
+               return 1;
+
+       kvm_set_lapic_tscdeadline_msr(vcpu, data);
+       return 0;
+}
+
+fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
 {
        u32 msr = kvm_rcx_read(vcpu);
        u64 data;
-       int ret = 0;
+       fastpath_t ret = EXIT_FASTPATH_NONE;
 
        switch (msr) {
        case APIC_BASE_MSR + (APIC_ICR >> 4):
                data = kvm_read_edx_eax(vcpu);
-               ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
+               if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
+                       kvm_skip_emulated_instruction(vcpu);
+                       ret = EXIT_FASTPATH_EXIT_HANDLED;
+               }
+               break;
+       case MSR_IA32_TSCDEADLINE:
+               data = kvm_read_edx_eax(vcpu);
+               if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
+                       kvm_skip_emulated_instruction(vcpu);
+                       ret = EXIT_FASTPATH_REENTER_GUEST;
+               }
                break;
        default:
-               return EXIT_FASTPATH_NONE;
+               break;
        }
 
-       if (!ret) {
+       if (ret != EXIT_FASTPATH_NONE)
                trace_kvm_msr_write(msr, data);
-               return EXIT_FASTPATH_SKIP_EMUL_INS;
-       }
 
-       return EXIT_FASTPATH_NONE;
+       return ret;
 }
 EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
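handle_fastpath_set_msr_irqoff() now recognizes two MSRs and maps them to different fastpath results: a successfully handled x2APIC ICR write reports EXIT_FASTPATH_EXIT_HANDLED, while a TSC-deadline write may re-enter the guest directly (both skip the emulated instruction first). A condensed stand-alone sketch of that dispatch follows; the handler bodies and the toy_* names are invented, only the shape of the switch is taken from the hunk above.

#include <stdio.h>

enum fastpath { FASTPATH_NONE, FASTPATH_EXIT_HANDLED, FASTPATH_REENTER_GUEST };

#define TOY_MSR_ICR         0x830   /* x2APIC ICR */
#define TOY_MSR_TSCDEADLINE 0x6e0

/* Pretend handlers: return 0 on success, 1 if the slow path is needed. */
static int toy_fast_icr(unsigned long val)      { (void)val; return 0; }
static int toy_fast_deadline(unsigned long val) { (void)val; return 0; }

static enum fastpath toy_fastpath_wrmsr(unsigned int msr, unsigned long val)
{
	switch (msr) {
	case TOY_MSR_ICR:
		return toy_fast_icr(val) ? FASTPATH_NONE : FASTPATH_EXIT_HANDLED;
	case TOY_MSR_TSCDEADLINE:
		return toy_fast_deadline(val) ? FASTPATH_NONE : FASTPATH_REENTER_GUEST;
	default:
		return FASTPATH_NONE;   /* anything else takes the full exit path */
	}
}

int main(void)
{
	printf("%d %d %d\n",
	       toy_fastpath_wrmsr(TOY_MSR_ICR, 0),
	       toy_fastpath_wrmsr(TOY_MSR_TSCDEADLINE, 0),
	       toy_fastpath_wrmsr(0xc0000080, 0));      /* no fastpath for EFER */
	return 0;
}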
 
@@ -1896,7 +1936,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 
 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
 {
-       u64 curr_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu);
+       u64 curr_offset = vcpu->arch.l1_tsc_offset;
        vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
 }
 
@@ -1938,14 +1978,13 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 
 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-       u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu);
-
-       return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+       return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
+       vcpu->arch.l1_tsc_offset = offset;
        vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
 }
 
@@ -2070,7 +2109,7 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
                                           s64 adjustment)
 {
-       u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu);
+       u64 tsc_offset = vcpu->arch.l1_tsc_offset;
        kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
 }
 
@@ -2664,10 +2703,16 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
        vcpu->arch.time = 0;
 }
 
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+       ++vcpu->stat.tlb_flush;
+       kvm_x86_ops.tlb_flush_all(vcpu);
+}
+
+static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.tlb_flush;
-       kvm_x86_ops.tlb_flush(vcpu, invalidate_gpa);
+       kvm_x86_ops.tlb_flush_guest(vcpu);
 }
 
 static void record_steal_time(struct kvm_vcpu *vcpu)
@@ -2693,7 +2738,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
        trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
                st->preempted & KVM_VCPU_FLUSH_TLB);
        if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
-               kvm_vcpu_flush_tlb(vcpu, false);
+               kvm_vcpu_flush_tlb_guest(vcpu);
 
        vcpu->arch.st.preempted = 0;
 
@@ -3426,14 +3471,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = KVM_X2APIC_API_VALID_FLAGS;
                break;
        case KVM_CAP_NESTED_STATE:
-               r = kvm_x86_ops.get_nested_state ?
-                       kvm_x86_ops.get_nested_state(NULL, NULL, 0) : 0;
+               r = kvm_x86_ops.nested_ops->get_state ?
+                       kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
                break;
        case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
                r = kvm_x86_ops.enable_direct_tlbflush != NULL;
                break;
        case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
-               r = kvm_x86_ops.nested_enable_evmcs != NULL;
+               r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
                break;
        default:
                break;
@@ -3558,6 +3603,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
        kvm_x86_ops.vcpu_load(vcpu, cpu);
 
+       /* Save host pkru register if supported */
+       vcpu->arch.host_pkru = read_pkru();
+
        /* Apply any externally detected TSC adjustments (due to suspend) */
        if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
                adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
@@ -4009,7 +4057,6 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
        memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
        kvm_update_dr0123(vcpu);
        vcpu->arch.dr6 = dbgregs->dr6;
-       kvm_update_dr6(vcpu);
        vcpu->arch.dr7 = dbgregs->dr7;
        kvm_update_dr7(vcpu);
 
@@ -4219,9 +4266,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
                return kvm_hv_activate_synic(vcpu, cap->cap ==
                                             KVM_CAP_HYPERV_SYNIC2);
        case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
-               if (!kvm_x86_ops.nested_enable_evmcs)
+               if (!kvm_x86_ops.nested_ops->enable_evmcs)
                        return -ENOTTY;
-               r = kvm_x86_ops.nested_enable_evmcs(vcpu, &vmcs_version);
+               r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
                if (!r) {
                        user_ptr = (void __user *)(uintptr_t)cap->args[0];
                        if (copy_to_user(user_ptr, &vmcs_version,
@@ -4536,7 +4583,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                u32 user_data_size;
 
                r = -EINVAL;
-               if (!kvm_x86_ops.get_nested_state)
+               if (!kvm_x86_ops.nested_ops->get_state)
                        break;
 
                BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
@@ -4544,8 +4591,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                if (get_user(user_data_size, &user_kvm_nested_state->size))
                        break;
 
-               r = kvm_x86_ops.get_nested_state(vcpu, user_kvm_nested_state,
-                                                 user_data_size);
+               r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
+                                                    user_data_size);
                if (r < 0)
                        break;
 
@@ -4566,7 +4613,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                int idx;
 
                r = -EINVAL;
-               if (!kvm_x86_ops.set_nested_state)
+               if (!kvm_x86_ops.nested_ops->set_state)
                        break;
 
                r = -EFAULT;
@@ -4588,7 +4635,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                        break;
 
                idx = srcu_read_lock(&vcpu->kvm->srcu);
-               r = kvm_x86_ops.set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+               r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
                srcu_read_unlock(&vcpu->kvm->srcu, idx);
                break;
        }
@@ -6396,7 +6443,7 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        if (ctxt->exception.vector == PF_VECTOR)
-               return kvm_propagate_fault(vcpu, &ctxt->exception);
+               return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
 
        if (ctxt->exception.error_code_valid)
                kvm_queue_exception_e(vcpu, ctxt->exception.vector,
@@ -6659,7 +6706,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
 
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
                kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
-               kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+               kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
                kvm_run->debug.arch.exception = DB_VECTOR;
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
                return 0;
@@ -6719,9 +6766,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
                                           vcpu->arch.db);
 
                if (dr6 != 0) {
-                       vcpu->arch.dr6 &= ~DR_TRAP_BITS;
-                       vcpu->arch.dr6 |= dr6 | DR6_RTM;
-                       kvm_queue_exception(vcpu, DB_VECTOR);
+                       kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
                        *r = 1;
                        return true;
                }
@@ -7693,14 +7738,17 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
                        kvm_x86_ops.set_irq(vcpu);
        }
 
+       WARN_ON_ONCE(vcpu->arch.exception.injected &&
+                    vcpu->arch.exception.pending);
+
        /*
         * Call check_nested_events() even if we reinjected a previous event
         * in order for caller to determine if it should require immediate-exit
         * from L2 to L1 due to pending L1 events which require exit
         * from L2 to L1.
         */
-       if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) {
-               r = kvm_x86_ops.check_nested_events(vcpu);
+       if (is_guest_mode(vcpu)) {
+               r = kvm_x86_ops.nested_ops->check_events(vcpu);
                if (r != 0)
                        return r;
        }
@@ -7711,7 +7759,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
                                        vcpu->arch.exception.has_error_code,
                                        vcpu->arch.exception.error_code);
 
-               WARN_ON_ONCE(vcpu->arch.exception.injected);
                vcpu->arch.exception.pending = false;
                vcpu->arch.exception.injected = true;
 
@@ -7744,33 +7791,20 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
        if (kvm_event_needs_reinjection(vcpu))
                return 0;
 
-       if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
-           kvm_x86_ops.smi_allowed(vcpu)) {
+       if (vcpu->arch.smi_pending &&
+           kvm_x86_ops.smi_allowed(vcpu, true)) {
                vcpu->arch.smi_pending = false;
                ++vcpu->arch.smi_count;
                enter_smm(vcpu);
-       } else if (vcpu->arch.nmi_pending && kvm_x86_ops.nmi_allowed(vcpu)) {
+       } else if (vcpu->arch.nmi_pending &&
+                  kvm_x86_ops.nmi_allowed(vcpu, true)) {
                --vcpu->arch.nmi_pending;
                vcpu->arch.nmi_injected = true;
                kvm_x86_ops.set_nmi(vcpu);
-       } else if (kvm_cpu_has_injectable_intr(vcpu)) {
-               /*
-                * Because interrupts can be injected asynchronously, we are
-                * calling check_nested_events again here to avoid a race condition.
-                * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
-                * proposal and current concerns.  Perhaps we should be setting
-                * KVM_REQ_EVENT only on certain events and not unconditionally?
-                */
-               if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) {
-                       r = kvm_x86_ops.check_nested_events(vcpu);
-                       if (r != 0)
-                               return r;
-               }
-               if (kvm_x86_ops.interrupt_allowed(vcpu)) {
-                       kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
-                                           false);
-                       kvm_x86_ops.set_irq(vcpu);
-               }
+       } else if (kvm_cpu_has_injectable_intr(vcpu) &&
+                  kvm_x86_ops.interrupt_allowed(vcpu, true)) {
+               kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
+               kvm_x86_ops.set_irq(vcpu);
        }
 
        return 0;
@@ -8042,7 +8076,7 @@ void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
        zalloc_cpumask_var(&cpus, GFP_ATOMIC);
 
        kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
-                                   vcpu_bitmap, cpus);
+                                   NULL, vcpu_bitmap, cpus);
 
        free_cpumask_var(cpus);
 }
@@ -8072,6 +8106,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
  */
 void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
 {
+       struct kvm_vcpu *except;
        unsigned long old, new, expected;
 
        if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
@@ -8096,7 +8131,17 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
        trace_kvm_apicv_update_request(activate, bit);
        if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
                kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
-       kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
+
+       /*
+        * Send a request to update APICv for all other vCPUs, but update
+        * the calling vCPU immediately instead of waiting for another
+        * #VMEXIT to handle the request.
+        */
+       except = kvm_get_running_vcpu();
+       kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
+                                        except);
+       if (except)
+               kvm_vcpu_update_apicv(except);
 }
 EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
 
@@ -8153,24 +8198,13 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 {
-       struct page *page = NULL;
-
        if (!lapic_in_kernel(vcpu))
                return;
 
        if (!kvm_x86_ops.set_apic_access_page_addr)
                return;
 
-       page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
-       if (is_error_page(page))
-               return;
-       kvm_x86_ops.set_apic_access_page_addr(vcpu, page_to_phys(page));
-
-       /*
-        * Do not pin apic access page in memory, the MMU notifier
-        * will call us again if it is migrated or swapped out.
-        */
-       put_page(page);
+       kvm_x86_ops.set_apic_access_page_addr(vcpu);
 }
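
With the page lookup dropped from the common helper above, the vendor callback is assumed to resolve the page itself. A hedged sketch of that shape, reusing the logic just removed (the actual backend lives in vmx.c and is not part of this hunk):

    static void example_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
    {
            struct page *page;

            page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
            if (is_error_page(page))
                    return;

            /* program page_to_phys(page) into the vendor control structure */

            /* do not pin the page; the MMU notifier re-triggers this path */
            put_page(page);
    }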
 
 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -8190,13 +8224,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        bool req_int_win =
                dm_request_for_irq_injection(vcpu) &&
                kvm_cpu_accept_dm_intr(vcpu);
-       enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE;
+       fastpath_t exit_fastpath;
 
        bool req_immediate_exit = false;
 
        if (kvm_request_pending(vcpu)) {
                if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
-                       if (unlikely(!kvm_x86_ops.get_vmcs12_pages(vcpu))) {
+                       if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) {
                                r = 0;
                                goto out;
                        }
@@ -8218,8 +8252,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        kvm_mmu_sync_roots(vcpu);
                if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
                        kvm_mmu_load_pgd(vcpu);
-               if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
-                       kvm_vcpu_flush_tlb(vcpu, true);
+               if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
+                       kvm_vcpu_flush_tlb_all(vcpu);
+
+                       /* Flushing all ASIDs flushes the current ASID... */
+                       kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+               }
+               if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+                       kvm_vcpu_flush_tlb_current(vcpu);
+               if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+                       kvm_vcpu_flush_tlb_guest(vcpu);
+
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
                        vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
                        r = 0;
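
The TLB-flush handling above splits the old single request into three independent scopes. As an illustration (a sketch of the request API usage, not new code from this series), callers ask for each flavor with kvm_make_request(), and a full flush subsumes a pending current-context flush as the comment notes:

    kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);          /* flush all contexts/ASIDs */
    kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);  /* flush only the active context */
    kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);       /* flush guest-visible mappings (Hyper-V) */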
@@ -8326,6 +8369,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                                kvm_x86_ops.enable_nmi_window(vcpu);
                        if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
                                kvm_x86_ops.enable_irq_window(vcpu);
+                       if (is_guest_mode(vcpu) &&
+                           kvm_x86_ops.nested_ops->hv_timer_pending &&
+                           kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+                               req_immediate_exit = true;
                        WARN_ON(vcpu->arch.exception.pending);
                }
 
@@ -8375,8 +8422,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
                kvm_x86_ops.sync_pir_to_irr(vcpu);
 
-       if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
-           || need_resched() || signal_pending(current)) {
+       if (kvm_vcpu_exit_request(vcpu)) {
                vcpu->mode = OUTSIDE_GUEST_MODE;
                smp_wmb();
                local_irq_enable();
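
kvm_vcpu_exit_request() replaces the open-coded condition removed above; a minimal sketch, assuming it simply consolidates that check (the actual definition is added elsewhere in x86.c):

    bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
    {
            return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
                   need_resched() || signal_pending(current);
    }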
@@ -8408,7 +8454,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
 
-       kvm_x86_ops.run(vcpu);
+       exit_fastpath = kvm_x86_ops.run(vcpu);
 
        /*
         * Do this here before restoring debug registers on the host.  And
@@ -8420,7 +8466,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
                kvm_x86_ops.sync_dirty_debug_regs(vcpu);
                kvm_update_dr0123(vcpu);
-               kvm_update_dr6(vcpu);
                kvm_update_dr7(vcpu);
                vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
@@ -8440,7 +8485,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        vcpu->mode = OUTSIDE_GUEST_MODE;
        smp_wmb();
 
-       kvm_x86_ops.handle_exit_irqoff(vcpu, &exit_fastpath);
+       kvm_x86_ops.handle_exit_irqoff(vcpu);
 
        /*
         * Consume any pending interrupts, including the possible source of
@@ -8487,6 +8532,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        return r;
 
 cancel_injection:
+       if (req_immediate_exit)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        kvm_x86_ops.cancel_injection(vcpu);
        if (unlikely(vcpu->arch.apic_attention))
                kvm_lapic_sync_from_vapic(vcpu);
@@ -8529,8 +8576,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events)
-               kvm_x86_ops.check_nested_events(vcpu);
+       if (is_guest_mode(vcpu))
+               kvm_x86_ops.nested_ops->check_events(vcpu);
 
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
@@ -8712,8 +8759,9 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
        trace_kvm_fpu(0);
 }
 
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
+       struct kvm_run *kvm_run = vcpu->run;
        int r;
 
        vcpu_load(vcpu);
@@ -8731,18 +8779,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                r = -EAGAIN;
                if (signal_pending(current)) {
                        r = -EINTR;
-                       vcpu->run->exit_reason = KVM_EXIT_INTR;
+                       kvm_run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.signal_exits;
                }
                goto out;
        }
 
-       if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+       if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
                r = -EINVAL;
                goto out;
        }
 
-       if (vcpu->run->kvm_dirty_regs) {
+       if (kvm_run->kvm_dirty_regs) {
                r = sync_regs(vcpu);
                if (r != 0)
                        goto out;
@@ -8772,7 +8820,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 out:
        kvm_put_guest_fpu(vcpu);
-       if (vcpu->run->kvm_valid_regs)
+       if (kvm_run->kvm_valid_regs)
                store_regs(vcpu);
        post_kvm_run_save(vcpu);
        kvm_sigset_deactivate(vcpu);
@@ -9364,8 +9412,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        }
        fx_init(vcpu);
 
-       vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
-
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
 
        vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
@@ -9481,7 +9527,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
        kvm_update_dr0123(vcpu);
        vcpu->arch.dr6 = DR6_INIT;
-       kvm_update_dr6(vcpu);
        vcpu->arch.dr7 = DR7_FIXED_1;
        kvm_update_dr7(vcpu);
 
@@ -10026,7 +10071,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 {
        /* Still write protect RO slot */
        if (new->flags & KVM_MEM_READONLY) {
-               kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL);
+               kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
                return;
        }
 
@@ -10066,7 +10111,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                } else {
                        int level =
                                kvm_dirty_log_manual_protect_and_init_set(kvm) ?
-                               PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL;
+                               PG_LEVEL_2M : PG_LEVEL_4K;
 
                        /*
                         * If we're with initial-all-set, we don't need
@@ -10168,11 +10213,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 
        if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
            (vcpu->arch.nmi_pending &&
-            kvm_x86_ops.nmi_allowed(vcpu)))
+            kvm_x86_ops.nmi_allowed(vcpu, false)))
                return true;
 
        if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
-           (vcpu->arch.smi_pending && !is_smm(vcpu)))
+           (vcpu->arch.smi_pending &&
+            kvm_x86_ops.smi_allowed(vcpu, false)))
                return true;
 
        if (kvm_arch_interrupt_allowed(vcpu) &&
@@ -10183,6 +10229,11 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
        if (kvm_hv_has_stimer_pending(vcpu))
                return true;
 
+       if (is_guest_mode(vcpu) &&
+           kvm_x86_ops.nested_ops->hv_timer_pending &&
+           kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+               return true;
+
        return false;
 }
 
@@ -10219,7 +10270,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-       return kvm_x86_ops.interrupt_allowed(vcpu);
+       return kvm_x86_ops.interrupt_allowed(vcpu, false);
 }
 
 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -10284,12 +10335,14 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 
 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
 {
+       BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
+
        return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
 }
 
 static inline u32 kvm_async_pf_next_probe(u32 key)
 {
-       return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
+       return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
 }
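
A quick worked example of the probe step above, assuming ASYNC_PF_PER_VCPU is 64 (a power of two, as the BUILD_BUG_ON now enforces): masking replaces the old roundup/modulo and wraps the key around the table.

    u32 key = 63;

    key = (key + 1) & (ASYNC_PF_PER_VCPU - 1);  /* 64 & 63 == 0, i.e. wraps to slot 0 */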
 
 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -10307,7 +10360,7 @@ static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
        int i;
        u32 key = kvm_async_pf_hash_fn(gfn);
 
-       for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
+       for (i = 0; i < ASYNC_PF_PER_VCPU &&
                     (vcpu->arch.apf.gfns[key] != gfn &&
                      vcpu->arch.apf.gfns[key] != ~0); i++)
                key = kvm_async_pf_next_probe(key);
@@ -10325,6 +10378,10 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
        u32 i, j, k;
 
        i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+
+       if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
+               return;
+
        while (true) {
                vcpu->arch.apf.gfns[i] = ~0;
                do {
@@ -10384,7 +10441,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
         * If interrupts are off we cannot even use an artificial
         * halt state.
         */
-       return kvm_x86_ops.interrupt_allowed(vcpu);
+       return kvm_arch_interrupt_allowed(vcpu);
 }
 
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
index b968acc..6eb62e9 100644 (file)
@@ -125,6 +125,12 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
        return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
 }
 
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+       ++vcpu->stat.tlb_flush;
+       kvm_x86_ops.tlb_flush_current(vcpu);
+}
+
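
For comparison with the helper above, a sketch of the "all contexts" variant used by vcpu_enter_guest(); this assumes it mirrors kvm_vcpu_flush_tlb_current() and that the backend hook is named tlb_flush_all (the actual definition lives in x86.c and is not part of this hunk):

    static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
    {
            ++vcpu->stat.tlb_flush;
            kvm_x86_ops.tlb_flush_all(vcpu);
    }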
 static inline int is_pae(struct kvm_vcpu *vcpu)
 {
        return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -268,7 +274,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
 bool kvm_vector_hashing_enabled(void);
 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                            int emulation_type, void *insn, int insn_len);
-enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
+fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
 
 extern u64 host_xcr0;
 extern u64 supported_xcr0;
@@ -358,5 +364,6 @@ static inline bool kvm_dr7_valid(u64 data)
 void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
 u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu);
+bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu);
 
 #endif
index 01276e3..1616846 100644 (file)
@@ -23,7 +23,7 @@
 #include <linux/irqflags.h>
 #include <linux/context_tracking.h>
 #include <linux/irqbypass.h>
-#include <linux/swait.h>
+#include <linux/rcuwait.h>
 #include <linux/refcount.h>
 #include <linux/nospec.h>
 #include <asm/signal.h>
@@ -277,7 +277,7 @@ struct kvm_vcpu {
        struct mutex mutex;
        struct kvm_run *run;
 
-       struct swait_queue_head wq;
+       struct rcuwait wait;
        struct pid __rcu *pid;
        int sigset_active;
        sigset_t sigset;
@@ -503,6 +503,7 @@ struct kvm {
        struct srcu_struct srcu;
        struct srcu_struct irq_srcu;
        pid_t userspace_pid;
+       unsigned int max_halt_poll_ns;
 };
 
 #define kvm_err(fmt, ...) \
@@ -813,8 +814,11 @@ void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 
 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
+                                struct kvm_vcpu *except,
                                 unsigned long *vcpu_bitmap, cpumask_var_t tmp);
 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
+bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
+                                     struct kvm_vcpu *except);
 bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req,
                                unsigned long *vcpu_bitmap);
 
@@ -866,7 +870,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state);
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                                        struct kvm_guest_debug *dbg);
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu);
 
 int kvm_arch_init(void *opaque);
 void kvm_arch_exit(void);
@@ -956,12 +960,12 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
 }
 #endif
 
-static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
+static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu)
 {
 #ifdef __KVM_HAVE_ARCH_WQP
-       return vcpu->arch.wqp;
+       return vcpu->arch.waitp;
 #else
-       return &vcpu->wq;
+       return &vcpu->wait;
 #endif
 }
 
@@ -1130,6 +1134,11 @@ struct kvm_stats_debugfs_item {
 #define KVM_DBGFS_GET_MODE(dbgfs_item)                                         \
        ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
 
+#define VM_STAT(n, x, ...)                                                     \
+       { n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ }
+#define VCPU_STAT(n, x, ...)                                                   \
+       { n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ }
+
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
 
@@ -1352,6 +1361,12 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
 }
 #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
 
+static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot)
+{
+       return (memslot && memslot->id < KVM_USER_MEM_SLOTS &&
+               !(memslot->flags & KVM_MEMSLOT_INVALID));
+}
+
 struct kvm_vcpu *kvm_get_running_vcpu(void);
 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
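
A likely caller of the kvm_is_visible_memslot() helper added above, shown as a sketch (the assumption is that the gfn-based visibility check in virt/kvm/kvm_main.c is rewritten in these terms):

    bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
    {
            struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

            return kvm_is_visible_memslot(memslot);
    }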
 
index 2ffe1ee..61c56cc 100644 (file)
@@ -25,16 +25,38 @@ static inline void rcuwait_init(struct rcuwait *w)
        w->task = NULL;
 }
 
-extern void rcuwait_wake_up(struct rcuwait *w);
+/*
+ * Note: this provides no serialization and, just as with waitqueues,
+ * requires care in assessing whether or not the wait is active.
+ */
+static inline int rcuwait_active(struct rcuwait *w)
+{
+       return !!rcu_access_pointer(w->task);
+}
+
+extern int rcuwait_wake_up(struct rcuwait *w);
 
 /*
  * The caller is responsible for locking around rcuwait_wait_event(),
- * such that writes to @task are properly serialized.
+ * and [prepare_to/finish]_rcuwait() such that writes to @task are
+ * properly serialized.
  */
+
+static inline void prepare_to_rcuwait(struct rcuwait *w)
+{
+       rcu_assign_pointer(w->task, current);
+}
+
+static inline void finish_rcuwait(struct rcuwait *w)
+{
+       rcu_assign_pointer(w->task, NULL);
+       __set_current_state(TASK_RUNNING);
+}
+
 #define rcuwait_wait_event(w, condition, state)                                \
 ({                                                                     \
        int __ret = 0;                                                  \
-       rcu_assign_pointer((w)->task, current);                         \
+       prepare_to_rcuwait(w);                                          \
        for (;;) {                                                      \
                /*                                                      \
                 * Implicit barrier (A) pairs with (B) in               \
@@ -51,9 +73,7 @@ extern void rcuwait_wake_up(struct rcuwait *w);
                                                                        \
                schedule();                                             \
        }                                                               \
-                                                                       \
-       WRITE_ONCE((w)->task, NULL);                                    \
-       __set_current_state(TASK_RUNNING);                              \
+       finish_rcuwait(w);                                              \
        __ret;                                                          \
 })
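
Why rcuwait_wake_up() now returns int (see the kernel/exit.c hunk below): a caller can tell whether a sleeper was actually woken. The statistics bump here is only an example of such accounting, not code from this series:

    if (rcuwait_wake_up(&vcpu->wait))
            ++vcpu->stat.halt_wakeup;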
 
index 428c7dd..ac9eba0 100644 (file)
@@ -1017,6 +1017,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_VCPU_RESETS 179
 #define KVM_CAP_S390_PROTECTED 180
 #define KVM_CAP_PPC_SECURE_GUEST 181
+#define KVM_CAP_HALT_POLL 182
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index ce2a75b..9f4beff 100644 (file)
@@ -228,8 +228,9 @@ repeat:
                goto repeat;
 }
 
-void rcuwait_wake_up(struct rcuwait *w)
+int rcuwait_wake_up(struct rcuwait *w)
 {
+       int ret = 0;
        struct task_struct *task;
 
        rcu_read_lock();
@@ -237,7 +238,7 @@ void rcuwait_wake_up(struct rcuwait *w)
        /*
         * Order condition vs @task, such that everything prior to the load
         * of @task is visible. This is the condition as to why the user called
-        * rcuwait_trywake() in the first place. Pairs with set_current_state()
+        * rcuwait_wake_up() in the first place. Pairs with set_current_state()
         * barrier (A) in rcuwait_wait_event().
         *
         *    WAIT                WAKE
@@ -249,8 +250,10 @@ void rcuwait_wake_up(struct rcuwait *w)
 
        task = rcu_dereference(w->task);
        if (task)
-               wake_up_process(task);
+               ret = wake_up_process(task);
        rcu_read_unlock();
+
+       return ret;
 }
 EXPORT_SYMBOL_GPL(rcuwait_wake_up);
 
index e83fc8e..d199a36 100755 (executable)
@@ -32,6 +32,7 @@ import resource
 import struct
 import re
 import subprocess
+import signal
 from collections import defaultdict, namedtuple
 from functools import reduce
 from datetime import datetime
@@ -228,6 +229,8 @@ IOCTL_NUMBERS = {
     'RESET':       0x00002403,
 }
 
+signal_received = False
+
 ENCODING = locale.getpreferredencoding(False)
 TRACE_FILTER = re.compile(r'^[^\(]*$')
 
@@ -1500,8 +1503,7 @@ class StdFormat(object):
     def get_banner(self):
         return self._banner
 
-    @staticmethod
-    def get_statline(keys, s):
+    def get_statline(self, keys, s):
         res = ''
         for key in keys:
             res += ' %9d' % s[key].delta
@@ -1517,27 +1519,71 @@ class CSVFormat(object):
     def get_banner(self):
         return self._banner
 
-    @staticmethod
-    def get_statline(keys, s):
+    def get_statline(self, keys, s):
         return reduce(lambda res, key: "{},{!s}".format(res, s[key].delta),
                       keys, '')
 
 
 def log(stats, opts, frmt, keys):
     """Prints statistics as reiterating key block, multiple value blocks."""
+    global signal_received
     line = 0
     banner_repeat = 20
+    f = None
+
+    def do_banner(opts):
+        nonlocal f
+        if opts.log_to_file:
+            if not f:
+                try:
+                    f = open(opts.log_to_file, 'a')
+                except (IOError, OSError):
+                    sys.exit("Error: Could not open file: %s" %
+                             opts.log_to_file)
+                if isinstance(frmt, CSVFormat) and f.tell() != 0:
+                    return
+        print(frmt.get_banner(), file=f or sys.stdout)
+
+    def do_statline(opts, values):
+        statline = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + \
+                   frmt.get_statline(keys, values)
+        print(statline, file=f or sys.stdout)
+
+    do_banner(opts)
+    banner_printed = True
     while True:
         try:
             time.sleep(opts.set_delay)
-            if line % banner_repeat == 0:
-                print(frmt.get_banner())
-            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
-                  frmt.get_statline(keys, stats.get()))
-            line += 1
+            if signal_received:
+                banner_printed = True
+                line = 0
+                f.close()
+                do_banner(opts)
+                signal_received = False
+            if (line % banner_repeat == 0 and not banner_printed and
+                not (opts.log_to_file and isinstance(frmt, CSVFormat))):
+                do_banner(opts)
+                banner_printed = True
+            values = stats.get()
+            if (not opts.skip_zero_records or
+                any(values[k].delta != 0 for k in keys)):
+                do_statline(opts, values)
+                line += 1
+                banner_printed = False
         except KeyboardInterrupt:
             break
 
+    if opts.log_to_file:
+        f.close()
+
+
+def handle_signal(sig, frame):
+    global signal_received
+
+    signal_received = True
+
+    return
+
 
 def is_delay_valid(delay):
     """Verify delay is in valid value range."""
@@ -1610,7 +1656,7 @@ Press any other key to refresh statistics immediately.
     argparser.add_argument('-c', '--csv',
                            action='store_true',
                            default=False,
-                           help='log in csv format - requires option -l/--log',
+                           help='log in csv format - requires option -l/-L',
                            )
     argparser.add_argument('-d', '--debugfs',
                            action='store_true',
@@ -1638,6 +1684,11 @@ Press any other key to refresh statistics immediately.
                            default=False,
                            help='run in logging mode (like vmstat)',
                            )
+    argparser.add_argument('-L', '--log-to-file',
+                           type=str,
+                           metavar='FILE',
+                           help="like '--log', but logging to a file"
+                           )
     argparser.add_argument('-p', '--pid',
                            type=int,
                            default=0,
@@ -1655,9 +1706,16 @@ Press any other key to refresh statistics immediately.
                            default=False,
                            help='retrieve statistics from tracepoints',
                            )
+    argparser.add_argument('-z', '--skip-zero-records',
+                           action='store_true',
+                           default=False,
+                           help='omit records with all zeros in logging mode',
+                           )
     options = argparser.parse_args()
-    if options.csv and not options.log:
+    if options.csv and not (options.log or options.log_to_file):
         sys.exit('Error: Option -c/--csv requires -l/--log')
+    if options.skip_zero_records and not (options.log or options.log_to_file):
+        sys.exit('Error: Option -z/--skip-zero-records requires -l/-L')
     try:
         # verify that we were passed a valid regex up front
         re.compile(options.fields)
@@ -1737,7 +1795,9 @@ def main():
         sys.stdout.write('  ' + '\n  '.join(sorted(set(event_list))) + '\n')
         sys.exit(0)
 
-    if options.log:
+    if options.log or options.log_to_file:
+        if options.log_to_file:
+            signal.signal(signal.SIGHUP, handle_signal)
         keys = sorted(stats.get().keys())
         if options.csv:
             frmt = CSVFormat(keys)
diff --git a/tools/kvm/kvm_stat/kvm_stat.service b/tools/kvm/kvm_stat/kvm_stat.service
new file mode 100644 (file)
index 0000000..71aabaf
--- /dev/null
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+[Unit]
+Description=Service that logs KVM kernel module trace events
+Before=qemu-kvm.service
+
+[Service]
+Type=simple
+ExecStart=/usr/bin/kvm_stat -dtcz -s 10 -L /var/log/kvm_stat.csv
+ExecReload=/bin/kill -HUP $MAINPID
+Restart=always
+SyslogIdentifier=kvm_stat
+SyslogLevel=debug
+
+[Install]
+WantedBy=multi-user.target
index a97ded2..feaf464 100644 (file)
@@ -65,8 +65,10 @@ OPTIONS
        run in batch mode for one second
 
 -c::
---csv=<file>::
-        log in csv format - requires option -l/--log
+--csv::
+        log in csv format. Requires option -l/--log or -L/--log-to-file.
+        When used with option -L/--log-to-file, the header is only ever
+        written at the start of the file, to preserve the csv format.
 
 -d::
 --debugfs::
@@ -92,6 +94,11 @@ OPTIONS
 --log::
         run in logging mode (like vmstat)
 
+
+-L<file>::
+--log-to-file=<file>::
+        like -l/--log, but logging to a file. Appends to existing files.
+
 -p<pid>::
 --pid=<pid>::
        limit statistics to one virtual machine (pid)
@@ -104,6 +111,10 @@ OPTIONS
 --tracepoints::
         retrieve statistics from tracepoints
 
-z::
+--skip-zero-records::
+        omit records with all zeros in logging mode
+
 SEE ALSO
 --------
 'perf'(1), 'trace-cmd'(1)
index a9b2b48..222e501 100644 (file)
@@ -7,7 +7,6 @@
 /x86_64/hyperv_cpuid
 /x86_64/mmio_warning_test
 /x86_64/platform_info_test
-/x86_64/set_memory_region_test
 /x86_64/set_sregs_test
 /x86_64/smm_test
 /x86_64/state_test
@@ -22,4 +21,5 @@
 /demand_paging_test
 /dirty_log_test
 /kvm_create_max_vcpus
+/set_memory_region_test
 /steal_time
index b728c0a..29200b6 100644 (file)
@@ -43,7 +43,6 @@ TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
 TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
-TEST_GEN_PROGS_x86_64 += x86_64/set_memory_region_test
 TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
 TEST_GEN_PROGS_x86_64 += x86_64/smm_test
 TEST_GEN_PROGS_x86_64 += x86_64/state_test
@@ -54,16 +53,19 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
 TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
+TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
 TEST_GEN_PROGS_x86_64 += clear_dirty_log_test
 TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
 
 TEST_GEN_PROGS_aarch64 += clear_dirty_log_test
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
 
 TEST_GEN_PROGS_s390x = s390x/memop
@@ -72,6 +74,7 @@ TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
+TEST_GEN_PROGS_s390x += set_memory_region_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
 LIBKVM += $(LIBKVM_$(UNAME_M))
index a99b875..e244c6e 100644 (file)
@@ -10,6 +10,7 @@
 #include "test_util.h"
 
 #include "asm/kvm.h"
+#include "linux/list.h"
 #include "linux/kvm.h"
 #include <sys/ioctl.h>
 
@@ -113,6 +114,7 @@ int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
 void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
 void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
 void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
+void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
 void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid);
 vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
                          uint32_t data_memslot, uint32_t pgd_memslot);
@@ -143,6 +145,8 @@ struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
 int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid,
+                         struct kvm_guest_debug *debug);
 void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
                       struct kvm_mp_state *mp_state);
 void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
@@ -254,6 +258,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm);
 unsigned int vm_get_page_size(struct kvm_vm *vm);
 unsigned int vm_get_page_shift(struct kvm_vm *vm);
 unsigned int vm_get_max_gfn(struct kvm_vm *vm);
+int vm_get_fd(struct kvm_vm *vm);
 
 unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
 unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages);
@@ -311,11 +316,26 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);
 
 #define GUEST_SYNC(stage)      ucall(UCALL_SYNC, 2, "hello", stage)
 #define GUEST_DONE()           ucall(UCALL_DONE, 0)
-#define GUEST_ASSERT(_condition) do {                  \
-       if (!(_condition))                              \
-               ucall(UCALL_ABORT, 2,                 \
-                       "Failed guest assert: "         \
-                       #_condition, __LINE__);         \
+#define __GUEST_ASSERT(_condition, _nargs, _args...) do {      \
+       if (!(_condition))                                      \
+               ucall(UCALL_ABORT, 2 + _nargs,                  \
+                       "Failed guest assert: "                 \
+                       #_condition, __LINE__, _args);          \
 } while (0)
 
+#define GUEST_ASSERT(_condition) \
+       __GUEST_ASSERT((_condition), 0, 0)
+
+#define GUEST_ASSERT_1(_condition, arg1) \
+       __GUEST_ASSERT((_condition), 1, (arg1))
+
+#define GUEST_ASSERT_2(_condition, arg1, arg2) \
+       __GUEST_ASSERT((_condition), 2, (arg1), (arg2))
+
+#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \
+       __GUEST_ASSERT((_condition), 3, (arg1), (arg2), (arg3))
+
+#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \
+       __GUEST_ASSERT((_condition), 4, (arg1), (arg2), (arg3), (arg4))
+
 #endif /* SELFTEST_KVM_UTIL_H */
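
Usage sketch for the new GUEST_ASSERT_1..4 variants: the extra arguments ride along with the ucall and can be reported by the host when the assert fires. MEM_GPA below is a hypothetical guest address used only for illustration.

    static void guest_code(void)
    {
            uint64_t val = READ_ONCE(*(uint64_t *)MEM_GPA);  /* MEM_GPA: hypothetical */

            GUEST_ASSERT_1(val == 1, val);
            GUEST_DONE();
    }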
index 8a3523d..c9cede5 100644 (file)
@@ -161,6 +161,9 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
        vm = calloc(1, sizeof(*vm));
        TEST_ASSERT(vm != NULL, "Insufficient Memory");
 
+       INIT_LIST_HEAD(&vm->vcpus);
+       INIT_LIST_HEAD(&vm->userspace_mem_regions);
+
        vm->mode = mode;
        vm->type = 0;
 
@@ -258,8 +261,7 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm)
        if (vmp->has_irqchip)
                vm_create_irqchip(vmp);
 
-       for (region = vmp->userspace_mem_region_head; region;
-               region = region->next) {
+       list_for_each_entry(region, &vmp->userspace_mem_regions, list) {
                int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
                TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
                            "  rc: %i errno: %i\n"
@@ -319,8 +321,7 @@ userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
 {
        struct userspace_mem_region *region;
 
-       for (region = vm->userspace_mem_region_head; region;
-               region = region->next) {
+       list_for_each_entry(region, &vm->userspace_mem_regions, list) {
                uint64_t existing_start = region->region.guest_phys_addr;
                uint64_t existing_end = region->region.guest_phys_addr
                        + region->region.memory_size - 1;
@@ -378,11 +379,11 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
  */
 struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid)
 {
-       struct vcpu *vcpup;
+       struct vcpu *vcpu;
 
-       for (vcpup = vm->vcpu_head; vcpup; vcpup = vcpup->next) {
-               if (vcpup->id == vcpuid)
-                       return vcpup;
+       list_for_each_entry(vcpu, &vm->vcpus, list) {
+               if (vcpu->id == vcpuid)
+                       return vcpu;
        }
 
        return NULL;
@@ -392,18 +393,16 @@ struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid)
  * VM VCPU Remove
  *
  * Input Args:
- *   vm - Virtual Machine
- *   vcpuid - VCPU ID
+ *   vcpu - VCPU to remove
  *
  * Output Args: None
  *
  * Return: None, TEST_ASSERT failures for all error conditions
  *
- * Within the VM specified by vm, removes the VCPU given by vcpuid.
+ * Removes a vCPU from a VM and frees its resources.
  */
-static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid)
+static void vm_vcpu_rm(struct vcpu *vcpu)
 {
-       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
        int ret;
 
        ret = munmap(vcpu->state, sizeof(*vcpu->state));
@@ -413,21 +412,17 @@ static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid)
        TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i "
                "errno: %i", ret, errno);
 
-       if (vcpu->next)
-               vcpu->next->prev = vcpu->prev;
-       if (vcpu->prev)
-               vcpu->prev->next = vcpu->next;
-       else
-               vm->vcpu_head = vcpu->next;
+       list_del(&vcpu->list);
        free(vcpu);
 }
 
 void kvm_vm_release(struct kvm_vm *vmp)
 {
+       struct vcpu *vcpu, *tmp;
        int ret;
 
-       while (vmp->vcpu_head)
-               vm_vcpu_rm(vmp, vmp->vcpu_head->id);
+       list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list)
+               vm_vcpu_rm(vcpu);
 
        ret = close(vmp->fd);
        TEST_ASSERT(ret == 0, "Close of vm fd failed,\n"
@@ -438,35 +433,38 @@ void kvm_vm_release(struct kvm_vm *vmp)
                "  vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno);
 }
 
+static void __vm_mem_region_delete(struct kvm_vm *vm,
+                                  struct userspace_mem_region *region)
+{
+       int ret;
+
+       list_del(&region->list);
+
+       region->region.memory_size = 0;
+       ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+       TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, "
+                   "rc: %i errno: %i", ret, errno);
+
+       sparsebit_free(&region->unused_phy_pages);
+       ret = munmap(region->mmap_start, region->mmap_size);
+       TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", ret, errno);
+
+       free(region);
+}
+
 /*
  * Destroys and frees the VM pointed to by vmp.
  */
 void kvm_vm_free(struct kvm_vm *vmp)
 {
-       int ret;
+       struct userspace_mem_region *region, *tmp;
 
        if (vmp == NULL)
                return;
 
        /* Free userspace_mem_regions. */
-       while (vmp->userspace_mem_region_head) {
-               struct userspace_mem_region *region
-                       = vmp->userspace_mem_region_head;
-
-               region->region.memory_size = 0;
-               ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION,
-                       &region->region);
-               TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, "
-                       "rc: %i errno: %i", ret, errno);
-
-               vmp->userspace_mem_region_head = region->next;
-               sparsebit_free(&region->unused_phy_pages);
-               ret = munmap(region->mmap_start, region->mmap_size);
-               TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i",
-                           ret, errno);
-
-               free(region);
-       }
+       list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list)
+               __vm_mem_region_delete(vmp, region);
 
        /* Free sparsebit arrays. */
        sparsebit_free(&vmp->vpages_valid);
@@ -612,12 +610,10 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
                        (uint64_t) region->region.memory_size);
 
        /* Confirm no region with the requested slot already exists. */
-       for (region = vm->userspace_mem_region_head; region;
-               region = region->next) {
-               if (region->region.slot == slot)
-                       break;
-       }
-       if (region != NULL)
+       list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+               if (region->region.slot != slot)
+                       continue;
+
                TEST_FAIL("A mem region with the requested slot "
                        "already exists.\n"
                        "  requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
@@ -626,6 +622,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
                        region->region.slot,
                        (uint64_t) region->region.guest_phys_addr,
                        (uint64_t) region->region.memory_size);
+       }
 
        /* Allocate and initialize new mem region structure. */
        region = calloc(1, sizeof(*region));
@@ -686,10 +683,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
                guest_paddr, (uint64_t) region->region.memory_size);
 
        /* Add to linked-list of memory regions. */
-       if (vm->userspace_mem_region_head)
-               vm->userspace_mem_region_head->prev = region;
-       region->next = vm->userspace_mem_region_head;
-       vm->userspace_mem_region_head = region;
+       list_add(&region->list, &vm->userspace_mem_regions);
 }
 
 /*
@@ -712,20 +706,17 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot)
 {
        struct userspace_mem_region *region;
 
-       for (region = vm->userspace_mem_region_head; region;
-               region = region->next) {
+       list_for_each_entry(region, &vm->userspace_mem_regions, list) {
                if (region->region.slot == memslot)
-                       break;
-       }
-       if (region == NULL) {
-               fprintf(stderr, "No mem region with the requested slot found,\n"
-                       "  requested slot: %u\n", memslot);
-               fputs("---- vm dump ----\n", stderr);
-               vm_dump(stderr, vm, 2);
-               TEST_FAIL("Mem region not found");
+                       return region;
        }
 
-       return region;
+       fprintf(stderr, "No mem region with the requested slot found,\n"
+               "  requested slot: %u\n", memslot);
+       fputs("---- vm dump ----\n", stderr);
+       vm_dump(stderr, vm, 2);
+       TEST_FAIL("Mem region not found");
+       return NULL;
 }
 
 /*
@@ -788,6 +779,24 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
                    ret, errno, slot, new_gpa);
 }
 
+/*
+ * VM Memory Region Delete
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   slot - Slot of the memory region to delete
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Delete a memory region.
+ */
+void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
+{
+       __vm_mem_region_delete(vm, memslot2region(vm, slot));
+}
+
 /*
  * VCPU mmap Size
  *
@@ -863,10 +872,7 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid)
                "vcpu id: %u errno: %i", vcpuid, errno);
 
        /* Add to linked-list of VCPUs. */
-       if (vm->vcpu_head)
-               vm->vcpu_head->prev = vcpu;
-       vcpu->next = vm->vcpu_head;
-       vm->vcpu_head = vcpu;
+       list_add(&vcpu->list, &vm->vcpus);
 }
 
 /*
@@ -1059,8 +1065,8 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
 {
        struct userspace_mem_region *region;
-       for (region = vm->userspace_mem_region_head; region;
-            region = region->next) {
+
+       list_for_each_entry(region, &vm->userspace_mem_regions, list) {
                if ((gpa >= region->region.guest_phys_addr)
                        && (gpa <= (region->region.guest_phys_addr
                                + region->region.memory_size - 1)))
@@ -1092,8 +1098,8 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
 vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
 {
        struct userspace_mem_region *region;
-       for (region = vm->userspace_mem_region_head; region;
-            region = region->next) {
+
+       list_for_each_entry(region, &vm->userspace_mem_regions, list) {
                if ((hva >= region->host_mem)
                        && (hva <= (region->host_mem
                                + region->region.memory_size - 1)))
@@ -1201,6 +1207,15 @@ void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid)
                    ret, errno);
 }
 
+void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid,
+                         struct kvm_guest_debug *debug)
+{
+       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+       int ret = ioctl(vcpu->fd, KVM_SET_GUEST_DEBUG, debug);
+
+       TEST_ASSERT(ret == 0, "KVM_SET_GUEST_DEBUG failed: %d", ret);
+}
+
 /*
  * VM VCPU Set MP State
  *
@@ -1520,8 +1535,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
        fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
        fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
        fprintf(stream, "%*sMem Regions:\n", indent, "");
-       for (region = vm->userspace_mem_region_head; region;
-               region = region->next) {
+       list_for_each_entry(region, &vm->userspace_mem_regions, list) {
                fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
                        "host_virt: %p\n", indent + 2, "",
                        (uint64_t) region->region.guest_phys_addr,
@@ -1540,7 +1554,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
                virt_dump(stream, vm, indent + 4);
        }
        fprintf(stream, "%*sVCPUs:\n", indent, "");
-       for (vcpu = vm->vcpu_head; vcpu; vcpu = vcpu->next)
+       list_for_each_entry(vcpu, &vm->vcpus, list)
                vcpu_dump(stream, vm, vcpu->id, indent + 2);
 }
 
@@ -1734,6 +1748,11 @@ unsigned int vm_get_max_gfn(struct kvm_vm *vm)
        return vm->max_gfn;
 }
 
+int vm_get_fd(struct kvm_vm *vm)
+{
+       return vm->fd;
+}
+
 static unsigned int vm_calc_num_pages(unsigned int num_pages,
                                      unsigned int page_shift,
                                      unsigned int new_page_shift,
index ca56a01..2ef4465 100644 (file)
@@ -13,7 +13,6 @@
 #define KVM_DEV_PATH           "/dev/kvm"
 
 struct userspace_mem_region {
-       struct userspace_mem_region *next, *prev;
        struct kvm_userspace_memory_region region;
        struct sparsebit *unused_phy_pages;
        int fd;
@@ -21,10 +20,11 @@ struct userspace_mem_region {
        void *host_mem;
        void *mmap_start;
        size_t mmap_size;
+       struct list_head list;
 };
 
 struct vcpu {
-       struct vcpu *next, *prev;
+       struct list_head list;
        uint32_t id;
        int fd;
        struct kvm_run *state;
@@ -41,8 +41,8 @@ struct kvm_vm {
        unsigned int pa_bits;
        unsigned int va_bits;
        uint64_t max_gfn;
-       struct vcpu *vcpu_head;
-       struct userspace_mem_region *userspace_mem_region_head;
+       struct list_head vcpus;
+       struct list_head userspace_mem_regions;
        struct sparsebit *vpages_valid;
        struct sparsebit *vpages_mapped;
        bool has_irqchip;
index 8d94961..a88c5d6 100644 (file)
@@ -233,7 +233,10 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
 
 void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
 {
-       struct vcpu *vcpu = vm->vcpu_head;
+       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+       if (!vcpu)
+               return;
 
        fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n",
                indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr);
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
new file mode 100644 (file)
index 0000000..b3ece55
--- /dev/null
@@ -0,0 +1,408 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/compiler.h>
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+#define VCPU_ID 0
+
+/*
+ * s390x needs at least 1MB alignment, and the x86_64 MOVE/DELETE tests need a
+ * 2MB sized and aligned region so that the initial region corresponds to
+ * exactly one large page.
+ */
+#define MEM_REGION_SIZE                0x200000
+
+#ifdef __x86_64__
+/*
+ * Somewhat arbitrary location and slot, intended to not overlap anything.
+ */
+#define MEM_REGION_GPA         0xc0000000
+#define MEM_REGION_SLOT                10
+
+static const uint64_t MMIO_VAL = 0xbeefull;
+
+extern const uint64_t final_rip_start;
+extern const uint64_t final_rip_end;
+
+static sem_t vcpu_ready;
+
+static inline uint64_t guest_spin_on_val(uint64_t spin_val)
+{
+       uint64_t val;
+
+       do {
+               val = READ_ONCE(*((uint64_t *)MEM_REGION_GPA));
+       } while (val == spin_val);
+
+       GUEST_SYNC(0);
+       return val;
+}
+
+static void *vcpu_worker(void *data)
+{
+       struct kvm_vm *vm = data;
+       struct kvm_run *run;
+       struct ucall uc;
+       uint64_t cmd;
+
+       /*
+        * Loop until the guest is done.  Re-enter the guest on all MMIO exits,
+        * which will occur if the guest attempts to access a memslot after it
+        * has been deleted or while it is being moved.
+        */
+       run = vcpu_state(vm, VCPU_ID);
+
+       while (1) {
+               vcpu_run(vm, VCPU_ID);
+
+               if (run->exit_reason == KVM_EXIT_IO) {
+                       cmd = get_ucall(vm, VCPU_ID, &uc);
+                       if (cmd != UCALL_SYNC)
+                               break;
+
+                       sem_post(&vcpu_ready);
+                       continue;
+               }
+
+               if (run->exit_reason != KVM_EXIT_MMIO)
+                       break;
+
+               TEST_ASSERT(!run->mmio.is_write, "Unexpected exit mmio write");
+               TEST_ASSERT(run->mmio.len == 8,
+                           "Unexpected exit mmio size = %u", run->mmio.len);
+
+               TEST_ASSERT(run->mmio.phys_addr == MEM_REGION_GPA,
+                           "Unexpected exit mmio address = 0x%llx",
+                           run->mmio.phys_addr);
+               memcpy(run->mmio.data, &MMIO_VAL, 8);
+       }
+
+       if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT)
+               TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0],
+                         __FILE__, uc.args[1], uc.args[2]);
+
+       return NULL;
+}
+
+static void wait_for_vcpu(void)
+{
+       struct timespec ts;
+
+       TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts),
+                   "clock_gettime() failed: %d\n", errno);
+
+       ts.tv_sec += 2;
+       TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts),
+                   "sem_timedwait() failed: %d\n", errno);
+
+       /* Wait for the vCPU thread to reenter the guest. */
+       usleep(100000);
+}
+
+static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code)
+{
+       struct kvm_vm *vm;
+       uint64_t *hva;
+       uint64_t gpa;
+
+       vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+       vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP,
+                                   MEM_REGION_GPA, MEM_REGION_SLOT,
+                                   MEM_REGION_SIZE / getpagesize(), 0);
+
+       /*
+        * Allocate and map two pages so that the GPA accessed by guest_code()
+        * stays valid across the memslot move.
+        */
+       gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT);
+       TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n");
+
+       virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0);
+
+       /* Ditto for the host mapping so that both pages can be zeroed. */
+       hva = addr_gpa2hva(vm, MEM_REGION_GPA);
+       memset(hva, 0, 2 * 4096);
+
+       pthread_create(vcpu_thread, NULL, vcpu_worker, vm);
+
+       /* Ensure the guest thread is spun up. */
+       wait_for_vcpu();
+
+       return vm;
+}
+
+
+static void guest_code_move_memory_region(void)
+{
+       uint64_t val;
+
+       GUEST_SYNC(0);
+
+       /*
+        * Spin until the memory region is moved to a misaligned address.  This
+        * may or may not trigger MMIO, as the window where the memslot is
+        * invalid is quite small.
+        */
+       val = guest_spin_on_val(0);
+       GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val);
+
+       /* Spin until the memory region is realigned. */
+       val = guest_spin_on_val(MMIO_VAL);
+       GUEST_ASSERT_1(val == 1, val);
+
+       GUEST_DONE();
+}
+
+static void test_move_memory_region(void)
+{
+       pthread_t vcpu_thread;
+       struct kvm_vm *vm;
+       uint64_t *hva;
+
+       vm = spawn_vm(&vcpu_thread, guest_code_move_memory_region);
+
+       hva = addr_gpa2hva(vm, MEM_REGION_GPA);
+
+       /*
+        * Shift the region's base GPA.  The guest should not see "2" as the
+        * hva->gpa translation is misaligned, i.e. the guest is accessing a
+        * different host pfn.
+        */
+       vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA - 4096);
+       WRITE_ONCE(*hva, 2);
+
+       /*
+        * The guest _might_ see an invalid memslot and trigger MMIO, but it's
+        * a tiny window.  Spin and defer the sync until the memslot is
+        * restored and guest behavior is once again deterministic.
+        */
+       usleep(100000);
+
+       /*
+        * Note, value in memory needs to be changed *before* restoring the
+        * memslot, else the guest could race the update and see "2".
+        */
+       WRITE_ONCE(*hva, 1);
+
+       /* Restore the original base, the guest should see "1". */
+       vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA);
+       wait_for_vcpu();
+       /* Deferred sync from when the memslot was misaligned (above). */
+       wait_for_vcpu();
+
+       pthread_join(vcpu_thread, NULL);
+
+       kvm_vm_free(vm);
+}
+
+static void guest_code_delete_memory_region(void)
+{
+       uint64_t val;
+
+       GUEST_SYNC(0);
+
+       /* Spin until the memory region is deleted. */
+       val = guest_spin_on_val(0);
+       GUEST_ASSERT_1(val == MMIO_VAL, val);
+
+       /* Spin until the memory region is recreated. */
+       val = guest_spin_on_val(MMIO_VAL);
+       GUEST_ASSERT_1(val == 0, val);
+
+       /* Spin until the memory region is deleted. */
+       val = guest_spin_on_val(0);
+       GUEST_ASSERT_1(val == MMIO_VAL, val);
+
+       asm("1:\n\t"
+           ".pushsection .rodata\n\t"
+           ".global final_rip_start\n\t"
+           "final_rip_start: .quad 1b\n\t"
+           ".popsection");
+
+       /* Spin indefinitely (until the code memslot is deleted). */
+       guest_spin_on_val(MMIO_VAL);
+
+       asm("1:\n\t"
+           ".pushsection .rodata\n\t"
+           ".global final_rip_end\n\t"
+           "final_rip_end: .quad 1b\n\t"
+           ".popsection");
+
+       GUEST_ASSERT_1(0, 0);
+}
+
+static void test_delete_memory_region(void)
+{
+       pthread_t vcpu_thread;
+       struct kvm_regs regs;
+       struct kvm_run *run;
+       struct kvm_vm *vm;
+
+       vm = spawn_vm(&vcpu_thread, guest_code_delete_memory_region);
+
+       /* Delete the memory region, the guest should not die. */
+       vm_mem_region_delete(vm, MEM_REGION_SLOT);
+       wait_for_vcpu();
+
+       /* Recreate the memory region.  The guest should see "0". */
+       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP,
+                                   MEM_REGION_GPA, MEM_REGION_SLOT,
+                                   MEM_REGION_SIZE / getpagesize(), 0);
+       wait_for_vcpu();
+
+       /* Delete the region again so that there's only one memslot left. */
+       vm_mem_region_delete(vm, MEM_REGION_SLOT);
+       wait_for_vcpu();
+
+       /*
+        * Delete the primary memslot.  This should cause an emulation error or
+        * shutdown due to the page tables getting nuked.
+        */
+       vm_mem_region_delete(vm, 0);
+
+       pthread_join(vcpu_thread, NULL);
+
+       run = vcpu_state(vm, VCPU_ID);
+
+       TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN ||
+                   run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
+                   "Unexpected exit reason = %d", run->exit_reason);
+
+       vcpu_regs_get(vm, VCPU_ID, &regs);
+
+       /*
+        * On AMD, after KVM_EXIT_SHUTDOWN the VMCB has been reinitialized already,
+        * so the instruction pointer would point to the reset vector.
+        */
+       if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR)
+               TEST_ASSERT(regs.rip >= final_rip_start &&
+                           regs.rip < final_rip_end,
+                           "Bad rip, expected 0x%lx - 0x%lx, got 0x%llx\n",
+                           final_rip_start, final_rip_end, regs.rip);
+
+       kvm_vm_free(vm);
+}
+
+static void test_zero_memory_regions(void)
+{
+       struct kvm_run *run;
+       struct kvm_vm *vm;
+
+       pr_info("Testing KVM_RUN with zero added memory regions\n");
+
+       vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
+       vm_vcpu_add(vm, VCPU_ID);
+
+       TEST_ASSERT(!ioctl(vm_get_fd(vm), KVM_SET_NR_MMU_PAGES, 64),
+                   "KVM_SET_NR_MMU_PAGES failed, errno = %d\n", errno);
+       vcpu_run(vm, VCPU_ID);
+
+       run = vcpu_state(vm, VCPU_ID);
+       TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
+                   "Unexpected exit_reason = %u\n", run->exit_reason);
+
+       kvm_vm_free(vm);
+}
+#endif /* __x86_64__ */
+
+/*
+ * Test that memory slots can be added up to KVM_CAP_NR_MEMSLOTS, and that
+ * any attempt to add further slots fails.
+ */
+static void test_add_max_memory_regions(void)
+{
+       int ret;
+       struct kvm_vm *vm;
+       uint32_t max_mem_slots;
+       uint32_t slot;
+       uint64_t guest_addr = 0x0;
+       uint64_t mem_reg_npages;
+       void *mem;
+
+       max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
+       TEST_ASSERT(max_mem_slots > 0,
+                   "KVM_CAP_NR_MEMSLOTS should be greater than 0");
+       pr_info("Allowed number of memory slots: %i\n", max_mem_slots);
+
+       vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
+
+       mem_reg_npages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, MEM_REGION_SIZE);
+
+       /* Check that memory slots can be added up to the maximum allowed */
+       pr_info("Adding slots 0..%i, each backed by a %dK memory region\n",
+               (max_mem_slots - 1), MEM_REGION_SIZE >> 10);
+       for (slot = 0; slot < max_mem_slots; slot++) {
+               vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+                                           guest_addr, slot, mem_reg_npages,
+                                           0);
+               guest_addr += MEM_REGION_SIZE;
+       }
+
+       /* Check that adding one more memory slot beyond the limit fails */
+       mem = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE,
+                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host");
+
+       ret = ioctl(vm_get_fd(vm), KVM_SET_USER_MEMORY_REGION,
+                   &(struct kvm_userspace_memory_region) {slot, 0, guest_addr,
+                   MEM_REGION_SIZE, (uint64_t) mem});
+       TEST_ASSERT(ret == -1 && errno == EINVAL,
+                   "Adding one more memory slot should fail with EINVAL");
+
+       munmap(mem, MEM_REGION_SIZE);
+       kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+#ifdef __x86_64__
+       int i, loops;
+#endif
+
+       /* Tell stdout not to buffer its content */
+       setbuf(stdout, NULL);
+
+#ifdef __x86_64__
+       /*
+        * FIXME: the zero-memslot test fails on aarch64 and s390x because
+        * KVM_RUN fails with ENOEXEC or EFAULT.
+        */
+       test_zero_memory_regions();
+#endif
+
+       test_add_max_memory_regions();
+
+#ifdef __x86_64__
+       if (argc > 1)
+               loops = atoi(argv[1]);
+       else
+               loops = 10;
+
+       pr_info("Testing MOVE of in-use region, %d loops\n", loops);
+       for (i = 0; i < loops; i++)
+               test_move_memory_region();
+
+       pr_info("Testing DELETE of in-use region, %d loops\n", loops);
+       for (i = 0; i < loops; i++)
+               test_delete_memory_region();
+#endif
+
+       return 0;
+}
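
A note on the library calls used throughout this new test: vm_userspace_mem_region_add(), vm_mem_region_move() and vm_mem_region_delete() are selftest wrappers that, at the ABI level, boil down to KVM_SET_USER_MEMORY_REGION ioctls on the VM file descriptor. A memory_size of 0 deletes the slot, and re-issuing the ioctl for an existing slot with a new guest_phys_addr moves it. A minimal sketch of the raw calls, assuming vm_fd is an already-created VM fd (the function names below are illustrative and not part of the selftest library):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Delete a memslot: a zero memory_size removes the slot entirely. */
static int raw_delete_slot(int vm_fd, uint32_t slot)
{
	struct kvm_userspace_memory_region region;

	memset(&region, 0, sizeof(region));
	region.slot = slot;
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}

/* Move a memslot: same slot and host memory, new guest physical base. */
static int raw_move_slot(int vm_fd, uint32_t slot, uint64_t new_gpa,
			 uint64_t size, void *host_mem)
{
	struct kvm_userspace_memory_region region = {
		.slot = slot,
		.guest_phys_addr = new_gpa,
		.memory_size = size,
		.userspace_addr = (uint64_t)host_mem,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
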
diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c
new file mode 100644 (file)
index 0000000..8162c58
--- /dev/null
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM guest debug register tests
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+#define DR6_BD         (1 << 13)
+#define DR7_GD         (1 << 13)
+
+/* For testing data access debug BP */
+uint32_t guest_value;
+
+extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start;
+
+static void guest_code(void)
+{
+       /*
+        * Software BP tests.
+        *
+        * NOTE: the sw_bp label needs to be placed before the int3, because
+        * int3 is an exception rather than a normal trap for
+        * KVM_SET_GUEST_DEBUG (we capture it using the vcpu exception bitmap).
+        */
+       asm volatile("sw_bp: int3");
+
+       /* Hardware instruction BP test */
+       asm volatile("hw_bp: nop");
+
+       /* Hardware data BP test */
+       asm volatile("mov $1234,%%rax;\n\t"
+                    "mov %%rax,%0;\n\t write_data:"
+                    : "=m" (guest_value) : : "rax");
+
+       /* Single step test, covers 2 basic instructions and 2 emulated */
+       asm volatile("ss_start: "
+                    "xor %%rax,%%rax\n\t"
+                    "cpuid\n\t"
+                    "movl $0x1a0,%%ecx\n\t"
+                    "rdmsr\n\t"
+                    : : : "rax", "ecx");
+
+       /* DR6.BD test */
+       asm volatile("bd_start: mov %%dr0, %%rax" : : : "rax");
+       GUEST_DONE();
+}
+
+#define  CLEAR_DEBUG()  memset(&debug, 0, sizeof(debug))
+#define  APPLY_DEBUG()  vcpu_set_guest_debug(vm, VCPU_ID, &debug)
+#define  CAST_TO_RIP(v)  ((unsigned long long)&(v))
+#define  SET_RIP(v)  do {                              \
+               vcpu_regs_get(vm, VCPU_ID, &regs);      \
+               regs.rip = (v);                         \
+               vcpu_regs_set(vm, VCPU_ID, &regs);      \
+       } while (0)
+#define  MOVE_RIP(v)  SET_RIP(regs.rip + (v));
+
+int main(void)
+{
+       struct kvm_guest_debug debug;
+       unsigned long long target_dr6, target_rip;
+       struct kvm_regs regs;
+       struct kvm_run *run;
+       struct kvm_vm *vm;
+       struct ucall uc;
+       uint64_t cmd;
+       int i;
+       /* Instruction lengths starting at ss_start */
+       int ss_size[4] = {
+               3,              /* xor */
+               2,              /* cpuid */
+               5,              /* mov */
+               2,              /* rdmsr */
+       };
+
+       if (!kvm_check_cap(KVM_CAP_SET_GUEST_DEBUG)) {
+               print_skip("KVM_CAP_SET_GUEST_DEBUG not supported");
+               return 0;
+       }
+
+       vm = vm_create_default(VCPU_ID, 0, guest_code);
+       vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+       run = vcpu_state(vm, VCPU_ID);
+
+       /* Test software BPs - int3 */
+       CLEAR_DEBUG();
+       debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
+       APPLY_DEBUG();
+       vcpu_run(vm, VCPU_ID);
+       TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+                   run->debug.arch.exception == BP_VECTOR &&
+                   run->debug.arch.pc == CAST_TO_RIP(sw_bp),
+                   "INT3: exit %d exception %d rip 0x%llx (should be 0x%llx)",
+                   run->exit_reason, run->debug.arch.exception,
+                   run->debug.arch.pc, CAST_TO_RIP(sw_bp));
+       MOVE_RIP(1);
+
+       /* Test instruction HW BP over DR[0-3] */
+       for (i = 0; i < 4; i++) {
+               CLEAR_DEBUG();
+               debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
+               debug.arch.debugreg[i] = CAST_TO_RIP(hw_bp);
+               debug.arch.debugreg[7] = 0x400 | (1UL << (2*i+1));
+               APPLY_DEBUG();
+               vcpu_run(vm, VCPU_ID);
+               target_dr6 = 0xffff0ff0 | (1UL << i);
+               TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+                           run->debug.arch.exception == DB_VECTOR &&
+                           run->debug.arch.pc == CAST_TO_RIP(hw_bp) &&
+                           run->debug.arch.dr6 == target_dr6,
+                           "INS_HW_BP (DR%d): exit %d exception %d rip 0x%llx "
+                           "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)",
+                           i, run->exit_reason, run->debug.arch.exception,
+                           run->debug.arch.pc, CAST_TO_RIP(hw_bp),
+                           run->debug.arch.dr6, target_dr6);
+       }
+       /* Skip "nop" */
+       MOVE_RIP(1);
+
+       /* Test data access HW BP over DR[0-3] */
+       for (i = 0; i < 4; i++) {
+               CLEAR_DEBUG();
+               debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
+               debug.arch.debugreg[i] = CAST_TO_RIP(guest_value);
+               debug.arch.debugreg[7] = 0x00000400 | (1UL << (2*i+1)) |
+                   (0x000d0000UL << (4*i));
+               APPLY_DEBUG();
+               vcpu_run(vm, VCPU_ID);
+               target_dr6 = 0xffff0ff0 | (1UL << i);
+               TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+                           run->debug.arch.exception == DB_VECTOR &&
+                           run->debug.arch.pc == CAST_TO_RIP(write_data) &&
+                           run->debug.arch.dr6 == target_dr6,
+                           "DATA_HW_BP (DR%d): exit %d exception %d rip 0x%llx "
+                           "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)",
+                           i, run->exit_reason, run->debug.arch.exception,
+                           run->debug.arch.pc, CAST_TO_RIP(write_data),
+                           run->debug.arch.dr6, target_dr6);
+               /* Roll back the 7-byte "mov" */
+               MOVE_RIP(-7);
+       }
+       /* Skip the 7-byte "mov" */
+       MOVE_RIP(7);
+
+       /* Test single step */
+       target_rip = CAST_TO_RIP(ss_start);
+       target_dr6 = 0xffff4ff0ULL;
+       vcpu_regs_get(vm, VCPU_ID, &regs);
+       for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) {
+               target_rip += ss_size[i];
+               CLEAR_DEBUG();
+               debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
+               debug.arch.debugreg[7] = 0x00000400;
+               APPLY_DEBUG();
+               vcpu_run(vm, VCPU_ID);
+               TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+                           run->debug.arch.exception == DB_VECTOR &&
+                           run->debug.arch.pc == target_rip &&
+                           run->debug.arch.dr6 == target_dr6,
+                           "SINGLE_STEP[%d]: exit %d exception %d rip 0x%llx "
+                           "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)",
+                           i, run->exit_reason, run->debug.arch.exception,
+                           run->debug.arch.pc, target_rip, run->debug.arch.dr6,
+                           target_dr6);
+       }
+
+       /* Finally test global disable */
+       CLEAR_DEBUG();
+       debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
+       debug.arch.debugreg[7] = 0x400 | DR7_GD;
+       APPLY_DEBUG();
+       vcpu_run(vm, VCPU_ID);
+       target_dr6 = 0xffff0ff0 | DR6_BD;
+       TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+                   run->debug.arch.exception == DB_VECTOR &&
+                   run->debug.arch.pc == CAST_TO_RIP(bd_start) &&
+                   run->debug.arch.dr6 == target_dr6,
+                           "DR7.GD: exit %d exception %d rip 0x%llx "
+                           "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)",
+                           run->exit_reason, run->debug.arch.exception,
+                           run->debug.arch.pc, CAST_TO_RIP(bd_start),
+                           run->debug.arch.dr6, target_dr6);
+
+       /* Disable all debug controls, run to the end */
+       CLEAR_DEBUG();
+       APPLY_DEBUG();
+
+       vcpu_run(vm, VCPU_ID);
+       TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "KVM_EXIT_IO");
+       cmd = get_ucall(vm, VCPU_ID, &uc);
+       TEST_ASSERT(cmd == UCALL_DONE, "UCALL_DONE");
+
+       kvm_vm_free(vm);
+
+       return 0;
+}
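
The DR7 constants in the breakpoint loops above pack three fields: 0x400 is bit 10, which is architecturally reserved and reads as one; (1UL << (2*i+1)) sets the global-enable bit G0..G3 for debug register i; and (0x000d0000UL << (4*i)) programs that register's R/W and LEN fields, which live at bits 16+4*i, to "break on data writes, 4 bytes wide". A hypothetical helper spelling out the same encoding (illustrative only, not part of the test):

#include <stdint.h>

#define DR7_RSVD_ONE		(1UL << 10)		/* bit 10: reserved, reads as 1 */
#define DR7_GLOBAL_ENABLE(i)	(1UL << (2 * (i) + 1))	/* G0..G3 */
#define DR7_RW_LEN_SHIFT(i)	(16 + 4 * (i))		/* R/W and LEN for DR(i) */
#define DR7_RW_WRITE_LEN4	0xdUL			/* R/W = 01 (write), LEN = 11 (4 bytes) */

/* Instruction breakpoint: R/W = LEN = 0, i.e. "0x400 | (1UL << (2*i+1))". */
static uint64_t dr7_exec_bp(int i)
{
	return DR7_RSVD_ONE | DR7_GLOBAL_ENABLE(i);
}

/* 4-byte write watchpoint, i.e. the extra "(0x000d0000UL << (4*i))" term. */
static uint64_t dr7_write_bp(int i)
{
	return DR7_RSVD_ONE | DR7_GLOBAL_ENABLE(i) |
	       (DR7_RW_WRITE_LEN4 << DR7_RW_LEN_SHIFT(i));
}
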
diff --git a/tools/testing/selftests/kvm/x86_64/set_memory_region_test.c b/tools/testing/selftests/kvm/x86_64/set_memory_region_test.c
deleted file mode 100644 (file)
index c6691cf..0000000
+++ /dev/null
@@ -1,141 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define _GNU_SOURCE /* for program_invocation_short_name */
-#include <fcntl.h>
-#include <pthread.h>
-#include <sched.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-
-#include <linux/compiler.h>
-
-#include <test_util.h>
-#include <kvm_util.h>
-#include <processor.h>
-
-#define VCPU_ID 0
-
-/*
- * Somewhat arbitrary location and slot, intended to not overlap anything.  The
- * location and size are specifically 2mb sized/aligned so that the initial
- * region corresponds to exactly one large page.
- */
-#define MEM_REGION_GPA         0xc0000000
-#define MEM_REGION_SIZE                0x200000
-#define MEM_REGION_SLOT                10
-
-static void guest_code(void)
-{
-       uint64_t val;
-
-       do {
-               val = READ_ONCE(*((uint64_t *)MEM_REGION_GPA));
-       } while (!val);
-
-       if (val != 1)
-               ucall(UCALL_ABORT, 1, val);
-
-       GUEST_DONE();
-}
-
-static void *vcpu_worker(void *data)
-{
-       struct kvm_vm *vm = data;
-       struct kvm_run *run;
-       struct ucall uc;
-       uint64_t cmd;
-
-       /*
-        * Loop until the guest is done.  Re-enter the guest on all MMIO exits,
-        * which will occur if the guest attempts to access a memslot while it
-        * is being moved.
-        */
-       run = vcpu_state(vm, VCPU_ID);
-       do {
-               vcpu_run(vm, VCPU_ID);
-       } while (run->exit_reason == KVM_EXIT_MMIO);
-
-       TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
-                   "Unexpected exit reason = %d", run->exit_reason);
-
-       cmd = get_ucall(vm, VCPU_ID, &uc);
-       TEST_ASSERT(cmd == UCALL_DONE, "Unexpected val in guest = %lu", uc.args[0]);
-       return NULL;
-}
-
-static void test_move_memory_region(void)
-{
-       pthread_t vcpu_thread;
-       struct kvm_vm *vm;
-       uint64_t *hva;
-       uint64_t gpa;
-
-       vm = vm_create_default(VCPU_ID, 0, guest_code);
-
-       vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
-
-       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP,
-                                   MEM_REGION_GPA, MEM_REGION_SLOT,
-                                   MEM_REGION_SIZE / getpagesize(), 0);
-
-       /*
-        * Allocate and map two pages so that the GPA accessed by guest_code()
-        * stays valid across the memslot move.
-        */
-       gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT);
-       TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n");
-
-       virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0);
-
-       /* Ditto for the host mapping so that both pages can be zeroed. */
-       hva = addr_gpa2hva(vm, MEM_REGION_GPA);
-       memset(hva, 0, 2 * 4096);
-
-       pthread_create(&vcpu_thread, NULL, vcpu_worker, vm);
-
-       /* Ensure the guest thread is spun up. */
-       usleep(100000);
-
-       /*
-        * Shift the region's base GPA.  The guest should not see "2" as the
-        * hva->gpa translation is misaligned, i.e. the guest is accessing a
-        * different host pfn.
-        */
-       vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA - 4096);
-       WRITE_ONCE(*hva, 2);
-
-       usleep(100000);
-
-       /*
-        * Note, value in memory needs to be changed *before* restoring the
-        * memslot, else the guest could race the update and see "2".
-        */
-       WRITE_ONCE(*hva, 1);
-
-       /* Restore the original base, the guest should see "1". */
-       vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA);
-
-       pthread_join(vcpu_thread, NULL);
-
-       kvm_vm_free(vm);
-}
-
-int main(int argc, char *argv[])
-{
-       int i, loops;
-
-       /* Tell stdout not to buffer its content */
-       setbuf(stdout, NULL);
-
-       if (argc > 1)
-               loops = atoi(argv[1]);
-       else
-               loops = 10;
-
-       for (i = 0; i < loops; i++)
-               test_move_memory_region();
-
-       return 0;
-}
index 93bd59b..d502441 100644 (file)
@@ -571,6 +571,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 {
        struct arch_timer_cpu *timer = vcpu_timer(vcpu);
        struct timer_map map;
+       struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
 
        if (unlikely(!timer->enabled))
                return;
@@ -593,7 +594,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
        if (map.emul_ptimer)
                soft_timer_cancel(&map.emul_ptimer->hrtimer);
 
-       if (swait_active(kvm_arch_vcpu_wq(vcpu)))
+       if (rcuwait_active(wait))
                kvm_timer_blocking(vcpu);
 
        /*
index 48d0ec4..d5db0d6 100644 (file)
@@ -579,16 +579,17 @@ void kvm_arm_resume_guest(struct kvm *kvm)
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
                vcpu->arch.pause = false;
-               swake_up_one(kvm_arch_vcpu_wq(vcpu));
+               rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu));
        }
 }
 
 static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
 {
-       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
+       struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
 
-       swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
-                                      (!vcpu->arch.pause)));
+       rcuwait_wait_event(wait,
+                          (!vcpu->arch.power_off) && (!vcpu->arch.pause),
+                          TASK_INTERRUPTIBLE);
 
        if (vcpu->arch.power_off || vcpu->arch.pause) {
                /* Awaken to handle a signal, request we sleep again later. */
@@ -639,7 +640,6 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
 /**
  * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
  * @vcpu:      The VCPU pointer
- * @run:       The kvm_run structure pointer used for userspace state exchange
  *
  * This function is called through the VCPU_RUN ioctl called from user space. It
  * will execute VM code in a loop until the time slice for the process is used
@@ -647,8 +647,9 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
  * return with return value 0 and with the kvm_run structure filled in with the
  * required data for the requested emulation.
  */
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
+       struct kvm_run *run = vcpu->run;
        int ret;
 
        if (unlikely(!kvm_vcpu_initialized(vcpu)))
@@ -659,7 +660,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                return ret;
 
        if (run->exit_reason == KVM_EXIT_MMIO) {
-               ret = kvm_handle_mmio_return(vcpu, vcpu->run);
+               ret = kvm_handle_mmio_return(vcpu, run);
                if (ret)
                        return ret;
        }
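
The arch_timer.c and arm.c hunks above are two instances of the same swait-to-rcuwait conversion that runs through this series: the per-vCPU swait queue becomes a struct rcuwait, the blocking side uses rcuwait_wait_event() with an explicit task state, and wakers call rcuwait_wake_up(), which also reports whether a sleeper was woken (see the kvm_vcpu_wake_up() hunk further down). A condensed sketch of the pattern, using only helpers that appear in these hunks; kernel-style code, not a standalone buildable unit:

#include <linux/rcuwait.h>
#include <linux/sched.h>
#include <linux/types.h>

struct sketch_vcpu {
	struct rcuwait wait;		/* replaces struct swait_queue_head wq */
	bool runnable;
};

static void sketch_vcpu_init(struct sketch_vcpu *v)
{
	rcuwait_init(&v->wait);		/* was init_swait_queue_head(&v->wq) */
}

static void sketch_vcpu_block(struct sketch_vcpu *v)
{
	/* was swait_event_interruptible_exclusive(v->wq, READ_ONCE(v->runnable)) */
	rcuwait_wait_event(&v->wait, READ_ONCE(v->runnable), TASK_INTERRUPTIBLE);
}

static bool sketch_vcpu_kick(struct sketch_vcpu *v)
{
	WRITE_ONCE(v->runnable, true);
	/* was: if (swq_has_sleeper(&v->wq)) swake_up_one(&v->wq); */
	return rcuwait_wake_up(&v->wait);	/* true iff a sleeping task was woken */
}
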
index 15e5b03..10b533f 100644 (file)
@@ -80,8 +80,7 @@ static void async_pf_execute(struct work_struct *work)
 
        trace_kvm_async_pf_completed(addr, cr2_or_gpa);
 
-       if (swq_has_sleeper(&vcpu->wq))
-               swake_up_one(&vcpu->wq);
+       rcuwait_wake_up(&vcpu->wait);
 
        mmput(mm);
        kvm_put_kvm(vcpu->kvm);
index 67b6fc1..0c4ede4 100644 (file)
@@ -721,7 +721,7 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
                return false;
        }
 
-       return _val == p->datamatch ? true : false;
+       return _val == p->datamatch;
 }
 
 /* MMIO/PIO writes trigger an event if the addr/val match */
index 74bdb7b..da6da38 100644 (file)
@@ -259,6 +259,7 @@ static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
 }
 
 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
+                                struct kvm_vcpu *except,
                                 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
 {
        int i, cpu, me;
@@ -268,7 +269,8 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
        me = get_cpu();
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
+               if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
+                   vcpu == except)
                        continue;
 
                kvm_make_request(req, vcpu);
@@ -288,19 +290,25 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
        return called;
 }
 
-bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
+bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
+                                     struct kvm_vcpu *except)
 {
        cpumask_var_t cpus;
        bool called;
 
        zalloc_cpumask_var(&cpus, GFP_ATOMIC);
 
-       called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus);
+       called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);
 
        free_cpumask_var(cpus);
        return called;
 }
 
+bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
+{
+       return kvm_make_all_cpus_request_except(kvm, req, NULL);
+}
+
 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
@@ -341,7 +349,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
-       init_swait_queue_head(&vcpu->wq);
+       rcuwait_init(&vcpu->wait);
        kvm_async_pf_vcpu_init(vcpu);
 
        vcpu->pre_pcpu = -1;
@@ -710,6 +718,8 @@ static struct kvm *kvm_create_vm(unsigned long type)
                        goto out_err_no_arch_destroy_vm;
        }
 
+       kvm->max_halt_poll_ns = halt_poll_ns;
+
        r = kvm_arch_init_vm(kvm, type);
        if (r)
                goto out_err_no_arch_destroy_vm;
@@ -1602,16 +1612,13 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
 {
        return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
 
 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
        struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
 
-       if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
-             memslot->flags & KVM_MEMSLOT_INVALID)
-               return false;
-
-       return true;
+       return kvm_is_visible_memslot(memslot);
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
@@ -1824,8 +1831,6 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
                r = fixup_user_fault(current, current->mm, addr,
                                     (write_fault ? FAULT_FLAG_WRITE : 0),
                                     &unlocked);
-               if (unlocked)
-                       return -EAGAIN;
                if (r)
                        return r;
 
@@ -1896,15 +1901,12 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
                goto exit;
        }
 
-retry:
        vma = find_vma_intersection(current->mm, addr, addr + 1);
 
        if (vma == NULL)
                pfn = KVM_PFN_ERR_FAULT;
        else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
                r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
-               if (r == -EAGAIN)
-                       goto retry;
                if (r < 0)
                        pfn = KVM_PFN_ERR_FAULT;
        } else {
@@ -2665,19 +2667,27 @@ out:
        return ret;
 }
 
+static inline void
+update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
+{
+       if (waited)
+               vcpu->stat.halt_poll_fail_ns += poll_ns;
+       else
+               vcpu->stat.halt_poll_success_ns += poll_ns;
+}
+
 /*
  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
  */
 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 {
-       ktime_t start, cur;
-       DECLARE_SWAITQUEUE(wait);
+       ktime_t start, cur, poll_end;
        bool waited = false;
        u64 block_ns;
 
        kvm_arch_vcpu_blocking(vcpu);
 
-       start = cur = ktime_get();
+       start = cur = poll_end = ktime_get();
        if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
                ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
 
@@ -2693,12 +2703,13 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
                                        ++vcpu->stat.halt_poll_invalid;
                                goto out;
                        }
-                       cur = ktime_get();
+                       poll_end = cur = ktime_get();
                } while (single_task_running() && ktime_before(cur, stop));
        }
 
+       prepare_to_rcuwait(&vcpu->wait);
        for (;;) {
-               prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+               set_current_state(TASK_INTERRUPTIBLE);
 
                if (kvm_vcpu_check_block(vcpu) < 0)
                        break;
@@ -2706,25 +2717,28 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
                waited = true;
                schedule();
        }
-
-       finish_swait(&vcpu->wq, &wait);
+       finish_rcuwait(&vcpu->wait);
        cur = ktime_get();
 out:
        kvm_arch_vcpu_unblocking(vcpu);
        block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
 
+       update_halt_poll_stats(
+               vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
+
        if (!kvm_arch_no_poll(vcpu)) {
                if (!vcpu_valid_wakeup(vcpu)) {
                        shrink_halt_poll_ns(vcpu);
-               } else if (halt_poll_ns) {
+               } else if (vcpu->kvm->max_halt_poll_ns) {
                        if (block_ns <= vcpu->halt_poll_ns)
                                ;
                        /* we had a long block, shrink polling */
-                       else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+                       else if (vcpu->halt_poll_ns &&
+                                       block_ns > vcpu->kvm->max_halt_poll_ns)
                                shrink_halt_poll_ns(vcpu);
                        /* we had a short halt and our poll time is too small */
-                       else if (vcpu->halt_poll_ns < halt_poll_ns &&
-                               block_ns < halt_poll_ns)
+                       else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
+                                       block_ns < vcpu->kvm->max_halt_poll_ns)
                                grow_halt_poll_ns(vcpu);
                } else {
                        vcpu->halt_poll_ns = 0;
@@ -2738,11 +2752,10 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_block);
 
 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
 {
-       struct swait_queue_head *wqp;
+       struct rcuwait *waitp;
 
-       wqp = kvm_arch_vcpu_wq(vcpu);
-       if (swq_has_sleeper(wqp)) {
-               swake_up_one(wqp);
+       waitp = kvm_arch_vcpu_get_wait(vcpu);
+       if (rcuwait_wake_up(waitp)) {
                WRITE_ONCE(vcpu->ready, true);
                ++vcpu->stat.halt_wakeup;
                return true;
@@ -2884,7 +2897,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
                                continue;
                        if (vcpu == me)
                                continue;
-                       if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
+                       if (rcuwait_active(&vcpu->wait) &&
+                           !vcpu_dy_runnable(vcpu))
                                continue;
                        if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
                                !kvm_arch_vcpu_in_kernel(vcpu))
@@ -3031,8 +3045,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        if (r)
                goto vcpu_free_run_page;
 
-       kvm_create_vcpu_debugfs(vcpu);
-
        mutex_lock(&kvm->lock);
        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
@@ -3061,11 +3073,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 
        mutex_unlock(&kvm->lock);
        kvm_arch_vcpu_postcreate(vcpu);
+       kvm_create_vcpu_debugfs(vcpu);
        return r;
 
 unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
-       debugfs_remove_recursive(vcpu->debugfs_dentry);
        kvm_arch_vcpu_destroy(vcpu);
 vcpu_free_run_page:
        free_page((unsigned long)vcpu->run);
@@ -3135,7 +3147,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
                                synchronize_rcu();
                        put_pid(oldpid);
                }
-               r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+               r = kvm_arch_vcpu_ioctl_run(vcpu);
                trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
                break;
        }
@@ -3160,7 +3172,6 @@ out_free1:
        case KVM_SET_REGS: {
                struct kvm_regs *kvm_regs;
 
-               r = -ENOMEM;
                kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
                if (IS_ERR(kvm_regs)) {
                        r = PTR_ERR(kvm_regs);
@@ -3516,6 +3527,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
        case KVM_CAP_IOEVENTFD_ANY_LENGTH:
        case KVM_CAP_CHECK_EXTENSION_VM:
        case KVM_CAP_ENABLE_CAP_VM:
+       case KVM_CAP_HALT_POLL:
                return 1;
 #ifdef CONFIG_KVM_MMIO
        case KVM_CAP_COALESCED_MMIO:
@@ -3566,6 +3578,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
                return 0;
        }
 #endif
+       case KVM_CAP_HALT_POLL: {
+               if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
+                       return -EINVAL;
+
+               kvm->max_halt_poll_ns = cap->args[0];
+               return 0;
+       }
        default:
                return kvm_vm_ioctl_enable_cap(kvm, cap);
        }
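
The KVM_CAP_HALT_POLL handler above only validates the arguments and stores args[0] into kvm->max_halt_poll_ns, which kvm_vcpu_block() now consults instead of the global halt_poll_ns module parameter. From userspace the per-VM override is therefore a single KVM_ENABLE_CAP ioctl on the VM fd; a minimal, illustrative caller (vm_fd is assumed to be an existing VM file descriptor):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Override the halt_poll_ns module parameter for one VM. */
static int vm_set_max_halt_poll_ns(int vm_fd, unsigned int max_ns)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_HALT_POLL;
	cap.args[0] = max_ns;	/* 0 disables halt polling for this VM */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
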
@@ -4639,6 +4658,7 @@ struct kvm_vcpu *kvm_get_running_vcpu(void)
 
        return vcpu;
 }
+EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
 
 /**
  * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.