Merge tag 'kvm-s390-next-20150602' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Paolo Bonzini <pbonzini@redhat.com>
Wed, 3 Jun 2015 12:51:02 +0000 (14:51 +0200)
committer Paolo Bonzini <pbonzini@redhat.com>
Wed, 3 Jun 2015 12:51:02 +0000 (14:51 +0200)
KVM: s390: Fix and cleanup for 4.2 (kvm/next)

One small fix for a commit targeted for 4.2 and one cleanup
regarding our printks.

41 files changed:
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/mmu.txt
arch/arm/kvm/arm.c
arch/arm/kvm/mmu.c
arch/mips/include/asm/kvm_host.h
arch/mips/kvm/mips.c
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/powerpc.c
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/kvm-s390.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/pvclock-abi.h
arch/x86/include/asm/pvclock.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/kernel/kvm.c
arch/x86/kernel/kvmclock.c
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/emulate.c
arch/x86/kvm/ioapic.c
arch/x86/kvm/irq_comm.c
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu_audit.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
include/linux/kvm_host.h
include/linux/kvm_types.h
include/uapi/linux/kvm.h
virt/kvm/kvm_main.c

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 9fa2bf8..6955444 100644
@@ -959,7 +959,8 @@ documentation when it pops into existence).
 4.37 KVM_ENABLE_CAP
 
 Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM
-Architectures: ppc, s390
+Architectures: x86 (only KVM_CAP_ENABLE_CAP_VM),
+              mips (only KVM_CAP_ENABLE_CAP), ppc, s390
 Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM)
 Parameters: struct kvm_enable_cap (in)
 Returns: 0 on success; -1 on error
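
For orientation, a minimal userspace sketch of the ioctl documented
above.  This is illustrative only: the capability number and the
argument are placeholders, and error handling is elided.

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Enable a capability on a vcpu fd -- or on a vm fd, for
     * capabilities covered by KVM_CAP_ENABLE_CAP_VM. */
    static int enable_cap(int fd, __u32 cap_nr, __u64 arg0)
    {
            struct kvm_enable_cap cap;

            memset(&cap, 0, sizeof(cap));
            cap.cap = cap_nr;      /* a KVM_CAP_* constant */
            cap.args[0] = arg0;    /* capability-specific argument */
            return ioctl(fd, KVM_ENABLE_CAP, &cap);   /* 0 on success */
    }
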
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index 53838d9..c59bd9b 100644
@@ -169,6 +169,10 @@ Shadow pages contain the following information:
     Contains the value of cr4.smep && !cr0.wp for which the page is valid
     (pages for which this is true are different from other pages; see the
     treatment of cr0.wp=0 below).
+  role.smap_andnot_wp:
+    Contains the value of cr4.smap && !cr0.wp for which the page is valid
+    (pages for which this is true are different from other pages; see the
+    treatment of cr0.wp=0 below).
   gfn:
     Either the guest page table containing the translations shadowed by this
     page, or the base page frame for linear translations.  See role.direct.
@@ -344,10 +348,16 @@ on fault type:
 
 (user write faults generate a #PF)
 
-In the first case there is an additional complication if CR4.SMEP is
-enabled: since we've turned the page into a kernel page, the kernel may now
-execute it.  We handle this by also setting spte.nx.  If we get a user
-fetch or read fault, we'll change spte.u=1 and spte.nx=gpte.nx back.
+In the first case there are two additional complications:
+- if CR4.SMEP is enabled: since we've turned the page into a kernel page,
+  the kernel may now execute it.  We handle this by also setting spte.nx.
+  If we get a user fetch or read fault, we'll change spte.u=1 and
+  spte.nx=gpte.nx back.
+- if CR4.SMAP is disabled: since the page has been turned into a kernel
+  page, it cannot be reused once CR4.SMAP is enabled.  We record
+  CR4.SMAP && !CR0.WP in the shadow page's role to avoid this case.
+  Note that we need not handle the CR4.SMAP-enabled case, since KVM
+  directly injects a #PF into the guest on a failed permission check.
 
 To prevent an spte that was converted into a kernel page with cr0.wp=0
 from being written by the kernel after cr0.wp has changed to 1, we make
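
For reference, a sketch of how the two role bits documented above
(role.smep_andnot_wp and role.smap_andnot_wp) are computed when the
shadow MMU context is initialized -- modeled on arch/x86/kvm/mmu.c in
this merge, with the exact helper names to be treated as assumptions:

    /* Sampled when the MMU context is (re)built, so a shadow page
     * created under one combination of CR4.SMEP/CR4.SMAP and CR0.WP
     * is never reused under an incompatible one. */
    bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
    bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);

    context->base_role.smep_andnot_wp = smep && !is_write_protection(vcpu);
    context->base_role.smap_andnot_wp = smap && !is_write_protection(vcpu);
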
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index d9631ec..e41cb11 100644
@@ -553,13 +553,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                 * Enter the guest
                 */
                trace_kvm_entry(*vcpu_pc(vcpu));
-               kvm_guest_enter();
+               __kvm_guest_enter();
                vcpu->mode = IN_GUEST_MODE;
 
                ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
 
                vcpu->mode = OUTSIDE_GUEST_MODE;
-               kvm_guest_exit();
+               __kvm_guest_exit();
                trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
                /*
                 * We may have taken a host interrupt in HYP mode (ie
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1d5accb..7f473e6 100644
@@ -1155,7 +1155,8 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
  */
 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 {
-       struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
+       struct kvm_memslots *slots = kvm_memslots(kvm);
+       struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
        phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
        phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
 
@@ -1718,8 +1719,9 @@ out:
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-                                  struct kvm_userspace_memory_region *mem,
+                                  const struct kvm_userspace_memory_region *mem,
                                   const struct kvm_memory_slot *old,
+                                  const struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
 {
        /*
@@ -1733,7 +1735,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot,
-                                  struct kvm_userspace_memory_region *mem,
+                                  const struct kvm_userspace_memory_region *mem,
                                   enum kvm_mr_change change)
 {
        hva_t hva = mem->userspace_addr;
@@ -1838,7 +1840,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
        return 0;
 }
 
-void kvm_arch_memslots_updated(struct kvm *kvm)
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
 {
 }
 
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 4c25823..e8c8d9d 100644
@@ -839,7 +839,7 @@ static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
                struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
                struct kvm_memory_slot *slot) {}
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index bb68e8d..cd4c129 100644
@@ -198,15 +198,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot,
-                                  struct kvm_userspace_memory_region *mem,
+                                  const struct kvm_userspace_memory_region *mem,
                                   enum kvm_mr_change change)
 {
        return 0;
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-                                  struct kvm_userspace_memory_region *mem,
+                                  const struct kvm_userspace_memory_region *mem,
                                   const struct kvm_memory_slot *old,
+                                  const struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
 {
        unsigned long npages = 0;
@@ -393,7 +394,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
        kvm_mips_deliver_interrupts(vcpu,
                                    kvm_read_c0_guest_cause(vcpu->arch.cop0));
 
-       kvm_guest_enter();
+       __kvm_guest_enter();
 
        /* Disable hardware page table walking while in guest */
        htw_stop();
@@ -403,7 +404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
        /* Re-enable HTW before enabling interrupts */
        htw_start();
 
-       kvm_guest_exit();
+       __kvm_guest_exit();
        local_irq_enable();
 
        if (vcpu->sigset_active)
@@ -968,6 +969,7 @@ out:
 /* Get (and clear) the dirty memory log for a memory slot. */
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
+       struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        unsigned long ga, ga_end;
        int is_dirty = 0;
@@ -982,7 +984,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 
        /* If nothing is dirty, don't bother messing with page tables. */
        if (is_dirty) {
-               memslot = &kvm->memslots->memslots[log->slot];
+               slots = kvm_memslots(kvm);
+               memslot = id_to_memslot(slots, log->slot);
 
                ga = memslot->base_gfn << PAGE_SHIFT;
                ga_end = ga + (memslot->npages << PAGE_SHIFT);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index a193a13..d91f65b 100644
@@ -698,7 +698,7 @@ struct kvm_vcpu_arch {
 static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_exit(void) {}
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index b8475da..c6ef05b 100644
@@ -182,10 +182,11 @@ extern int kvmppc_core_create_memslot(struct kvm *kvm,
                                      unsigned long npages);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
-                               struct kvm_userspace_memory_region *mem);
+                               const struct kvm_userspace_memory_region *mem);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem,
-                               const struct kvm_memory_slot *old);
+                               const struct kvm_userspace_memory_region *mem,
+                               const struct kvm_memory_slot *old,
+                               const struct kvm_memory_slot *new);
 extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
                                      struct kvm_ppc_smmu_info *info);
 extern void kvmppc_core_flush_memslot(struct kvm *kvm,
@@ -243,10 +244,11 @@ struct kvmppc_ops {
        void (*flush_memslot)(struct kvm *kvm, struct kvm_memory_slot *memslot);
        int (*prepare_memory_region)(struct kvm *kvm,
                                     struct kvm_memory_slot *memslot,
-                                    struct kvm_userspace_memory_region *mem);
+                                    const struct kvm_userspace_memory_region *mem);
        void (*commit_memory_region)(struct kvm *kvm,
-                                    struct kvm_userspace_memory_region *mem,
-                                    const struct kvm_memory_slot *old);
+                                    const struct kvm_userspace_memory_region *mem,
+                                    const struct kvm_memory_slot *old,
+                                    const struct kvm_memory_slot *new);
        int (*unmap_hva)(struct kvm *kvm, unsigned long hva);
        int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
                           unsigned long end);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 453a8a4..05ea8fc 100644
@@ -757,16 +757,17 @@ void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
-                               struct kvm_userspace_memory_region *mem)
+                               const struct kvm_userspace_memory_region *mem)
 {
        return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem);
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem,
-                               const struct kvm_memory_slot *old)
+                               const struct kvm_userspace_memory_region *mem,
+                               const struct kvm_memory_slot *old,
+                               const struct kvm_memory_slot *new)
 {
-       kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old);
+       kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new);
 }
 
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 1a4acf8..dab68b7 100644
@@ -650,7 +650,7 @@ static void kvmppc_rmap_reset(struct kvm *kvm)
        int srcu_idx;
 
        srcu_idx = srcu_read_lock(&kvm->srcu);
-       slots = kvm->memslots;
+       slots = kvm_memslots(kvm);
        kvm_for_each_memslot(memslot, slots) {
                /*
                 * This assumes it is acceptable to lose reference and
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 48d3c5d..68d067a 100644
@@ -1952,7 +1952,7 @@ static void post_guest_process(struct kvmppc_vcore *vc)
  */
 static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-       struct kvm_vcpu *vcpu;
+       struct kvm_vcpu *vcpu, *vnext;
        int i;
        int srcu_idx;
 
@@ -1982,7 +1982,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
         */
        if ((threads_per_core > 1) &&
            ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
-               list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+               list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+                                        arch.run_list) {
                        vcpu->arch.ret = -EBUSY;
                        kvmppc_remove_runnable(vc, vcpu);
                        wake_up(&vcpu->arch.cpu_run);
@@ -2320,6 +2321,7 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
 static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
                                         struct kvm_dirty_log *log)
 {
+       struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        int r;
        unsigned long n;
@@ -2330,7 +2332,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
        if (log->slot >= KVM_USER_MEM_SLOTS)
                goto out;
 
-       memslot = id_to_memslot(kvm->memslots, log->slot);
+       slots = kvm_memslots(kvm);
+       memslot = id_to_memslot(slots, log->slot);
        r = -ENOENT;
        if (!memslot->dirty_bitmap)
                goto out;
@@ -2373,16 +2376,18 @@ static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
 
 static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
                                        struct kvm_memory_slot *memslot,
-                                       struct kvm_userspace_memory_region *mem)
+                                       const struct kvm_userspace_memory_region *mem)
 {
        return 0;
 }
 
 static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem,
-                               const struct kvm_memory_slot *old)
+                               const struct kvm_userspace_memory_region *mem,
+                               const struct kvm_memory_slot *old,
+                               const struct kvm_memory_slot *new)
 {
        unsigned long npages = mem->memory_size >> PAGE_SHIFT;
+       struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
 
        if (npages && old->npages) {
@@ -2392,7 +2397,8 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
                 * since the rmap array starts out as all zeroes,
                 * i.e. no pages are dirty.
                 */
-               memslot = id_to_memslot(kvm->memslots, mem->slot);
+               slots = kvm_memslots(kvm);
+               memslot = id_to_memslot(slots, mem->slot);
                kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
        }
 }
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index f573839..64891b0 100644
@@ -1530,6 +1530,7 @@ out:
 static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm,
                                         struct kvm_dirty_log *log)
 {
+       struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        struct kvm_vcpu *vcpu;
        ulong ga, ga_end;
@@ -1545,7 +1546,8 @@ static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm,
 
        /* If nothing is dirty, don't bother messing with page tables. */
        if (is_dirty) {
-               memslot = id_to_memslot(kvm->memslots, log->slot);
+               slots = kvm_memslots(kvm);
+               memslot = id_to_memslot(slots, log->slot);
 
                ga = memslot->base_gfn << PAGE_SHIFT;
                ga_end = ga + (memslot->npages << PAGE_SHIFT);
@@ -1571,14 +1573,15 @@ static void kvmppc_core_flush_memslot_pr(struct kvm *kvm,
 
 static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm,
                                        struct kvm_memory_slot *memslot,
-                                       struct kvm_userspace_memory_region *mem)
+                                       const struct kvm_userspace_memory_region *mem)
 {
        return 0;
 }
 
 static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem,
-                               const struct kvm_memory_slot *old)
+                               const struct kvm_userspace_memory_region *mem,
+                               const struct kvm_memory_slot *old,
+                               const struct kvm_memory_slot *new)
 {
        return;
 }
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 6c1316a..cc58426 100644
@@ -1004,10 +1004,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                break;
        }
 
-       local_irq_enable();
-
        trace_kvm_exit(exit_nr, vcpu);
-       kvm_guest_exit();
+       __kvm_guest_exit();
+
+       local_irq_enable();
 
        run->exit_reason = KVM_EXIT_UNKNOWN;
        run->ready_for_interrupt_injection = 1;
@@ -1784,14 +1784,15 @@ int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
                                      struct kvm_memory_slot *memslot,
-                                     struct kvm_userspace_memory_region *mem)
+                                     const struct kvm_userspace_memory_region *mem)
 {
        return 0;
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem,
-                               const struct kvm_memory_slot *old)
+                               const struct kvm_userspace_memory_region *mem,
+                               const struct kvm_memory_slot *old,
+                               const struct kvm_memory_slot *new)
 {
 }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index ac3ddf1..e5dde32 100644
@@ -115,7 +115,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
                        continue;
                }
 
-               kvm_guest_enter();
+               __kvm_guest_enter();
                return 1;
        }
 
@@ -595,18 +595,19 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot,
-                                  struct kvm_userspace_memory_region *mem,
+                                  const struct kvm_userspace_memory_region *mem,
                                   enum kvm_mr_change change)
 {
        return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-                                  struct kvm_userspace_memory_region *mem,
+                                  const struct kvm_userspace_memory_region *mem,
                                   const struct kvm_memory_slot *old,
+                                  const struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
 {
-       kvmppc_core_commit_memory_region(kvm, mem, old);
+       kvmppc_core_commit_memory_region(kvm, mem, old, new);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 444c412..3024acb 100644
@@ -636,7 +636,7 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
                struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
                struct kvm_memory_slot *slot) {}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 9cb6cfa..71530a4 100644
@@ -240,6 +240,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 {
        int r;
        unsigned long n;
+       struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        int is_dirty = 0;
 
@@ -249,7 +250,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        if (log->slot >= KVM_USER_MEM_SLOTS)
                goto out;
 
-       memslot = id_to_memslot(kvm->memslots, log->slot);
+       slots = kvm_memslots(kvm);
+       memslot = id_to_memslot(slots, log->slot);
        r = -ENOENT;
        if (!memslot->dirty_bitmap)
                goto out;
@@ -2015,12 +2017,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                 * As PF_VCPU will be used in fault handler, between
                 * guest_enter and guest_exit should be no uaccess.
                 */
-               preempt_disable();
-               kvm_guest_enter();
-               preempt_enable();
+               local_irq_disable();
+               __kvm_guest_enter();
+               local_irq_enable();
                exit_reason = sie64a(vcpu->arch.sie_block,
                                     vcpu->run->s.regs.gprs);
-               kvm_guest_exit();
+               local_irq_disable();
+               __kvm_guest_exit();
+               local_irq_enable();
                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 
                rc = vcpu_post_run(vcpu, exit_reason);
@@ -2583,7 +2587,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 /* Section: memory related */
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot,
-                                  struct kvm_userspace_memory_region *mem,
+                                  const struct kvm_userspace_memory_region *mem,
                                   enum kvm_mr_change change)
 {
        /* A few sanity checks. We can have memory slots which have to be
@@ -2601,8 +2605,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem,
+                               const struct kvm_userspace_memory_region *mem,
                                const struct kvm_memory_slot *old,
+                               const struct kvm_memory_slot *new,
                                enum kvm_mr_change change)
 {
        int rc;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dea2e7e..7276107 100644
@@ -207,6 +207,7 @@ union kvm_mmu_page_role {
                unsigned nxe:1;
                unsigned cr0_wp:1;
                unsigned smep_andnot_wp:1;
+               unsigned smap_andnot_wp:1;
        };
 };
 
@@ -400,6 +401,7 @@ struct kvm_vcpu_arch {
        struct kvm_mmu_memory_cache mmu_page_header_cache;
 
        struct fpu guest_fpu;
+       bool eager_fpu;
        u64 xcr0;
        u64 guest_supported_xcr0;
        u32 guest_xstate_size;
@@ -635,6 +637,8 @@ struct kvm_arch {
        #endif
 
        bool boot_vcpu_runs_old_kvmclock;
+
+       u64 disabled_quirks;
 };
 
 struct kvm_vm_stat {
@@ -687,12 +691,13 @@ struct msr_data {
 
 struct kvm_lapic_irq {
        u32 vector;
-       u32 delivery_mode;
-       u32 dest_mode;
-       u32 level;
-       u32 trig_mode;
+       u16 delivery_mode;
+       u16 dest_mode;
+       bool level;
+       u16 trig_mode;
        u32 shorthand;
        u32 dest_id;
+       bool msi_redir_hint;
 };
 
 struct kvm_x86_ops {
@@ -709,7 +714,7 @@ struct kvm_x86_ops {
        /* Create, but do not attach this VCPU */
        struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
        void (*vcpu_free)(struct kvm_vcpu *vcpu);
-       void (*vcpu_reset)(struct kvm_vcpu *vcpu);
+       void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
 
        void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
        void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
@@ -743,6 +748,7 @@ struct kvm_x86_ops {
        void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
        unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
        void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+       void (*fpu_activate)(struct kvm_vcpu *vcpu);
        void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
 
        void (*tlb_flush)(struct kvm_vcpu *vcpu);
@@ -868,7 +874,7 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
                                      struct kvm_memory_slot *memslot);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot);
+                                  const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
@@ -999,7 +1005,7 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
-int fx_init(struct kvm_vcpu *vcpu);
+int fx_init(struct kvm_vcpu *vcpu, bool init_event);
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                       const u8 *new, int bytes);
@@ -1143,7 +1149,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
 void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
                                           unsigned long address);
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6167fd7..655e07a 100644
@@ -41,5 +41,6 @@ struct pvclock_wall_clock {
 
 #define PVCLOCK_TSC_STABLE_BIT (1 << 0)
 #define PVCLOCK_GUEST_STOPPED  (1 << 1)
+#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2)
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d6b078e..628954c 100644
@@ -86,7 +86,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
        offset = pvclock_get_nsec_offset(src);
        ret = src->system_time + offset;
        ret_flags = src->flags;
-       rdtsc_barrier();
 
        *cycles = ret;
        *flags = ret_flags;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d7dcef5..2fec75e 100644
@@ -345,4 +345,7 @@ struct kvm_xcrs {
 struct kvm_sync_regs {
 };
 
+#define KVM_QUIRK_LINT0_REENABLED      (1 << 0)
+#define KVM_QUIRK_CD_NW_CLEARED                (1 << 1)
+
 #endif /* _ASM_X86_KVM_H */
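
These KVM_QUIRK_* bits pair with the disabled_quirks field added to
struct kvm_arch earlier in this merge.  A sketch of how userspace
might turn a quirk off, assuming the KVM_CAP_DISABLE_QUIRKS capability
from the same series (vm_fd is a hypothetical KVM VM file descriptor,
and this is a fragment, not a complete program):

    struct kvm_enable_cap cap = {
            .cap = KVM_CAP_DISABLE_QUIRKS,
            /* args[0]: bitmask of quirks to disable */
            .args = { KVM_QUIRK_LINT0_REENABLED },
    };

    if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
            perror("KVM_ENABLE_CAP(KVM_CAP_DISABLE_QUIRKS)");
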
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 9435620..cc34cec 100644
@@ -331,7 +331,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
        apic_write(APIC_EOI, APIC_EOI_ACK);
 }
 
-void kvm_guest_cpu_init(void)
+static void kvm_guest_cpu_init(void)
 {
        if (!kvm_para_available())
                return;
@@ -655,7 +655,7 @@ static inline void spin_time_accum_blocked(u64 start)
 static struct dentry *d_spin_debug;
 static struct dentry *d_kvm_debug;
 
-struct dentry *kvm_init_debugfs(void)
+static struct dentry *kvm_init_debugfs(void)
 {
        d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
        if (!d_kvm_debug)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 42caaef..49487b4 100644
@@ -24,6 +24,7 @@
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <linux/memblock.h>
+#include <linux/sched.h>
 
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
@@ -217,8 +218,10 @@ static void kvm_shutdown(void)
 
 void __init kvmclock_init(void)
 {
+       struct pvclock_vcpu_time_info *vcpu_time;
        unsigned long mem;
-       int size;
+       int size, cpu;
+       u8 flags;
 
        size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
 
@@ -264,7 +267,14 @@ void __init kvmclock_init(void)
        pv_info.name = "KVM";
 
        if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
-               pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+               pvclock_set_flags(~0);
+
+       cpu = get_cpu();
+       vcpu_time = &hv_clock[cpu].pvti;
+       flags = pvclock_read_flags(vcpu_time);
+       if (flags & PVCLOCK_COUNTS_FROM_ZERO)
+               set_sched_clock_stable();
+       put_cpu();
 }
 
 int __init kvm_setup_vsyscall_timeinfo(void)
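
A note on pvclock_set_flags(~0) above: pvclock_set_flags() sets the
mask of flags the guest is willing to honor, so passing ~0 accepts
every flag the host advertises, including the new
PVCLOCK_COUNTS_FROM_ZERO bit checked just below it.  A hypothetical
helper built on the same pvclock API, for illustration only:

    /* Does this cpu's kvmclock count from zero?  If so, sched_clock
     * can be marked stable, as kvmclock_init() now does. */
    static bool kvmclock_counts_from_zero(int cpu)
    {
            struct pvclock_vcpu_time_info *pvti = &hv_clock[cpu].pvti;

            return pvclock_read_flags(pvti) & PVCLOCK_COUNTS_FROM_ZERO;
    }
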
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 59b69f6..9dadf8d 100644
@@ -16,6 +16,8 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <asm/i387.h> /* For use_eager_fpu.  Ugh! */
+#include <asm/fpu-internal.h> /* For use_eager_fpu.  Ugh! */
 #include <asm/user.h>
 #include <asm/xsave.h>
 #include "cpuid.h"
@@ -95,6 +97,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
        if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
                best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
+       vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu);
+
        /*
         * The existing code assumes virtual address is 48-bit in the canonical
         * address checks; exit if it is ever changed.
@@ -411,6 +415,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                }
                break;
        }
+       case 6: /* Thermal management */
+               entry->eax = 0x4; /* allow ARAT */
+               entry->ebx = 0;
+               entry->ecx = 0;
+               entry->edx = 0;
+               break;
        case 7: {
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                /* Mask ebx against host capability word 9 */
@@ -587,7 +597,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                break;
        case 3: /* Processor serial number */
        case 5: /* MONITOR/MWAIT */
-       case 6: /* Thermal management */
        case 0xC0000002:
        case 0xC0000003:
        case 0xC0000004:
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index c3b1ad9..496b369 100644
@@ -117,4 +117,12 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
        best = kvm_find_cpuid_entry(vcpu, 7, 0);
        return best && (best->ebx & bit(X86_FEATURE_RTM));
 }
+
+static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 7, 0);
+       return best && (best->ebx & bit(X86_FEATURE_MPX));
+}
 #endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 630bcb0..9b655d1 100644
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <asm/kvm_emulate.h>
 #include <linux/stringify.h>
+#include <asm/debugreg.h>
 
 #include "x86.h"
 #include "tss.h"
@@ -523,13 +524,9 @@ static void masked_increment(ulong *reg, ulong mask, int inc)
 static inline void
 register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
 {
-       ulong mask;
+       ulong *preg = reg_rmw(ctxt, reg);
 
-       if (ctxt->ad_bytes == sizeof(unsigned long))
-               mask = ~0UL;
-       else
-               mask = ad_mask(ctxt);
-       masked_increment(reg_rmw(ctxt, reg), mask, inc);
+       assign_register(preg, *preg + inc, ctxt->ad_bytes);
 }
 
 static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
@@ -2573,6 +2570,30 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
        return true;
 }
 
+static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
+{
+       /*
+        * Intel CPUs mask the counter and pointers in quite a strange
+        * manner when ECX is zero, due to REP-string optimizations.
+        */
+#ifdef CONFIG_X86_64
+       if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
+               return;
+
+       *reg_write(ctxt, VCPU_REGS_RCX) = 0;
+
+       switch (ctxt->b) {
+       case 0xa4:      /* movsb */
+       case 0xa5:      /* movsd/w */
+               *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1;
+               /* fall through */
+       case 0xaa:      /* stosb */
+       case 0xab:      /* stosd/w */
+               *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1;
+       }
+#endif
+}
+
 static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
                                struct tss_segment_16 *tss)
 {
@@ -2849,7 +2870,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
        ulong old_tss_base =
                ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
        u32 desc_limit;
-       ulong desc_addr;
+       ulong desc_addr, dr7;
 
        /* FIXME: old_tss_base == ~0 ? */
 
@@ -2934,6 +2955,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
                ret = em_push(ctxt);
        }
 
+       ops->get_dr(ctxt, 7, &dr7);
+       ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
+
        return ret;
 }
 
@@ -3840,7 +3864,7 @@ static const struct opcode group5[] = {
        F(DstMem | SrcNone | Lock,              em_inc),
        F(DstMem | SrcNone | Lock,              em_dec),
        I(SrcMem | NearBranch,                  em_call_near_abs),
-       I(SrcMemFAddr | ImplicitOps | Stack,    em_call_far),
+       I(SrcMemFAddr | ImplicitOps,            em_call_far),
        I(SrcMem | NearBranch,                  em_jmp_abs),
        I(SrcMemFAddr | ImplicitOps,            em_jmp_far),
        I(SrcMem | Stack,                       em_push), D(Undefined),
@@ -4910,6 +4934,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
                if (ctxt->rep_prefix && (ctxt->d & String)) {
                        /* All REP prefixes have the same first termination condition */
                        if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+                               string_registers_quirk(ctxt);
                                ctxt->eip = ctxt->_eip;
                                ctxt->eflags &= ~X86_EFLAGS_RF;
                                goto done;
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 28146f0..856f791 100644
@@ -349,6 +349,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
        irqe.delivery_mode = entry->fields.delivery_mode << 8;
        irqe.level = 1;
        irqe.shorthand = 0;
+       irqe.msi_redir_hint = false;
 
        if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
                ioapic->irr_delivered |= 1 << irq;
@@ -637,11 +638,9 @@ void kvm_ioapic_destroy(struct kvm *kvm)
        struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
        cancel_delayed_work_sync(&ioapic->eoi_inject);
-       if (ioapic) {
-               kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
-               kvm->arch.vioapic = NULL;
-               kfree(ioapic);
-       }
+       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+       kvm->arch.vioapic = NULL;
+       kfree(ioapic);
 }
 
 int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 72298b3..9efff9e 100644
@@ -31,6 +31,8 @@
 
 #include "ioapic.h"
 
+#include "lapic.h"
+
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
                           struct kvm *kvm, int irq_source_id, int level,
                           bool line_status)
@@ -48,11 +50,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
                                line_status);
 }
 
-inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
-{
-       return irq->delivery_mode == APIC_DM_LOWEST;
-}
-
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
                struct kvm_lapic_irq *irq, unsigned long *dest_map)
 {
@@ -60,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
        struct kvm_vcpu *vcpu, *lowest = NULL;
 
        if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
-                       kvm_is_dm_lowest_prio(irq)) {
+                       kvm_lowest_prio_delivery(irq)) {
                printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
                irq->delivery_mode = APIC_DM_FIXED;
        }
@@ -76,7 +73,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
                                        irq->dest_id, irq->dest_mode))
                        continue;
 
-               if (!kvm_is_dm_lowest_prio(irq)) {
+               if (!kvm_lowest_prio_delivery(irq)) {
                        if (r < 0)
                                r = 0;
                        r += kvm_apic_set_irq(vcpu, irq, dest_map);
@@ -106,9 +103,10 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
        irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
        irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
        irq->delivery_mode = e->msi.data & 0x700;
+       irq->msi_redir_hint = ((e->msi.address_lo
+               & MSI_ADDR_REDIRECTION_LOWPRI) > 0);
        irq->level = 1;
        irq->shorthand = 0;
-       /* TODO Deal with RH bit of MSI message address */
 }
 
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
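
The new msi_redir_hint is taken straight from the MSI address
register.  For orientation, a summary of the relevant x86 MSI address
fields (values as in arch/x86/include/asm/msidef.h; this is background,
not part of the merge):

    /*
     * MSI address register, lower 32 bits:
     *   bits 31-20: 0xfee (fixed)
     *   bits 19-12: destination APIC ID  (MSI_ADDR_DEST_ID_SHIFT = 12)
     *   bit      3: redirection hint     (MSI_ADDR_REDIRECTION_LOWPRI)
     *   bit      2: destination mode     (MSI_ADDR_DEST_MODE_SHIFT = 2)
     *
     * With the redirection hint set, delivery goes to the lowest-
     * priority CPU among the destinations -- which is why
     * kvm_lowest_prio_delivery(), later in this merge, also checks
     * msi_redir_hint.
     */
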
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 629af0f..c789e00 100644
@@ -240,6 +240,15 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
        recalculate_apic_map(apic->vcpu->kvm);
 }
 
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id)
+{
+       u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+
+       apic_set_reg(apic, APIC_ID, id << 24);
+       apic_set_reg(apic, APIC_LDR, ldr);
+       recalculate_apic_map(apic->vcpu->kvm);
+}
+
 static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
 {
        return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
@@ -728,7 +737,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 
                dst = map->logical_map[cid];
 
-               if (irq->delivery_mode == APIC_DM_LOWEST) {
+               if (kvm_lowest_prio_delivery(irq)) {
                        int l = -1;
                        for_each_set_bit(i, &bitmap, 16) {
                                if (!dst[i])
@@ -914,9 +923,10 @@ static void apic_send_ipi(struct kvm_lapic *apic)
        irq.vector = icr_low & APIC_VECTOR_MASK;
        irq.delivery_mode = icr_low & APIC_MODE_MASK;
        irq.dest_mode = icr_low & APIC_DEST_MASK;
-       irq.level = icr_low & APIC_INT_ASSERT;
+       irq.level = (icr_low & APIC_INT_ASSERT) != 0;
        irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
        irq.shorthand = icr_low & APIC_SHORT_MASK;
+       irq.msi_redir_hint = false;
        if (apic_x2apic_mode(apic))
                irq.dest_id = icr_high;
        else
@@ -926,10 +936,11 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 
        apic_debug("icr_high 0x%x, icr_low 0x%x, "
                   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
-                  "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
+                  "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, "
+                  "msi_redir_hint 0x%x\n",
                   icr_high, icr_low, irq.shorthand, irq.dest_id,
                   irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
-                  irq.vector);
+                  irq.vector, irq.msi_redir_hint);
 
        kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
 }
@@ -1536,9 +1547,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 
        if ((old_value ^ value) & X2APIC_ENABLE) {
                if (value & X2APIC_ENABLE) {
-                       u32 id = kvm_apic_id(apic);
-                       u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
-                       kvm_apic_set_ldr(apic, ldr);
+                       kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
                        kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
                } else
                        kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
@@ -1557,7 +1566,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 
 }
 
-void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        struct kvm_lapic *apic;
        int i;
@@ -1571,19 +1580,22 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
        /* Stop the timer in case it's a reset to an active apic */
        hrtimer_cancel(&apic->lapic_timer.timer);
 
-       kvm_apic_set_id(apic, vcpu->vcpu_id);
+       if (!init_event)
+               kvm_apic_set_id(apic, vcpu->vcpu_id);
        kvm_apic_set_version(apic->vcpu);
 
        for (i = 0; i < APIC_LVT_NUM; i++)
                apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
        apic->lapic_timer.timer_mode = 0;
-       apic_set_reg(apic, APIC_LVT0,
-                    SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+       if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED))
+               apic_set_reg(apic, APIC_LVT0,
+                            SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
 
        apic_set_reg(apic, APIC_DFR, 0xffffffffU);
        apic_set_spiv(apic, 0xff);
        apic_set_reg(apic, APIC_TASKPRI, 0);
-       kvm_apic_set_ldr(apic, 0);
+       if (!apic_x2apic_mode(apic))
+               kvm_apic_set_ldr(apic, 0);
        apic_set_reg(apic, APIC_ESR, 0);
        apic_set_reg(apic, APIC_ICR, 0);
        apic_set_reg(apic, APIC_ICR2, 0);
@@ -1712,7 +1724,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
                        APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
 
        static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
-       kvm_lapic_reset(vcpu);
+       kvm_lapic_reset(vcpu, false);
        kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
 
        return 0;
@@ -2046,8 +2058,8 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
        pe = xchg(&apic->pending_events, 0);
 
        if (test_bit(KVM_APIC_INIT, &pe)) {
-               kvm_lapic_reset(vcpu);
-               kvm_vcpu_reset(vcpu);
+               kvm_lapic_reset(vcpu, true);
+               kvm_vcpu_reset(vcpu, true);
                if (kvm_vcpu_is_bsp(apic->vcpu))
                        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
                else
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 9d28383..71b150c 100644
@@ -48,7 +48,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
 void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
@@ -153,6 +153,12 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
        return vcpu->arch.apic->pending_events;
 }
 
+static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
+{
+       return (irq->delivery_mode == APIC_DM_LOWEST ||
+                       irq->msi_redir_hint);
+}
+
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
 void wait_lapic_expire(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d43867c..a65ce12 100644
@@ -804,30 +804,32 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
        return &slot->arch.lpage_info[level - 2][idx];
 }
 
-static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        struct kvm_memory_slot *slot;
        struct kvm_lpage_info *linfo;
+       gfn_t gfn;
        int i;
 
+       gfn = sp->gfn;
        slot = gfn_to_memslot(kvm, gfn);
-       for (i = PT_DIRECTORY_LEVEL;
-            i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                linfo = lpage_info_slot(gfn, slot, i);
                linfo->write_count += 1;
        }
        kvm->arch.indirect_shadow_pages++;
 }
 
-static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        struct kvm_memory_slot *slot;
        struct kvm_lpage_info *linfo;
+       gfn_t gfn;
        int i;
 
+       gfn = sp->gfn;
        slot = gfn_to_memslot(kvm, gfn);
-       for (i = PT_DIRECTORY_LEVEL;
-            i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                linfo = lpage_info_slot(gfn, slot, i);
                linfo->write_count -= 1;
                WARN_ON(linfo->write_count < 0);
@@ -858,8 +860,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 
        page_size = kvm_host_page_size(kvm, gfn);
 
-       for (i = PT_PAGE_TABLE_LEVEL;
-            i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
+       for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                if (page_size >= KVM_HPAGE_SIZE(i))
                        ret = i;
                else
@@ -1142,6 +1143,11 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
        return NULL;
 }
 
+#define for_each_rmap_spte(_rmap_, _iter_, _spte_)                         \
+          for (_spte_ = rmap_get_first(*_rmap_, _iter_);                   \
+               _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;});  \
+                       _spte_ = rmap_get_next(_iter_))
+
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
        if (mmu_spte_clear_track_bits(sptep))
@@ -1205,12 +1211,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
        struct rmap_iterator iter;
        bool flush = false;
 
-       for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-               BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+       for_each_rmap_spte(rmapp, &iter, sptep)
                flush |= spte_write_protect(kvm, sptep, pt_protect);
-               sptep = rmap_get_next(&iter);
-       }
 
        return flush;
 }
@@ -1232,12 +1234,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
        struct rmap_iterator iter;
        bool flush = false;
 
-       for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-               BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+       for_each_rmap_spte(rmapp, &iter, sptep)
                flush |= spte_clear_dirty(kvm, sptep);
-               sptep = rmap_get_next(&iter);
-       }
 
        return flush;
 }
@@ -1259,12 +1257,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
        struct rmap_iterator iter;
        bool flush = false;
 
-       for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-               BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+       for_each_rmap_spte(rmapp, &iter, sptep)
                flush |= spte_set_dirty(kvm, sptep);
-               sptep = rmap_get_next(&iter);
-       }
 
        return flush;
 }
@@ -1351,8 +1345,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 
        slot = gfn_to_memslot(kvm, gfn);
 
-       for (i = PT_PAGE_TABLE_LEVEL;
-            i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+       for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                rmapp = __gfn_to_rmap(gfn, i, slot);
                write_protected |= __rmap_write_protect(kvm, rmapp, true);
        }
@@ -1360,24 +1353,28 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
        return write_protected;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
-                          struct kvm_memory_slot *slot, gfn_t gfn, int level,
-                          unsigned long data)
+static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp)
 {
        u64 *sptep;
        struct rmap_iterator iter;
-       int need_tlb_flush = 0;
+       bool flush = false;
 
        while ((sptep = rmap_get_first(*rmapp, &iter))) {
                BUG_ON(!(*sptep & PT_PRESENT_MASK));
-               rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n",
-                            sptep, *sptep, gfn, level);
+               rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
 
                drop_spte(kvm, sptep);
-               need_tlb_flush = 1;
+               flush = true;
        }
 
-       return need_tlb_flush;
+       return flush;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+                          struct kvm_memory_slot *slot, gfn_t gfn, int level,
+                          unsigned long data)
+{
+       return kvm_zap_rmapp(kvm, rmapp);
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
@@ -1394,8 +1391,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
        WARN_ON(pte_huge(*ptep));
        new_pfn = pte_pfn(*ptep);
 
-       for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-               BUG_ON(!is_shadow_present_pte(*sptep));
+restart:
+       for_each_rmap_spte(rmapp, &iter, sptep) {
                rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
                             sptep, *sptep, gfn, level);
 
@@ -1403,7 +1400,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
                if (pte_write(*ptep)) {
                        drop_spte(kvm, sptep);
-                       sptep = rmap_get_first(*rmapp, &iter);
+                       goto restart;
                } else {
                        new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
                        new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1414,7 +1411,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
                        mmu_spte_clear_track_bits(sptep);
                        mmu_spte_set(sptep, new_spte);
-                       sptep = rmap_get_next(&iter);
                }
        }
 
@@ -1424,6 +1420,74 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
        return 0;
 }
 
+struct slot_rmap_walk_iterator {
+       /* input fields. */
+       struct kvm_memory_slot *slot;
+       gfn_t start_gfn;
+       gfn_t end_gfn;
+       int start_level;
+       int end_level;
+
+       /* output fields. */
+       gfn_t gfn;
+       unsigned long *rmap;
+       int level;
+
+       /* private field. */
+       unsigned long *end_rmap;
+};
+
+static void
+rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
+{
+       iterator->level = level;
+       iterator->gfn = iterator->start_gfn;
+       iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
+       iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
+                                          iterator->slot);
+}
+
+static void
+slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+                   struct kvm_memory_slot *slot, int start_level,
+                   int end_level, gfn_t start_gfn, gfn_t end_gfn)
+{
+       iterator->slot = slot;
+       iterator->start_level = start_level;
+       iterator->end_level = end_level;
+       iterator->start_gfn = start_gfn;
+       iterator->end_gfn = end_gfn;
+
+       rmap_walk_init_level(iterator, iterator->start_level);
+}
+
+static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
+{
+       return !!iterator->rmap;
+}
+
+static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
+{
+       if (++iterator->rmap <= iterator->end_rmap) {
+               iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+               return;
+       }
+
+       if (++iterator->level > iterator->end_level) {
+               iterator->rmap = NULL;
+               return;
+       }
+
+       rmap_walk_init_level(iterator, iterator->level);
+}
+
+#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,   \
+          _start_gfn, _end_gfn, _iter_)                                \
+       for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,         \
+                                _end_level_, _start_gfn, _end_gfn);    \
+            slot_rmap_walk_okay(_iter_);                               \
+            slot_rmap_walk_next(_iter_))
+
 static int kvm_handle_hva_range(struct kvm *kvm,
                                unsigned long start,
                                unsigned long end,
@@ -1435,10 +1499,10 @@ static int kvm_handle_hva_range(struct kvm *kvm,
                                               int level,
                                               unsigned long data))
 {
-       int j;
-       int ret = 0;
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
+       struct slot_rmap_walk_iterator iterator;
+       int ret = 0;
 
        slots = kvm_memslots(kvm);
 
@@ -1458,26 +1522,11 @@ static int kvm_handle_hva_range(struct kvm *kvm,
                gfn_start = hva_to_gfn_memslot(hva_start, memslot);
                gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
-               for (j = PT_PAGE_TABLE_LEVEL;
-                    j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
-                       unsigned long idx, idx_end;
-                       unsigned long *rmapp;
-                       gfn_t gfn = gfn_start;
-
-                       /*
-                        * {idx(page_j) | page_j intersects with
-                        *  [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
-                        */
-                       idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
-                       idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
-
-                       rmapp = __gfn_to_rmap(gfn_start, j, memslot);
-
-                       for (; idx <= idx_end;
-                              ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j)))
-                               ret |= handler(kvm, rmapp++, memslot,
-                                              gfn, j, data);
-               }
+               for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
+                               PT_MAX_HUGEPAGE_LEVEL, gfn_start, gfn_end - 1,
+                               &iterator)
+                       ret |= handler(kvm, iterator.rmap, memslot,
+                                      iterator.gfn, iterator.level, data);
        }
 
        return ret;
@@ -1518,16 +1567,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
        BUG_ON(!shadow_accessed_mask);
 
-       for (sptep = rmap_get_first(*rmapp, &iter); sptep;
-            sptep = rmap_get_next(&iter)) {
-               BUG_ON(!is_shadow_present_pte(*sptep));
-
+       for_each_rmap_spte(rmapp, &iter, sptep)
                if (*sptep & shadow_accessed_mask) {
                        young = 1;
                        clear_bit((ffs(shadow_accessed_mask) - 1),
                                 (unsigned long *)sptep);
                }
-       }
+
        trace_kvm_age_page(gfn, level, slot, young);
        return young;
 }
@@ -1548,15 +1594,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
        if (!shadow_accessed_mask)
                goto out;
 
-       for (sptep = rmap_get_first(*rmapp, &iter); sptep;
-            sptep = rmap_get_next(&iter)) {
-               BUG_ON(!is_shadow_present_pte(*sptep));
-
+       for_each_rmap_spte(rmapp, &iter, sptep)
                if (*sptep & shadow_accessed_mask) {
                        young = 1;
                        break;
                }
-       }
 out:
        return young;
 }
@@ -2093,7 +2135,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                if (level > PT_PAGE_TABLE_LEVEL && need_sync)
                        kvm_sync_pages(vcpu, gfn);
 
-               account_shadowed(vcpu->kvm, gfn);
+               account_shadowed(vcpu->kvm, sp);
        }
        sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
        init_shadow_page_table(sp);
@@ -2274,7 +2316,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
        kvm_mmu_unlink_parents(kvm, sp);
 
        if (!sp->role.invalid && !sp->role.direct)
-               unaccount_shadowed(kvm, sp->gfn);
+               unaccount_shadowed(kvm, sp);
 
        if (sp->unsync)
                kvm_unlink_unsync_page(kvm, sp);
@@ -2393,19 +2435,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
                         u64 start, u64 end)
 {
-       int i;
        u64 base, mask;
        u8 prev_match, curr_match;
-       int num_var_ranges = KVM_NR_VAR_MTRR;
+       int i, num_var_ranges = KVM_NR_VAR_MTRR;
 
-       if (!mtrr_state->enabled)
-               return 0xFF;
+       /* MTRR is completely disabled, use UC for all of physical memory. */
+       if (!(mtrr_state->enabled & 0x2))
+               return MTRR_TYPE_UNCACHABLE;
 
        /* Make end inclusive, instead of exclusive */
        end--;
 
        /* Look in fixed ranges. Just return the type as per start */
-       if (mtrr_state->have_fixed && (start < 0x100000)) {
+       if (mtrr_state->have_fixed && (mtrr_state->enabled & 0x1) &&
+             (start < 0x100000)) {
                int idx;
 
                if (start < 0x80000) {
@@ -2428,9 +2471,6 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
         * Look for multiple ranges matching this address and pick the type
         * as per MTRR precedence
         */
-       if (!(mtrr_state->enabled & 2))
-               return mtrr_state->def_type;
-
        prev_match = 0xFF;
        for (i = 0; i < num_var_ranges; ++i) {
                unsigned short start_state, end_state;
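For reference, the two enable bits tested above mirror IA32_MTRR_DEF_TYPE:
bit 1 of mtrr_state->enabled is the global MTRR enable (E) and bit 0 the
fixed-range enable (FE). A sketch of the decoding, with illustrative
helper names:

    #define MTRR_ENABLED            0x2     /* all MTRRs enabled (E) */
    #define MTRR_FIXED_ENABLED      0x1     /* fixed-range MTRRs enabled (FE) */

    static int mtrrs_active(unsigned char enabled)
    {
            return enabled & MTRR_ENABLED;
    }

    /* FE only matters while E is also set */
    static int fixed_mtrrs_active(unsigned char enabled)
    {
            return (enabled & MTRR_ENABLED) && (enabled & MTRR_FIXED_ENABLED);
    }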
@@ -2692,15 +2732,17 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
                                    u64 *start, u64 *end)
 {
        struct page *pages[PTE_PREFETCH_NUM];
+       struct kvm_memory_slot *slot;
        unsigned access = sp->role.access;
        int i, ret;
        gfn_t gfn;
 
        gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
-       if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
+       slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
+       if (!slot)
                return -1;
 
-       ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
+       ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
        if (ret <= 0)
                return -1;
 
@@ -3475,10 +3517,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                         gva_t gva, pfn_t *pfn, bool write, bool *writable)
 {
+       struct kvm_memory_slot *slot;
        bool async;
 
-       *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
-
+       slot = gfn_to_memslot(vcpu->kvm, gfn);
+       async = false;
+       *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
        if (!async)
                return false; /* *pfn has correct page already */
 
@@ -3492,8 +3536,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                        return true;
        }
 
-       *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
-
+       *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
        return false;
 }
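The rewritten try_async_pf() relies on the @async contract of
__gfn_to_pfn_memslot(): a non-NULL pointer requests a non-blocking lookup
that can report "fault this in asynchronously", while NULL blocks until
the page is resident. A standalone toy model of that two-phase pattern
(all names illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    static long lookup_pfn(unsigned long gfn, bool *async)
    {
            bool resident = gfn & 1;        /* pretend odd gfns are resident */

            if (async && !resident) {
                    *async = true;          /* caller may fault it in later */
                    return -1;
            }
            return gfn + 1000;              /* blocking path always succeeds */
    }

    int main(void)
    {
            bool async = false;
            long pfn = lookup_pfn(42, &async);

            if (async)
                    pfn = lookup_pfn(42, NULL);     /* synchronous retry */
            printf("pfn=%ld\n", pfn);
            return 0;
    }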
 
@@ -3736,8 +3779,8 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
        }
 }
 
-void update_permission_bitmask(struct kvm_vcpu *vcpu,
-               struct kvm_mmu *mmu, bool ept)
+static void update_permission_bitmask(struct kvm_vcpu *vcpu,
+                                     struct kvm_mmu *mmu, bool ept)
 {
        unsigned bit, byte, pfec;
        u8 map;
@@ -3918,6 +3961,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 {
        bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+       bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
        struct kvm_mmu *context = &vcpu->arch.mmu;
 
        MMU_WARN_ON(VALID_PAGE(context->root_hpa));
@@ -3936,6 +3980,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
        context->base_role.cr0_wp  = is_write_protection(vcpu);
        context->base_role.smep_andnot_wp
                = smep && !is_write_protection(vcpu);
+       context->base_role.smap_andnot_wp
+               = smap && !is_write_protection(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
@@ -4207,12 +4253,18 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                       const u8 *new, int bytes)
 {
        gfn_t gfn = gpa >> PAGE_SHIFT;
-       union kvm_mmu_page_role mask = { .word = 0 };
        struct kvm_mmu_page *sp;
        LIST_HEAD(invalid_list);
        u64 entry, gentry, *spte;
        int npte;
        bool remote_flush, local_flush, zap_page;
+       union kvm_mmu_page_role mask = { };
+
+       mask.cr0_wp = 1;
+       mask.cr4_pae = 1;
+       mask.nxe = 1;
+       mask.smep_andnot_wp = 1;
+       mask.smap_andnot_wp = 1;
 
        /*
         * If we don't have indirect shadow pages, it means no page is
@@ -4238,7 +4290,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        ++vcpu->kvm->stat.mmu_pte_write;
        kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 
-       mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
        for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
                if (detect_write_misaligned(sp, gpa, bytes) ||
                      detect_write_flooding(sp)) {
@@ -4412,36 +4463,113 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
        init_kvm_mmu(vcpu);
 }
 
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
-                                     struct kvm_memory_slot *memslot)
+/* The return value indicates whether a TLB flush on all vCPUs is needed. */
+typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap);
+
+/* The caller must hold mmu_lock before calling this function. */
+static bool
+slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                       slot_level_handler fn, int start_level, int end_level,
+                       gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
 {
-       gfn_t last_gfn;
-       int i;
+       struct slot_rmap_walk_iterator iterator;
        bool flush = false;
 
-       last_gfn = memslot->base_gfn + memslot->npages - 1;
+       for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+                       end_gfn, &iterator) {
+               if (iterator.rmap)
+                       flush |= fn(kvm, iterator.rmap);
 
-       spin_lock(&kvm->mmu_lock);
+               if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+                       if (flush && lock_flush_tlb) {
+                               kvm_flush_remote_tlbs(kvm);
+                               flush = false;
+                       }
+                       cond_resched_lock(&kvm->mmu_lock);
+               }
+       }
 
-       for (i = PT_PAGE_TABLE_LEVEL;
-            i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-               unsigned long *rmapp;
-               unsigned long last_index, index;
+       if (flush && lock_flush_tlb) {
+               kvm_flush_remote_tlbs(kvm);
+               flush = false;
+       }
+
+       return flush;
+}
 
-               rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
-               last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+static bool
+slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                 slot_level_handler fn, int start_level, int end_level,
+                 bool lock_flush_tlb)
+{
+       return slot_handle_level_range(kvm, memslot, fn, start_level,
+                       end_level, memslot->base_gfn,
+                       memslot->base_gfn + memslot->npages - 1,
+                       lock_flush_tlb);
+}
 
-               for (index = 0; index <= last_index; ++index, ++rmapp) {
-                       if (*rmapp)
-                               flush |= __rmap_write_protect(kvm, rmapp,
-                                               false);
+static bool
+slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                     slot_level_handler fn, bool lock_flush_tlb)
+{
+       return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+                                PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
 
-                       if (need_resched() || spin_needbreak(&kvm->mmu_lock))
-                               cond_resched_lock(&kvm->mmu_lock);
-               }
+static bool
+slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                       slot_level_handler fn, bool lock_flush_tlb)
+{
+       return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+                                PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static bool
+slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                slot_level_handler fn, bool lock_flush_tlb)
+{
+       return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+                                PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+}
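With these helpers a whole-slot operation reduces to a one-line handler
plus a single call; kvm_zap_gfn_range() below is the first user. A
hypothetical additional user might look like this (the my_* names are
illustrative; __rmap_write_protect() is defined earlier in this file):

    static bool my_write_protect(struct kvm *kvm, unsigned long *rmapp)
    {
            return __rmap_write_protect(kvm, rmapp, false);
    }

    static void my_protect_slot_4k(struct kvm *kvm,
                                   struct kvm_memory_slot *slot)
    {
            spin_lock(&kvm->mmu_lock);
            /* lock_flush_tlb=true: the helper flushes the TLB itself */
            slot_handle_leaf(kvm, slot, my_write_protect, true);
            spin_unlock(&kvm->mmu_lock);
    }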
+
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *memslot;
+
+       slots = kvm_memslots(kvm);
+
+       spin_lock(&kvm->mmu_lock);
+       kvm_for_each_memslot(memslot, slots) {
+               gfn_t start, end;
+
+               start = max(gfn_start, memslot->base_gfn);
+               end = min(gfn_end, memslot->base_gfn + memslot->npages);
+               if (start >= end)
+                       continue;
+
+               slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+                               PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
+                               start, end - 1, true);
        }
 
        spin_unlock(&kvm->mmu_lock);
+}
+
+static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp)
+{
+       return __rmap_write_protect(kvm, rmapp, false);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+                                     struct kvm_memory_slot *memslot)
+{
+       bool flush;
+
+       spin_lock(&kvm->mmu_lock);
+       flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
+                                     false);
+       spin_unlock(&kvm->mmu_lock);
 
        /*
         * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
@@ -4474,9 +4602,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
        pfn_t pfn;
        struct kvm_mmu_page *sp;
 
-       for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-               BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+restart:
+       for_each_rmap_spte(rmapp, &iter, sptep) {
                sp = page_header(__pa(sptep));
                pfn = spte_to_pfn(*sptep);
 
@@ -4491,71 +4618,31 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
                        !kvm_is_reserved_pfn(pfn) &&
                        PageTransCompound(pfn_to_page(pfn))) {
                        drop_spte(kvm, sptep);
-                       sptep = rmap_get_first(*rmapp, &iter);
                        need_tlb_flush = 1;
-               } else
-                       sptep = rmap_get_next(&iter);
+                       goto restart;
+               }
        }
 
        return need_tlb_flush;
 }
 
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                       struct kvm_memory_slot *memslot)
+                                  const struct kvm_memory_slot *memslot)
 {
-       bool flush = false;
-       unsigned long *rmapp;
-       unsigned long last_index, index;
-
+       /* FIXME: const-ify all uses of struct kvm_memory_slot.  */
        spin_lock(&kvm->mmu_lock);
-
-       rmapp = memslot->arch.rmap[0];
-       last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1,
-                               memslot->base_gfn, PT_PAGE_TABLE_LEVEL);
-
-       for (index = 0; index <= last_index; ++index, ++rmapp) {
-               if (*rmapp)
-                       flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp);
-
-               if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
-                       if (flush) {
-                               kvm_flush_remote_tlbs(kvm);
-                               flush = false;
-                       }
-                       cond_resched_lock(&kvm->mmu_lock);
-               }
-       }
-
-       if (flush)
-               kvm_flush_remote_tlbs(kvm);
-
+       slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
+                        kvm_mmu_zap_collapsible_spte, true);
        spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot)
 {
-       gfn_t last_gfn;
-       unsigned long *rmapp;
-       unsigned long last_index, index;
-       bool flush = false;
-
-       last_gfn = memslot->base_gfn + memslot->npages - 1;
+       bool flush;
 
        spin_lock(&kvm->mmu_lock);
-
-       rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
-       last_index = gfn_to_index(last_gfn, memslot->base_gfn,
-                       PT_PAGE_TABLE_LEVEL);
-
-       for (index = 0; index <= last_index; ++index, ++rmapp) {
-               if (*rmapp)
-                       flush |= __rmap_clear_dirty(kvm, rmapp);
-
-               if (need_resched() || spin_needbreak(&kvm->mmu_lock))
-                       cond_resched_lock(&kvm->mmu_lock);
-       }
-
+       flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
        spin_unlock(&kvm->mmu_lock);
 
        lockdep_assert_held(&kvm->slots_lock);
@@ -4574,31 +4661,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
                                        struct kvm_memory_slot *memslot)
 {
-       gfn_t last_gfn;
-       int i;
-       bool flush = false;
-
-       last_gfn = memslot->base_gfn + memslot->npages - 1;
+       bool flush;
 
        spin_lock(&kvm->mmu_lock);
-
-       for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */
-            i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-               unsigned long *rmapp;
-               unsigned long last_index, index;
-
-               rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
-               last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
-
-               for (index = 0; index <= last_index; ++index, ++rmapp) {
-                       if (*rmapp)
-                               flush |= __rmap_write_protect(kvm, rmapp,
-                                               false);
-
-                       if (need_resched() || spin_needbreak(&kvm->mmu_lock))
-                               cond_resched_lock(&kvm->mmu_lock);
-               }
-       }
+       flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
+                                       false);
        spin_unlock(&kvm->mmu_lock);
 
        /* see kvm_mmu_slot_remove_write_access */
@@ -4612,31 +4679,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
 void kvm_mmu_slot_set_dirty(struct kvm *kvm,
                            struct kvm_memory_slot *memslot)
 {
-       gfn_t last_gfn;
-       int i;
-       bool flush = false;
-
-       last_gfn = memslot->base_gfn + memslot->npages - 1;
+       bool flush;
 
        spin_lock(&kvm->mmu_lock);
-
-       for (i = PT_PAGE_TABLE_LEVEL;
-            i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-               unsigned long *rmapp;
-               unsigned long last_index, index;
-
-               rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
-               last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
-
-               for (index = 0; index <= last_index; ++index, ++rmapp) {
-                       if (*rmapp)
-                               flush |= __rmap_set_dirty(kvm, rmapp);
-
-                       if (need_resched() || spin_needbreak(&kvm->mmu_lock))
-                               cond_resched_lock(&kvm->mmu_lock);
-               }
-       }
-
+       flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
        spin_unlock(&kvm->mmu_lock);
 
        lockdep_assert_held(&kvm->slots_lock);
index c7d6563..398d21c 100644 (file)
@@ -43,6 +43,7 @@
 #define PT_PDPE_LEVEL 3
 #define PT_DIRECTORY_LEVEL 2
 #define PT_PAGE_TABLE_LEVEL 1
+#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
 
 static inline u64 rsvd_bits(int s, int e)
 {
@@ -71,8 +72,6 @@ enum {
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
-void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-               bool ept);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
@@ -166,8 +165,11 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
        int index = (pfec >> 1) +
                    (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
 
+       WARN_ON(pfec & PFERR_RSVD_MASK);
+
        return (mmu->permissions[index] >> pte_access) & 1;
 }
 
 void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
 #endif
index 9ade5cf..368d534 100644 (file)
@@ -197,13 +197,11 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 
        rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
 
-       for (sptep = rmap_get_first(*rmapp, &iter); sptep;
-            sptep = rmap_get_next(&iter)) {
+       for_each_rmap_spte(rmapp, &iter, sptep)
                if (is_writable_pte(*sptep))
                        audit_printk(kvm, "shadow page has writable "
                                     "mappings: gfn %llx role %x\n",
                                     sp->gfn, sp->role.word);
-       }
 }
 
 static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
index fd49c86..6e6d115 100644 (file)
@@ -718,6 +718,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
                                              mmu_is_nested(vcpu));
                if (likely(r != RET_MMIO_PF_INVALID))
                        return r;
+
+               /*
+                * A page fault with PFEC.RSVD = 1 is generated by the
+                * shadow MMU, not by the guest's page tables, so the
+                * error code must not be used to walk them.
+                */
+               error_code &= ~PFERR_RSVD_MASK;
        }
 
        r = mmu_topup_memory_caches(vcpu);
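The masking above depends on the x86 page-fault error-code layout; for
reference, the bit positions (mirroring the kernel's PFERR_* constants):

    #define PFERR_PRESENT_MASK      (1U << 0)  /* page was present */
    #define PFERR_WRITE_MASK        (1U << 1)  /* write access */
    #define PFERR_USER_MASK         (1U << 2)  /* user-mode access */
    #define PFERR_RSVD_MASK         (1U << 3)  /* reserved bit set */
    #define PFERR_FETCH_MASK        (1U << 4)  /* instruction fetch */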
index ce741b8..b9f9e10 100644 (file)
@@ -1082,7 +1082,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
        return target_tsc - tsc;
 }
 
-static void init_vmcb(struct vcpu_svm *svm)
+static void init_vmcb(struct vcpu_svm *svm, bool init_event)
 {
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1153,17 +1153,17 @@ static void init_vmcb(struct vcpu_svm *svm)
        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
-       svm_set_efer(&svm->vcpu, 0);
+       if (!init_event)
+               svm_set_efer(&svm->vcpu, 0);
        save->dr6 = 0xffff0ff0;
        kvm_set_rflags(&svm->vcpu, 2);
        save->rip = 0x0000fff0;
        svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
        /*
-        * This is the guest-visible cr0 value.
         * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
+        * It also updates the guest-visible cr0 value.
         */
-       svm->vcpu.arch.cr0 = 0;
        (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
 
        save->cr4 = X86_CR4_PAE;
@@ -1176,7 +1176,7 @@ static void init_vmcb(struct vcpu_svm *svm)
                clr_exception_intercept(svm, PF_VECTOR);
                clr_cr_intercept(svm, INTERCEPT_CR3_READ);
                clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
-               save->g_pat = 0x0007040600070406ULL;
+               save->g_pat = svm->vcpu.arch.pat;
                save->cr3 = 0;
                save->cr4 = 0;
        }
@@ -1195,13 +1195,19 @@ static void init_vmcb(struct vcpu_svm *svm)
        enable_gif(svm);
 }
 
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        u32 dummy;
        u32 eax = 1;
 
-       init_vmcb(svm);
+       if (!init_event) {
+               svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+                                          MSR_IA32_APICBASE_ENABLE;
+               if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
+                       svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+       }
+       init_vmcb(svm, init_event);
 
        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
        kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1257,12 +1263,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
        clear_page(svm->vmcb);
        svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
        svm->asid_generation = 0;
-       init_vmcb(svm);
-
-       svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
-                                  MSR_IA32_APICBASE_ENABLE;
-       if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
-               svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+       init_vmcb(svm, false);
 
        svm_init_osvw(&svm->vcpu);
 
@@ -1575,7 +1576,8 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
         * does not do it - this results in some delay at
         * reboot
         */
-       cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+       if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_CD_NW_CLEARED))
+               cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
        svm->vmcb->save.cr0 = cr0;
        mark_dirty(svm->vmcb, VMCB_CR);
        update_cr0_intercept(svm);
@@ -1883,7 +1885,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
         * so reinitialize it.
         */
        clear_page(svm->vmcb);
-       init_vmcb(svm);
+       init_vmcb(svm, false);
 
        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
@@ -4381,6 +4383,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .cache_reg = svm_cache_reg,
        .get_rflags = svm_get_rflags,
        .set_rflags = svm_set_rflags,
+       .fpu_activate = svm_fpu_activate,
        .fpu_deactivate = svm_fpu_deactivate,
 
        .tlb_flush = svm_flush_tlb,
index f7b6168..9cf5030 100644 (file)
@@ -2170,8 +2170,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 
        if (is_guest_mode(vcpu))
                msr_bitmap = vmx_msr_bitmap_nested;
-       else if (irqchip_in_kernel(vcpu->kvm) &&
-               apic_x2apic_mode(vcpu->arch.apic)) {
+       else if (vcpu->arch.apic_base & X2APIC_ENABLE) {
                if (is_long_mode(vcpu))
                        msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
                else
@@ -4667,16 +4666,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 
-       if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
-               u32 msr_low, msr_high;
-               u64 host_pat;
-               rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
-               host_pat = msr_low | ((u64) msr_high << 32);
-               /* Write the default value follow host pat */
-               vmcs_write64(GUEST_IA32_PAT, host_pat);
-               /* Keep arch.pat sync with GUEST_IA32_PAT */
-               vmx->vcpu.arch.pat = host_pat;
-       }
+       if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+               vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
 
        for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
                u32 index = vmx_msr_index[i];
@@ -4708,22 +4699,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        return 0;
 }
 
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct msr_data apic_base_msr;
+       u64 cr0;
 
        vmx->rmode.vm86_active = 0;
 
        vmx->soft_vnmi_blocked = 0;
 
        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
-       kvm_set_cr8(&vmx->vcpu, 0);
-       apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
-       if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
-               apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
-       apic_base_msr.host_initiated = true;
-       kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
+       kvm_set_cr8(vcpu, 0);
+
+       if (!init_event) {
+               apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
+                                    MSR_IA32_APICBASE_ENABLE;
+               if (kvm_vcpu_is_reset_bsp(vcpu))
+                       apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
+               apic_base_msr.host_initiated = true;
+               kvm_set_apic_base(vcpu, &apic_base_msr);
+       }
 
        vmx_segment_cache_clear(vmx);
 
@@ -4747,9 +4743,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
        vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
 
-       vmcs_write32(GUEST_SYSENTER_CS, 0);
-       vmcs_writel(GUEST_SYSENTER_ESP, 0);
-       vmcs_writel(GUEST_SYSENTER_EIP, 0);
+       if (!init_event) {
+               vmcs_write32(GUEST_SYSENTER_CS, 0);
+               vmcs_writel(GUEST_SYSENTER_ESP, 0);
+               vmcs_writel(GUEST_SYSENTER_EIP, 0);
+               vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+       }
 
        vmcs_writel(GUEST_RFLAGS, 0x02);
        kvm_rip_write(vcpu, 0xfff0);
@@ -4764,18 +4763,15 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 
-       /* Special registers */
-       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
        setup_msrs(vmx);
 
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
 
-       if (cpu_has_vmx_tpr_shadow()) {
+       if (cpu_has_vmx_tpr_shadow() && !init_event) {
                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
-               if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+               if (vm_need_tpr_shadow(vcpu->kvm))
                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
-                                    __pa(vmx->vcpu.arch.apic->regs));
+                                    __pa(vcpu->arch.apic->regs));
                vmcs_write32(TPR_THRESHOLD, 0);
        }
 
@@ -4787,12 +4783,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        if (vmx->vpid != 0)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
-       vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-       vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
-       vmx_set_cr4(&vmx->vcpu, 0);
-       vmx_set_efer(&vmx->vcpu, 0);
-       vmx_fpu_activate(&vmx->vcpu);
-       update_exception_bitmap(&vmx->vcpu);
+       cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+       vmx_set_cr0(vcpu, cr0); /* enter rmode */
+       vmx->vcpu.arch.cr0 = cr0;
+       vmx_set_cr4(vcpu, 0);
+       if (!init_event)
+               vmx_set_efer(vcpu, 0);
+       vmx_fpu_activate(vcpu);
+       update_exception_bitmap(vcpu);
 
        vpid_sync_context(vmx);
 }
@@ -5710,9 +5708,6 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       /* clear all local breakpoint enable flags */
-       vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155);
-
        /*
         * TODO: What about debug traps on tss switch?
         *       Are we supposed to inject them and update dr6?
@@ -7691,6 +7686,158 @@ static void kvm_flush_pml_buffers(struct kvm *kvm)
                kvm_vcpu_kick(vcpu);
 }
 
+static void vmx_dump_sel(char *name, uint32_t sel)
+{
+       pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
+              name, vmcs_read32(sel),
+              vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
+              vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
+              vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
+}
+
+static void vmx_dump_dtsel(char *name, uint32_t limit)
+{
+       pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
+              name, vmcs_read32(limit),
+              vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+}
+
+static void dump_vmcs(void)
+{
+       u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
+       u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
+       u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+       u32 secondary_exec_control = 0;
+       unsigned long cr4 = vmcs_readl(GUEST_CR4);
+       u64 efer = vmcs_readl(GUEST_IA32_EFER);
+       int i, n;
+
+       if (cpu_has_secondary_exec_ctrls())
+               secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+       pr_err("*** Guest State ***\n");
+       pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+              vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
+              vmcs_readl(CR0_GUEST_HOST_MASK));
+       pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+              cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
+       pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
+       if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
+           (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
+       {
+               pr_err("PDPTR0 = 0x%016lx  PDPTR1 = 0x%016lx\n",
+                      vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1));
+               pr_err("PDPTR2 = 0x%016lx  PDPTR3 = 0x%016lx\n",
+                      vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3));
+       }
+       pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
+              vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
+       pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
+              vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
+       pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+              vmcs_readl(GUEST_SYSENTER_ESP),
+              vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
+       vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
+       vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
+       vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
+       vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
+       vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
+       vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
+       vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
+       vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
+       vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
+       vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
+       if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
+           (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
+               pr_err("EFER =     0x%016llx  PAT = 0x%016lx\n",
+                      efer, vmcs_readl(GUEST_IA32_PAT));
+       pr_err("DebugCtl = 0x%016lx  DebugExceptions = 0x%016lx\n",
+              vmcs_readl(GUEST_IA32_DEBUGCTL),
+              vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
+       if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+               pr_err("PerfGlobCtl = 0x%016lx\n",
+                      vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL));
+       if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
+               pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS));
+       pr_err("Interruptibility = %08x  ActivityState = %08x\n",
+              vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
+              vmcs_read32(GUEST_ACTIVITY_STATE));
+       if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+               pr_err("InterruptStatus = %04x\n",
+                      vmcs_read16(GUEST_INTR_STATUS));
+
+       pr_err("*** Host State ***\n");
+       pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
+              vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
+       pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
+              vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
+              vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
+              vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
+              vmcs_read16(HOST_TR_SELECTOR));
+       pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
+              vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
+              vmcs_readl(HOST_TR_BASE));
+       pr_err("GDTBase=%016lx IDTBase=%016lx\n",
+              vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
+       pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
+              vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
+              vmcs_readl(HOST_CR4));
+       pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+              vmcs_readl(HOST_IA32_SYSENTER_ESP),
+              vmcs_read32(HOST_IA32_SYSENTER_CS),
+              vmcs_readl(HOST_IA32_SYSENTER_EIP));
+       if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
+               pr_err("EFER = 0x%016lx  PAT = 0x%016lx\n",
+                      vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT));
+       if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+               pr_err("PerfGlobCtl = 0x%016lx\n",
+                      vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL));
+
+       pr_err("*** Control State ***\n");
+       pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
+              pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
+       pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+       pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
+              vmcs_read32(EXCEPTION_BITMAP),
+              vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
+              vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
+       pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
+              vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+              vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
+              vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
+       pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
+              vmcs_read32(VM_EXIT_INTR_INFO),
+              vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+              vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+       pr_err("        reason=%08x qualification=%016lx\n",
+              vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
+       pr_err("IDTVectoring: info=%08x errcode=%08x\n",
+              vmcs_read32(IDT_VECTORING_INFO_FIELD),
+              vmcs_read32(IDT_VECTORING_ERROR_CODE));
+       pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
+       if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
+               pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+       if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
+               pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
+       if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
+               pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER));
+       n = vmcs_read32(CR3_TARGET_COUNT);
+       for (i = 0; i + 1 < n; i += 4)
+               pr_err("CR3 target%u=%016lx target%u=%016lx\n",
+                      i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
+                      i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
+       if (i < n)
+               pr_err("CR3 target%u=%016lx\n",
+                      i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
+       if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+               pr_err("PLE Gap=%08x Window=%08x\n",
+                      vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
+       if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
+               pr_err("Virtual processor ID = 0x%04x\n",
+                      vmcs_read16(VIRTUAL_PROCESSOR_ID));
+}
+
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -7723,6 +7870,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        }
 
        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+               dump_vmcs();
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = exit_reason;
@@ -8924,7 +9072,7 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
                                       struct vmx_msr_entry *e)
 {
        /* x2APIC MSR accesses are not allowed */
-       if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
+       if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
                return -EINVAL;
        if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
            e->index == MSR_IA32_UCODE_REV)
@@ -10185,6 +10333,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .cache_reg = vmx_cache_reg,
        .get_rflags = vmx_get_rflags,
        .set_rflags = vmx_set_rflags,
+       .fpu_activate = vmx_fpu_activate,
        .fpu_deactivate = vmx_fpu_deactivate,
 
        .tlb_flush = vmx_flush_tlb,
index c73efcd..79dde16 100644 (file)
@@ -99,6 +99,9 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 unsigned int min_timer_period_us = 500;
 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
 
+static bool __read_mostly kvmclock_periodic_sync = true;
+module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+
 bool kvm_has_tsc_control;
 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
 u32  kvm_max_guest_tsc_khz;
@@ -572,8 +575,7 @@ out:
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        unsigned long old_cr0 = kvm_read_cr0(vcpu);
-       unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
-                                   X86_CR0_CD | X86_CR0_NW;
+       unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
 
        cr0 |= X86_CR0_ET;
 
@@ -702,8 +704,9 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr);
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        unsigned long old_cr4 = kvm_read_cr4(vcpu);
-       unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
-                                  X86_CR4_PAE | X86_CR4_SMEP;
+       unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+                                  X86_CR4_SMEP | X86_CR4_SMAP;
+
        if (cr4 & CR4_RESERVED_BITS)
                return 1;
 
@@ -744,9 +747,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
                kvm_mmu_reset_context(vcpu);
 
-       if ((cr4 ^ old_cr4) & X86_CR4_SMAP)
-               update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false);
-
        if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
                kvm_update_cpuid(vcpu);
 
@@ -1700,6 +1700,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
                vcpu->pvclock_set_guest_stopped_request = false;
        }
 
+       pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO;
+
        /* If the host uses TSC clocksource, then it is stable */
        if (use_master_clock)
                pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
@@ -1770,6 +1772,9 @@ static void kvmclock_sync_fn(struct work_struct *work)
                                           kvmclock_sync_work);
        struct kvm *kvm = container_of(ka, struct kvm, arch);
 
+       if (!kvmclock_periodic_sync)
+               return;
+
        schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
                                        KVMCLOCK_SYNC_PERIOD);
@@ -1854,6 +1859,63 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 }
 EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
 
+static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
+{
+       struct mtrr_state_type *mtrr_state = &vcpu->arch.mtrr_state;
+       unsigned char mtrr_enabled = mtrr_state->enabled;
+       gfn_t start, end, mask;
+       int index;
+       bool is_fixed = true;
+
+       if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
+             !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+               return;
+
+       if (!(mtrr_enabled & 0x2) && msr != MSR_MTRRdefType)
+               return;
+
+       switch (msr) {
+       case MSR_MTRRfix64K_00000:
+               start = 0x0;
+               end = 0x80000;
+               break;
+       case MSR_MTRRfix16K_80000:
+               start = 0x80000;
+               end = 0xa0000;
+               break;
+       case MSR_MTRRfix16K_A0000:
+               start = 0xa0000;
+               end = 0xc0000;
+               break;
+       case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+               index = msr - MSR_MTRRfix4K_C0000;
+               start = 0xc0000 + index * (32 << 10);
+               end = start + (32 << 10);
+               break;
+       case MSR_MTRRdefType:
+               is_fixed = false;
+               start = 0x0;
+               end = ~0ULL;
+               break;
+       default:
+               /* variable range MTRRs. */
+               is_fixed = false;
+               index = (msr - 0x200) / 2;
+               start = (((u64)mtrr_state->var_ranges[index].base_hi) << 32) +
+                      (mtrr_state->var_ranges[index].base_lo & PAGE_MASK);
+               mask = (((u64)mtrr_state->var_ranges[index].mask_hi) << 32) +
+                      (mtrr_state->var_ranges[index].mask_lo & PAGE_MASK);
+               mask |= ~0ULL << cpuid_maxphyaddr(vcpu);
+
+               end = ((start & mask) | ~mask) + 1;
+       }
+
+       if (is_fixed && !(mtrr_enabled & 0x1))
+               return;
+
+       kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+}
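The variable-range arm computes the end as ((start & mask) | ~mask) + 1;
a standalone check of that arithmetic for a 256 MiB range based at 2 GiB:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t start = 0x80000000ULL;         /* base = 2 GiB */
            uint64_t mask  = ~0ULL << 28;           /* 256 MiB range */
            uint64_t end   = ((start & mask) | ~mask) + 1;

            /* prints: range 0x80000000 - 0x90000000 */
            printf("range 0x%" PRIx64 " - 0x%" PRIx64 "\n", start, end);
            return 0;
    }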
+
 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
@@ -1887,7 +1949,7 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                *pt = data;
        }
 
-       kvm_mmu_reset_context(vcpu);
+       update_mtrr(vcpu, msr);
        return 0;
 }
 
@@ -2222,6 +2284,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                                        &vcpu->requests);
 
                        ka->boot_vcpu_runs_old_kvmclock = tmp;
+
+                       ka->kvmclock_offset = -get_kernel_ns();
                }
 
                vcpu->arch.time = data;
@@ -2800,6 +2864,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_HYPERV_TIME:
        case KVM_CAP_IOAPIC_POLARITY_IGNORED:
        case KVM_CAP_TSC_DEADLINE_TIMER:
+       case KVM_CAP_ENABLE_CAP_VM:
+       case KVM_CAP_DISABLE_QUIRKS:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
        case KVM_CAP_ASSIGN_DEV_IRQ:
        case KVM_CAP_PCI_2_3:
@@ -3847,6 +3913,26 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
        return 0;
 }
 
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+                                  struct kvm_enable_cap *cap)
+{
+       int r;
+
+       if (cap->flags)
+               return -EINVAL;
+
+       switch (cap->cap) {
+       case KVM_CAP_DISABLE_QUIRKS:
+               kvm->arch.disabled_quirks = cap->args[0];
+               r = 0;
+               break;
+       default:
+               r = -EINVAL;
+               break;
+       }
+       return r;
+}
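From userspace the new VM-level capability is exercised through the
existing KVM_ENABLE_CAP ioctl on the VM fd; a minimal sketch, with error
handling omitted (KVM_QUIRK_CD_NW_CLEARED comes from the uapi header
touched earlier in this series):

    #include <linux/kvm.h>
    #include <string.h>
    #include <sys/ioctl.h>

    static int disable_cd_nw_quirk(int vm_fd)
    {
            struct kvm_enable_cap cap;

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_DISABLE_QUIRKS;
            cap.args[0] = KVM_QUIRK_CD_NW_CLEARED;
            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }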
+
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
@@ -4099,7 +4185,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = 0;
                break;
        }
+       case KVM_ENABLE_CAP: {
+               struct kvm_enable_cap cap;
 
+               r = -EFAULT;
+               if (copy_from_user(&cap, argp, sizeof(cap)))
+                       goto out;
+               r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+               break;
+       }
        default:
                r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
        }
@@ -5954,6 +6048,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
        lapic_irq.shorthand = 0;
        lapic_irq.dest_mode = 0;
        lapic_irq.dest_id = apicid;
+       lapic_irq.msi_redir_hint = false;
 
        lapic_irq.delivery_mode = APIC_DM_REMRD;
        kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
@@ -6197,6 +6292,8 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
                return;
 
        page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+       if (is_error_page(page))
+               return;
        kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
 
        /*
@@ -6347,7 +6444,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (req_immediate_exit)
                smp_send_reschedule(vcpu->cpu);
 
-       kvm_guest_enter();
+       __kvm_guest_enter();
 
        if (unlikely(vcpu->arch.switch_db_regs)) {
                set_debugreg(0, 7);
@@ -7003,7 +7100,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
        return 0;
 }
 
-int fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu, bool init_event)
 {
        int err;
 
@@ -7011,7 +7108,9 @@ int fx_init(struct kvm_vcpu *vcpu)
        if (err)
                return err;
 
-       fpu_finit(&vcpu->arch.guest_fpu);
+       if (!init_event)
+               fpu_finit(&vcpu->arch.guest_fpu);
+
        if (cpu_has_xsaves)
                vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
                        host_xcr0 | XSTATE_COMPACTION_ENABLED;
@@ -7053,14 +7152,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
        kvm_put_guest_xcr0(vcpu);
 
-       if (!vcpu->guest_fpu_loaded)
+       if (!vcpu->guest_fpu_loaded) {
+               vcpu->fpu_counter = 0;
                return;
+       }
 
        vcpu->guest_fpu_loaded = 0;
        fpu_save_init(&vcpu->arch.guest_fpu);
        __kernel_fpu_end();
        ++vcpu->stat.fpu_reload;
-       kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+       /*
+        * If using eager FPU mode, or if the guest is a frequent user
+        * of the FPU, just leave the FPU active for next time.
+        * fpu_counter rolls over to 0 every 255 runs, so a guest that
+        * uses the FPU only in bursts will revert to loading it on
+        * demand.
+        */
+       if (!vcpu->arch.eager_fpu) {
+               if (++vcpu->fpu_counter < 5)
+                       kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+       }
        trace_kvm_fpu(0);
 }
 
@@ -7076,11 +7186,21 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
                                                unsigned int id)
 {
+       struct kvm_vcpu *vcpu;
+
        if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
                printk_once(KERN_WARNING
                "kvm: SMP vm created on host with unstable TSC; "
                "guest TSC will not be reliable\n");
-       return kvm_x86_ops->vcpu_create(kvm, id);
+
+       vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+
+       /*
+        * Activate the FPU unconditionally in case the guest needs eager
+        * FPU.  It will be deactivated soon if the guest doesn't use it.
+        */
+       kvm_x86_ops->fpu_activate(vcpu);
+       return vcpu;
 }
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
@@ -7091,7 +7211,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        r = vcpu_load(vcpu);
        if (r)
                return r;
-       kvm_vcpu_reset(vcpu);
+       kvm_vcpu_reset(vcpu, false);
        kvm_mmu_setup(vcpu);
        vcpu_put(vcpu);
 
@@ -7111,6 +7231,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
        kvm_write_tsc(vcpu, &msr);
        vcpu_put(vcpu);
 
+       if (!kvmclock_periodic_sync)
+               return;
+
        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
                                        KVMCLOCK_SYNC_PERIOD);
 }
@@ -7129,7 +7252,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_x86_ops->vcpu_free(vcpu);
 }
 
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        atomic_set(&vcpu->arch.nmi_queued, 0);
        vcpu->arch.nmi_pending = 0;
@@ -7156,13 +7279,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
        kvm_async_pf_hash_reset(vcpu);
        vcpu->arch.apf.halted = false;
 
-       kvm_pmu_reset(vcpu);
+       if (!init_event)
+               kvm_pmu_reset(vcpu);
 
        memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
        vcpu->arch.regs_avail = ~0;
        vcpu->arch.regs_dirty = ~0;
 
-       kvm_x86_ops->vcpu_reset(vcpu);
+       kvm_x86_ops->vcpu_reset(vcpu, init_event);
 }
 
 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -7351,7 +7475,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                goto fail_free_mce_banks;
        }
 
-       r = fx_init(vcpu);
+       r = fx_init(vcpu, false);
        if (r)
                goto fail_free_wbinvd_dirty_mask;
 
@@ -7363,6 +7487,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
 
+       vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
        kvm_async_pf_hash_reset(vcpu);
        kvm_pmu_init(vcpu);
 
@@ -7576,7 +7702,7 @@ out_free:
        return -ENOMEM;
 }
 
-void kvm_arch_memslots_updated(struct kvm *kvm)
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
 {
        /*
         * memslots->generation has been incremented.
@@ -7587,7 +7713,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm)
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
-                               struct kvm_userspace_memory_region *mem,
+                               const struct kvm_userspace_memory_region *mem,
                                enum kvm_mr_change change)
 {
        /*
@@ -7665,14 +7791,14 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem,
+                               const struct kvm_userspace_memory_region *mem,
                                const struct kvm_memory_slot *old,
+                               const struct kvm_memory_slot *new,
                                enum kvm_mr_change change)
 {
-       struct kvm_memory_slot *new;
        int nr_mmu_pages = 0;
 
-       if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
+       if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) {
                int ret;
 
                ret = vm_munmap(old->userspace_addr,
@@ -7689,9 +7815,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
        if (nr_mmu_pages)
                kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
 
-       /* It's OK to get 'new' slot here as it has already been installed */
-       new = id_to_memslot(kvm->memslots, mem->slot);
-
        /*
         * Dirty logging tracks sptes in 4k granularity, meaning that large
         * sptes have to be split.  If live migration is successful, the guest
@@ -7716,9 +7839,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
         * been zapped, so no dirty logging tracking is needed for the old slot. For
         * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
         * new and it's also covered when dealing with the new slot.
+        *
+        * FIXME: const-ify all uses of struct kvm_memory_slot.
         */
        if (change != KVM_MR_DELETE)
-               kvm_mmu_slot_apply_flags(kvm, new);
+               kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
index f5fef18..01a1d01 100644 (file)
@@ -4,6 +4,8 @@
 #include <linux/kvm_host.h>
 #include "kvm_cache_regs.h"
 
+#define MSR_IA32_CR_PAT_DEFAULT  0x0007040600070406ULL
+
 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.exception.pending = false;
index ad45054..a8bcbc9 100644 (file)
@@ -230,6 +230,7 @@ struct kvm_vcpu {
 
        int fpu_active;
        int guest_fpu_loaded, guest_xcr0_loaded;
+       unsigned char fpu_counter;
        wait_queue_head_t wq;
        struct pid *pid;
        int sigset_active;
@@ -500,21 +501,22 @@ enum kvm_mr_change {
 };
 
 int kvm_set_memory_region(struct kvm *kvm,
-                         struct kvm_userspace_memory_region *mem);
+                         const struct kvm_userspace_memory_region *mem);
 int __kvm_set_memory_region(struct kvm *kvm,
-                           struct kvm_userspace_memory_region *mem);
+                           const struct kvm_userspace_memory_region *mem);
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont);
 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
                            unsigned long npages);
-void kvm_arch_memslots_updated(struct kvm *kvm);
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
-                               struct kvm_userspace_memory_region *mem,
+                               const struct kvm_userspace_memory_region *mem,
                                enum kvm_mr_change change);
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem,
+                               const struct kvm_userspace_memory_region *mem,
                                const struct kvm_memory_slot *old,
+                               const struct kvm_memory_slot *new,
                                enum kvm_mr_change change);
 bool kvm_largepages_enabled(void);
 void kvm_disable_largepages(void);
@@ -524,8 +526,8 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm);
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
                                   struct kvm_memory_slot *slot);
 
-int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
-                           int nr_pages);
+int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
+                           struct page **pages, int nr_pages);
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
@@ -538,13 +540,13 @@ void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
-                      bool write_fault, bool *writable);
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
                      bool *writable);
 pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
+pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+                          bool *async, bool write_fault, bool *writable);
 
 void kvm_release_pfn_clean(pfn_t pfn);
 void kvm_set_pfn_dirty(pfn_t pfn);
@@ -762,16 +764,10 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
 }
 #endif
 
-static inline void kvm_guest_enter(void)
+/* must be called with irqs disabled */
+static inline void __kvm_guest_enter(void)
 {
-       unsigned long flags;
-
-       BUG_ON(preemptible());
-
-       local_irq_save(flags);
        guest_enter();
-       local_irq_restore(flags);
-
        /* KVM does not hold any references to rcu protected data when it
         * switches CPU into a guest mode. In fact switching to a guest mode
         * is very similar to exiting to userspace from rcu point of view. In
@@ -783,12 +779,27 @@ static inline void kvm_guest_enter(void)
                rcu_virt_note_context_switch(smp_processor_id());
 }
 
+/* must be called with irqs disabled */
+static inline void __kvm_guest_exit(void)
+{
+       guest_exit();
+}
+
+static inline void kvm_guest_enter(void)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __kvm_guest_enter();
+       local_irq_restore(flags);
+}
+
 static inline void kvm_guest_exit(void)
 {
        unsigned long flags;
 
        local_irq_save(flags);
-       guest_exit();
+       __kvm_guest_exit();
        local_irq_restore(flags);
 }
 
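The two hunks above split the guest-context helpers: __kvm_guest_enter()/__kvm_guest_exit() do the real work and must be called with interrupts disabled, while kvm_guest_enter()/kvm_guest_exit() keep the old irq-safe behaviour (minus the dropped BUG_ON(preemptible()) check). A minimal sketch of how an arch run loop that already executes irqs-off can use the new primitives; arch_vcpu_run() and enter_guest_mode() are hypothetical names, not part of this patch:

	/*
	 * A minimal sketch, not from this series: arch_vcpu_run() and
	 * enter_guest_mode() are hypothetical stand-ins for an arch's
	 * inner run loop, which already executes with interrupts off.
	 */
	static int arch_vcpu_run(struct kvm_vcpu *vcpu)
	{
		int exit_reason;

		local_irq_disable();
		__kvm_guest_enter();	/* no local_irq_save(): irqs already off */
		exit_reason = enter_guest_mode(vcpu);
		__kvm_guest_exit();
		local_irq_enable();

		return exit_reason;
	}
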
index 931da7e..1b47a18 100644 (file)
@@ -28,6 +28,7 @@ struct kvm_run;
 struct kvm_userspace_memory_region;
 struct kvm_vcpu;
 struct kvm_vcpu_init;
+struct kvm_memslots;
 
 enum kvm_mr_change;
 
index 4b60056..75bd9f7 100644 (file)
@@ -814,6 +814,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_INJECT_IRQ 113
 #define KVM_CAP_S390_IRQ_STATE 114
 #define KVM_CAP_PPC_HWRNG 115
+#define KVM_CAP_DISABLE_QUIRKS 116
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
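The hunk above (evidently include/uapi/linux/kvm.h, judging by the surrounding capability numbers) reserves KVM_CAP_DISABLE_QUIRKS as capability 116. A hedged userspace sketch of probing and enabling it as a VM capability follows; the use of args[0] as a bitmask of quirks to disable is an assumption based on how KVM_ENABLE_CAP arguments are usually passed, not something this hunk shows:

	/*
	 * Userspace sketch; the args[0] encoding (a bitmask of quirks
	 * to disable) is an assumption, not shown by the hunk above.
	 */
	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	static int disable_quirks(int vm_fd, __u64 quirk_mask)
	{
		struct kvm_enable_cap cap = {
			.cap = KVM_CAP_DISABLE_QUIRKS,
			.args[0] = quirk_mask,
		};

		if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DISABLE_QUIRKS) <= 0)
			return -1;	/* capability not offered by this kernel */

		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}
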
index 9097741..6c8e124 100644 (file)
@@ -103,8 +103,7 @@ static void hardware_disable_all(void);
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
 static void kvm_release_pfn_dirty(pfn_t pfn);
-static void mark_page_dirty_in_slot(struct kvm *kvm,
-                                   struct kvm_memory_slot *memslot, gfn_t gfn);
+static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
 
 __visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
@@ -440,13 +439,60 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
 
 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 
-static void kvm_init_memslots_id(struct kvm *kvm)
+static struct kvm_memslots *kvm_alloc_memslots(void)
 {
        int i;
-       struct kvm_memslots *slots = kvm->memslots;
+       struct kvm_memslots *slots;
 
+       slots = kvm_kvzalloc(sizeof(struct kvm_memslots));
+       if (!slots)
+               return NULL;
+
+       /*
+        * Init kvm generation close to the maximum to easily test the
+        * code of handling generation number wrap-around.
+        */
+       slots->generation = -150;
        for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
                slots->id_to_index[i] = slots->memslots[i].id = i;
+
+       return slots;
+}
+
+static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
+{
+       if (!memslot->dirty_bitmap)
+               return;
+
+       kvfree(memslot->dirty_bitmap);
+       memslot->dirty_bitmap = NULL;
+}
+
+/*
+ * Free any memory in @free but not in @dont.
+ */
+static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
+                             struct kvm_memory_slot *dont)
+{
+       if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
+               kvm_destroy_dirty_bitmap(free);
+
+       kvm_arch_free_memslot(kvm, free, dont);
+
+       free->npages = 0;
+}
+
+static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
+{
+       struct kvm_memory_slot *memslot;
+
+       if (!slots)
+               return;
+
+       kvm_for_each_memslot(memslot, slots)
+               kvm_free_memslot(kvm, memslot, NULL);
+
+       kvfree(slots);
 }
 
 static struct kvm *kvm_create_vm(unsigned long type)
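kvm_alloc_memslots() keeps the trick of starting the generation at -150: stored in an unsigned counter it sits just below the wrap point, and since each memslot update bumps the generation twice (the low bit marks an update in flight, per the comment in install_new_memslots() further down), about 75 updates exercise the wrap-around handling. A standalone illustration, not kernel code, with the counter width assumed to be 64 bits:

	/*
	 * Standalone illustration (not kernel code); the 64-bit counter
	 * width is an assumption.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long gen = -150;	/* i.e. 2^64 - 150 */
		int update;

		for (update = 1; update <= 75; update++) {
			gen++;		/* update in flight: low bit set */
			gen++;		/* update complete: low bit cleared */
			if (gen < 2)
				printf("wrapped on update %d, gen=%llu\n",
				       update, gen);
		}
		return 0;
	}
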
@@ -472,17 +518,10 @@ static struct kvm *kvm_create_vm(unsigned long type)
        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
        r = -ENOMEM;
-       kvm->memslots = kvm_kvzalloc(sizeof(struct kvm_memslots));
+       kvm->memslots = kvm_alloc_memslots();
        if (!kvm->memslots)
                goto out_err_no_srcu;
 
-       /*
-        * Init kvm generation close to the maximum to easily test the
-        * code of handling generation number wrap-around.
-        */
-       kvm->memslots->generation = -150;
-
-       kvm_init_memslots_id(kvm);
        if (init_srcu_struct(&kvm->srcu))
                goto out_err_no_srcu;
        if (init_srcu_struct(&kvm->irq_srcu))
@@ -523,7 +562,7 @@ out_err_no_srcu:
 out_err_no_disable:
        for (i = 0; i < KVM_NR_BUSES; i++)
                kfree(kvm->buses[i]);
-       kvfree(kvm->memslots);
+       kvm_free_memslots(kvm, kvm->memslots);
        kvm_arch_free_vm(kvm);
        return ERR_PTR(r);
 }
@@ -540,40 +579,6 @@ void *kvm_kvzalloc(unsigned long size)
                return kzalloc(size, GFP_KERNEL);
 }
 
-static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
-{
-       if (!memslot->dirty_bitmap)
-               return;
-
-       kvfree(memslot->dirty_bitmap);
-       memslot->dirty_bitmap = NULL;
-}
-
-/*
- * Free any memory in @free but not in @dont.
- */
-static void kvm_free_physmem_slot(struct kvm *kvm, struct kvm_memory_slot *free,
-                                 struct kvm_memory_slot *dont)
-{
-       if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
-               kvm_destroy_dirty_bitmap(free);
-
-       kvm_arch_free_memslot(kvm, free, dont);
-
-       free->npages = 0;
-}
-
-static void kvm_free_physmem(struct kvm *kvm)
-{
-       struct kvm_memslots *slots = kvm->memslots;
-       struct kvm_memory_slot *memslot;
-
-       kvm_for_each_memslot(memslot, slots)
-               kvm_free_physmem_slot(kvm, memslot, NULL);
-
-       kvfree(kvm->memslots);
-}
-
 static void kvm_destroy_devices(struct kvm *kvm)
 {
        struct list_head *node, *tmp;
@@ -607,7 +612,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 #endif
        kvm_arch_destroy_vm(kvm);
        kvm_destroy_devices(kvm);
-       kvm_free_physmem(kvm);
+       kvm_free_memslots(kvm, kvm->memslots);
        cleanup_srcu_struct(&kvm->irq_srcu);
        cleanup_srcu_struct(&kvm->srcu);
        kvm_arch_free_vm(kvm);
@@ -670,8 +675,6 @@ static void update_memslots(struct kvm_memslots *slots,
        WARN_ON(mslots[i].id != id);
        if (!new->npages) {
                WARN_ON(!mslots[i].npages);
-               new->base_gfn = 0;
-               new->flags = 0;
                if (mslots[i].npages)
                        slots->used_slots--;
        } else {
@@ -711,7 +714,7 @@ static void update_memslots(struct kvm_memslots *slots,
        slots->id_to_index[mslots[i].id] = i;
 }
 
-static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
+static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
 {
        u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
@@ -728,7 +731,7 @@ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
 static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
                struct kvm_memslots *slots)
 {
-       struct kvm_memslots *old_memslots = kvm->memslots;
+       struct kvm_memslots *old_memslots = kvm_memslots(kvm);
 
        /*
         * Set the low bit in the generation, which disables SPTE caching
@@ -747,7 +750,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
         */
        slots->generation++;
 
-       kvm_arch_memslots_updated(kvm);
+       kvm_arch_memslots_updated(kvm, slots);
 
        return old_memslots;
 }
@@ -761,7 +764,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
  * Must be called holding kvm->slots_lock for write.
  */
 int __kvm_set_memory_region(struct kvm *kvm,
-                           struct kvm_userspace_memory_region *mem)
+                           const struct kvm_userspace_memory_region *mem)
 {
        int r;
        gfn_t base_gfn;
@@ -793,16 +796,13 @@ int __kvm_set_memory_region(struct kvm *kvm,
        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
                goto out;
 
-       slot = id_to_memslot(kvm->memslots, mem->slot);
+       slot = id_to_memslot(kvm_memslots(kvm), mem->slot);
        base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
        npages = mem->memory_size >> PAGE_SHIFT;
 
        if (npages > KVM_MEM_MAX_NR_PAGES)
                goto out;
 
-       if (!npages)
-               mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
-
        new = old = *slot;
 
        new.id = mem->slot;
@@ -828,15 +828,19 @@ int __kvm_set_memory_region(struct kvm *kvm,
                                goto out;
                        }
                }
-       } else if (old.npages) {
+       } else {
+               if (!old.npages)
+                       goto out;
+
                change = KVM_MR_DELETE;
-       } else /* Modify a non-existent slot: disallowed. */
-               goto out;
+               new.base_gfn = 0;
+               new.flags = 0;
+       }
 
        if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
                /* Check for overlaps */
                r = -EEXIST;
-               kvm_for_each_memslot(slot, kvm->memslots) {
+               kvm_for_each_memslot(slot, kvm_memslots(kvm)) {
                        if ((slot->id >= KVM_USER_MEM_SLOTS) ||
                            (slot->id == mem->slot))
                                continue;
@@ -867,7 +871,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
        slots = kvm_kvzalloc(sizeof(struct kvm_memslots));
        if (!slots)
                goto out_free;
-       memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+       memcpy(slots, kvm_memslots(kvm), sizeof(struct kvm_memslots));
 
        if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
                slot = id_to_memslot(slots, mem->slot);
@@ -898,7 +902,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
        if (r)
                goto out_slots;
 
-       /* actual memory is freed via old in kvm_free_physmem_slot below */
+       /* actual memory is freed via old in kvm_free_memslot below */
        if (change == KVM_MR_DELETE) {
                new.dirty_bitmap = NULL;
                memset(&new.arch, 0, sizeof(new.arch));
@@ -907,9 +911,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
        update_memslots(slots, &new);
        old_memslots = install_new_memslots(kvm, slots);
 
-       kvm_arch_commit_memory_region(kvm, mem, &old, change);
+       kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
 
-       kvm_free_physmem_slot(kvm, &old, &new);
+       kvm_free_memslot(kvm, &old, &new);
        kvfree(old_memslots);
 
        /*
@@ -931,14 +935,14 @@ int __kvm_set_memory_region(struct kvm *kvm,
 out_slots:
        kvfree(slots);
 out_free:
-       kvm_free_physmem_slot(kvm, &new, &old);
+       kvm_free_memslot(kvm, &new, &old);
 out:
        return r;
 }
 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
 
 int kvm_set_memory_region(struct kvm *kvm,
-                         struct kvm_userspace_memory_region *mem)
+                         const struct kvm_userspace_memory_region *mem)
 {
        int r;
 
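Constifying mem is possible because the one place that wrote through it is gone: the KVM_MEM_LOG_DIRTY_PAGES scrubbing for zero-sized regions (removed above); the equivalent base_gfn/flags zeroing now happens on the kernel-private new copy in the KVM_MR_DELETE path. From userspace nothing changes; a sketch of the caller's side of KVM_SET_USER_MEMORY_REGION, with made-up addresses and sizes:

	/* userspace side of the ioctl constified above; values made up */
	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	static int map_then_delete(int vm_fd, void *host_mem)
	{
		struct kvm_userspace_memory_region region = {
			.slot = 0,
			.guest_phys_addr = 0x100000,
			.memory_size = 0x200000,	/* 2 MiB backing */
			.userspace_addr = (unsigned long)host_mem,
		};

		if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
			return -1;

		/* memory_size == 0 deletes the slot; base_gfn/flags are
		 * now zeroed by the kernel on its private copy, not
		 * through *mem */
		region.memory_size = 0;
		return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
	}
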
@@ -954,12 +958,14 @@ static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
 {
        if (mem->slot >= KVM_USER_MEM_SLOTS)
                return -EINVAL;
+
        return kvm_set_memory_region(kvm, mem);
 }
 
 int kvm_get_dirty_log(struct kvm *kvm,
                        struct kvm_dirty_log *log, int *is_dirty)
 {
+       struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        int r, i;
        unsigned long n;
@@ -969,7 +975,8 @@ int kvm_get_dirty_log(struct kvm *kvm,
        if (log->slot >= KVM_USER_MEM_SLOTS)
                goto out;
 
-       memslot = id_to_memslot(kvm->memslots, log->slot);
+       slots = kvm_memslots(kvm);
+       memslot = id_to_memslot(slots, log->slot);
        r = -ENOENT;
        if (!memslot->dirty_bitmap)
                goto out;
@@ -1018,6 +1025,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 int kvm_get_dirty_log_protect(struct kvm *kvm,
                        struct kvm_dirty_log *log, bool *is_dirty)
 {
+       struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        int r, i;
        unsigned long n;
@@ -1028,7 +1036,8 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
        if (log->slot >= KVM_USER_MEM_SLOTS)
                goto out;
 
-       memslot = id_to_memslot(kvm->memslots, log->slot);
+       slots = kvm_memslots(kvm);
+       memslot = id_to_memslot(slots, log->slot);
 
        dirty_bitmap = memslot->dirty_bitmap;
        r = -ENOENT;
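Both dirty-log helpers now snapshot kvm_memslots(kvm) into a local slots pointer before calling id_to_memslot(), in line with the new allocation scheme. The userspace interface is untouched; a sketch of the caller's side, assuming the caller knows the slot's size in pages:

	/* userspace side; npages must match the slot's size in pages */
	#include <linux/kvm.h>
	#include <string.h>
	#include <sys/ioctl.h>

	static int fetch_dirty_bitmap(int vm_fd, __u32 slot, void *bitmap,
				      size_t npages)
	{
		struct kvm_dirty_log log = {
			.slot = slot,
			.dirty_bitmap = bitmap,	/* one bit per page */
		};

		/* the kernel rounds the bitmap up to 64-bit multiples */
		memset(bitmap, 0, ((npages + 63) / 64) * 8);

		return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
	}
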
@@ -1355,9 +1364,8 @@ exit:
        return pfn;
 }
 
-static pfn_t
-__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
-                    bool *async, bool write_fault, bool *writable)
+pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+                          bool *async, bool write_fault, bool *writable)
 {
        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
 
@@ -1376,44 +1384,13 @@ __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
        return hva_to_pfn(addr, atomic, async, write_fault,
                          writable);
 }
-
-static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
-                         bool write_fault, bool *writable)
-{
-       struct kvm_memory_slot *slot;
-
-       if (async)
-               *async = false;
-
-       slot = gfn_to_memslot(kvm, gfn);
-
-       return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
-                                   writable);
-}
-
-pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
-{
-       return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
-
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
-                      bool write_fault, bool *writable)
-{
-       return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
-
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
-{
-       return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn);
+EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
 
 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
                      bool *writable)
 {
-       return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
+       return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
+                                   write_fault, writable);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
@@ -1421,6 +1398,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
        return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
 }
+EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 {
@@ -1428,13 +1406,25 @@ pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
 
-int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
-                                                                 int nr_pages)
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
+{
+       return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
+
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+{
+       return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn);
+
+int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
+                           struct page **pages, int nr_pages)
 {
        unsigned long addr;
        gfn_t entry;
 
-       addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
+       addr = gfn_to_hva_many(slot, gfn, &entry);
        if (kvm_is_error_hva(addr))
                return -1;
 
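With gfn_to_pfn_async() removed, __gfn_to_pfn_memslot() becomes the single exported worker and the remaining gfn_to_pfn*() variants are thin wrappers that do the memslot lookup themselves. A caller that still needs the async behaviour now looks the slot up explicitly, roughly as below; fault_pfn_for_gfn() is a hypothetical name:

	/*
	 * Hedged sketch of an async-capable caller, e.g. an arch fault
	 * path; fault_pfn_for_gfn() is not from this series.
	 */
	static pfn_t fault_pfn_for_gfn(struct kvm *kvm, gfn_t gfn, bool *async)
	{
		struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

		/* atomic=false, write_fault=true, writability not reported */
		return __gfn_to_pfn_memslot(slot, gfn, false, async, true, NULL);
	}
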
@@ -1590,15 +1580,17 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
                         int offset, int len)
 {
        int r;
+       struct kvm_memory_slot *memslot;
        unsigned long addr;
 
-       addr = gfn_to_hva(kvm, gfn);
+       memslot = gfn_to_memslot(kvm, gfn);
+       addr = gfn_to_hva_memslot(memslot, gfn);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
        r = __copy_to_user((void __user *)addr + offset, data, len);
        if (r)
                return -EFAULT;
-       mark_page_dirty(kvm, gfn);
+       mark_page_dirty_in_slot(memslot, gfn);
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
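kvm_write_guest_page() now resolves the memslot once and reuses it for both the hva translation and the dirty marking, saving the second lookup that mark_page_dirty() would do. The same pattern applied to a hypothetical zeroing helper, as a sketch (kvm_zero_guest_page() is not a real function):

	/*
	 * Sketch only: kvm_zero_guest_page() is hypothetical, shown to
	 * illustrate the single-lookup pattern used above.
	 */
	static int kvm_zero_guest_page(struct kvm *kvm, gfn_t gfn, int offset,
				       int len)
	{
		struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
		unsigned long addr = gfn_to_hva_memslot(memslot, gfn);

		if (kvm_is_error_hva(addr))
			return -EFAULT;
		if (clear_user((void __user *)(addr + offset), len))
			return -EFAULT;
		mark_page_dirty_in_slot(memslot, gfn);	/* slot already in hand */
		return 0;
	}
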
@@ -1681,7 +1673,7 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
        r = __copy_to_user((void __user *)ghc->hva, data, len);
        if (r)
                return -EFAULT;
-       mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+       mark_page_dirty_in_slot(ghc->memslot, ghc->gpa >> PAGE_SHIFT);
 
        return 0;
 }
@@ -1739,8 +1731,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest);
 
-static void mark_page_dirty_in_slot(struct kvm *kvm,
-                                   struct kvm_memory_slot *memslot,
+static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
                                    gfn_t gfn)
 {
        if (memslot && memslot->dirty_bitmap) {
@@ -1755,7 +1746,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
        struct kvm_memory_slot *memslot;
 
        memslot = gfn_to_memslot(kvm, gfn);
-       mark_page_dirty_in_slot(kvm, memslot, gfn);
+       mark_page_dirty_in_slot(memslot, gfn);
 }
 EXPORT_SYMBOL_GPL(mark_page_dirty);
 
@@ -2882,18 +2873,12 @@ static int hardware_enable_all(void)
 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
                           void *v)
 {
-       int cpu = (long)v;
-
        val &= ~CPU_TASKS_FROZEN;
        switch (val) {
        case CPU_DYING:
-               pr_info("kvm: disabling virtualization on CPU%d\n",
-                      cpu);
                hardware_disable();
                break;
        case CPU_STARTING:
-               pr_info("kvm: enabling virtualization on CPU%d\n",
-                      cpu);
                hardware_enable();
                break;
        }