KVM: stats: Support linear and logarithmic histogram statistics
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c6dc1b4..9425589 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -235,10 +235,9 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
        STATS_DESC_ICOUNTER(VM, mmu_unsync),
        STATS_DESC_ICOUNTER(VM, lpages),
        STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
+       STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
        STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
 };
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
-               sizeof(struct kvm_vm_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vm_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
@@ -278,8 +277,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        STATS_DESC_COUNTER(VCPU, directed_yield_successful),
        STATS_DESC_ICOUNTER(VCPU, guest_mode)
 };
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
-               sizeof(struct kvm_vcpu_stat) / sizeof(u64));
 
 const struct kvm_stats_header kvm_vcpu_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
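
The two hunks above trim the stats descriptor tables; the commit subject refers to the new linear and logarithmic histogram stat types that back them. The bucket selection those types imply is simple enough to sketch standalone. Below is a minimal illustration in plain C; the helper names are hypothetical stand-ins, not the kernel's exact API:

#include <stdio.h>

/* Hypothetical stand-ins for the histogram update helpers. */
static void linear_hist_update(unsigned long long *buckets, size_t size,
                               unsigned long long value, size_t bucket_size)
{
        /* Linear histogram: bucket index grows linearly with the value. */
        size_t index = value / bucket_size;

        if (index >= size)              /* last bucket absorbs overflow */
                index = size - 1;
        buckets[index]++;
}

static void log_hist_update(unsigned long long *buckets, size_t size,
                            unsigned long long value)
{
        /* Logarithmic histogram: bucket index is the position of the
         * value's most significant set bit (fls64-style), so each
         * successive bucket covers a doubling of the value range. */
        size_t index = 0;

        while (value) {
                value >>= 1;
                index++;
        }
        if (index >= size)
                index = size - 1;
        buckets[index]++;
}

int main(void)
{
        unsigned long long lin[8] = { 0 }, logb[8] = { 0 };

        linear_hist_update(lin, 8, 2500, 1000); /* 2500/1000 -> bucket 2 */
        log_hist_update(logb, 8, 2500);         /* msb is bit 12 -> capped to 7 */
        printf("%llu %llu\n", lin[2], logb[7]); /* prints "1 1" */
        return 0;
}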
@@ -485,7 +482,14 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 
-asmlinkage __visible noinstr void kvm_spurious_fault(void)
+/*
+ * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
+ *
+ * Hardware virtualization extension instructions may fault if a reboot turns
+ * off virtualization while processes are running.  Usually after catching the
+ * fault we just panic; during a reboot, the instruction is instead ignored.
+ */
+noinstr void kvm_spurious_fault(void)
 {
        /* Fault while not rebooting.  We want the trace. */
        BUG_ON(!kvm_rebooting);
@@ -1180,7 +1184,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
        if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
                for (i = 0; i < KVM_NR_DB_REGS; i++)
                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
-               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
        }
 }
 
@@ -3407,7 +3410,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                break;
        case MSR_KVM_ASYNC_PF_ACK:
-               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
                        return 1;
                if (data & 0x1) {
                        vcpu->arch.apf.pageready_pending = false;
@@ -3746,7 +3749,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = vcpu->arch.apf.msr_int_val;
                break;
        case MSR_KVM_ASYNC_PF_ACK:
-               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
                        return 1;
 
                msr_info->data = 0;
@@ -4310,12 +4313,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 
        static_call(kvm_x86_vcpu_put)(vcpu);
        vcpu->arch.last_host_tsc = rdtsc();
-       /*
-        * If userspace has set any breakpoints or watchpoints, dr6 is restored
-        * on every vmexit, but if not, we might have a stale dr6 from the
-        * guest. do_debug expects dr6 to be cleared after it runs, do the same.
-        */
-       set_debugreg(0, 6);
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -4358,8 +4355,17 @@ static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
 
 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
 {
-       return kvm_arch_interrupt_allowed(vcpu) &&
-               kvm_cpu_accept_dm_intr(vcpu);
+       /*
+        * Do not cause an interrupt window exit if an exception
+        * is pending or an event needs reinjection; userspace
+        * might want to inject the interrupt manually using KVM_SET_REGS
+        * or KVM_SET_SREGS.  For that to work, we must be at an
+        * instruction boundary with no events half-injected.
+        */
+       return (kvm_arch_interrupt_allowed(vcpu) &&
+               kvm_cpu_accept_dm_intr(vcpu) &&
+               !kvm_event_needs_reinjection(vcpu) &&
+               !vcpu->arch.exception.pending);
 }
 
 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
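
The stricter check above changes what userspace observes in run->ready_for_interrupt_injection: readiness now also implies no pending exception and no event awaiting reinjection. A minimal sketch of the userspace side, assuming a VMM that emulates the interrupt controller in userspace (no in-kernel irqchip, so KVM_INTERRUPT is valid) with an existing vcpu_fd and mmap'ed kvm_run; error handling elided:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static void maybe_inject_irq(int vcpu_fd, struct kvm_run *run, __u32 vector)
{
        /* Ask KVM to exit to userspace once the guest can take an IRQ. */
        run->request_interrupt_window = 1;

        if (run->ready_for_interrupt_injection) {
                struct kvm_interrupt irq = { .irq = vector };

                /* The vCPU is at an instruction boundary with nothing
                 * half-injected, so the interrupt cannot clobber a
                 * pending exception or a reinjected event. */
                ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
                run->request_interrupt_window = 0;
        }
}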
@@ -6558,9 +6564,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
         * there is no pkey in EPT page table for L1 guest or EPT
         * shadow page table for L2 guest.
         */
-       if (vcpu_match_mmio_gva(vcpu, gva)
-           && !permission_fault(vcpu, vcpu->arch.walk_mmu,
-                                vcpu->arch.mmio_access, 0, access)) {
+       if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
+           !permission_fault(vcpu, vcpu->arch.walk_mmu,
+                             vcpu->arch.mmio_access, 0, access))) {
                *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
                                        (gva & (PAGE_SIZE - 1));
                trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -8569,6 +8575,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated);
 
 static void kvm_apicv_init(struct kvm *kvm)
 {
+       mutex_init(&kvm->arch.apicv_update_lock);
+
        if (enable_apicv)
                clear_bit(APICV_INHIBIT_REASON_DISABLE,
                          &kvm->arch.apicv_inhibit_reasons);
@@ -9227,10 +9235,18 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
 
 void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 {
+       bool activate;
+
        if (!lapic_in_kernel(vcpu))
                return;
 
-       vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
+       mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+
+       activate = kvm_apicv_activated(vcpu->kvm);
+       if (vcpu->arch.apicv_active == activate)
+               goto out;
+
+       vcpu->arch.apicv_active = activate;
        kvm_apic_update_apicv(vcpu);
        static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
 
@@ -9242,54 +9258,45 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
         */
        if (!vcpu->arch.apicv_active)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+out:
+       mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
 
-/*
- * NOTE: Do not hold any lock prior to calling this.
- *
- * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
- * locked, because it calls __x86_set_memory_region() which does
- * synchronize_srcu(&kvm->srcu).
- */
-void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
 {
-       struct kvm_vcpu *except;
-       unsigned long old, new, expected;
+       unsigned long old, new;
 
        if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
            !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
                return;
 
-       old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
-       do {
-               expected = new = old;
-               if (activate)
-                       __clear_bit(bit, &new);
-               else
-                       __set_bit(bit, &new);
-               if (new == old)
-                       break;
-               old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
-       } while (old != expected);
+       old = new = kvm->arch.apicv_inhibit_reasons;
 
-       if (!!old == !!new)
-               return;
-
-       trace_kvm_apicv_update_request(activate, bit);
-       if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
-               static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate);
+       if (activate)
+               __clear_bit(bit, &new);
+       else
+               __set_bit(bit, &new);
+
+       if (!!old != !!new) {
+               trace_kvm_apicv_update_request(activate, bit);
+               kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
+               kvm->arch.apicv_inhibit_reasons = new;
+               if (new) {
+                       unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
+                       kvm_zap_gfn_range(kvm, gfn, gfn+1);
+               }
+       } else
+               kvm->arch.apicv_inhibit_reasons = new;
+}
+EXPORT_SYMBOL_GPL(__kvm_request_apicv_update);
 
-       /*
-        * Sending request to update APICV for all other vcpus,
-        * while update the calling vcpu immediately instead of
-        * waiting for another #VMEXIT to handle the request.
-        */
-       except = kvm_get_running_vcpu();
-       kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
-                                        except);
-       if (except)
-               kvm_vcpu_update_apicv(except);
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+{
+       mutex_lock(&kvm->arch.apicv_update_lock);
+       __kvm_request_apicv_update(kvm, activate, bit);
+       mutex_unlock(&kvm->arch.apicv_update_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
 
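Two things changed in tandem above: the open-coded cmpxchg loop is gone in favor of arch.apicv_update_lock, and vCPUs are only forced to refresh when the inhibit mask crosses between zero and non-zero (the !!old != !!new test); setting a second inhibit reason while one is already active just records the bit. The transition logic in isolation, as plain C with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

/* Returns true when toggling an inhibit reason changes whether APICv is
 * inhibited at all, i.e. when the mask crosses zero <-> non-zero. */
static bool apicv_refresh_needed(unsigned long *reasons, int bit, bool activate)
{
        unsigned long old = *reasons, new = old;

        if (activate)
                new &= ~(1ul << bit);   /* clear the inhibit reason */
        else
                new |= 1ul << bit;      /* set the inhibit reason */

        *reasons = new;
        return !!old != !!new;
}

int main(void)
{
        unsigned long reasons = 0;

        printf("%d\n", apicv_refresh_needed(&reasons, 1, false)); /* 1: 0 -> non-0 */
        printf("%d\n", apicv_refresh_needed(&reasons, 2, false)); /* 0: still inhibited */
        printf("%d\n", apicv_refresh_needed(&reasons, 1, true));  /* 0: one reason left */
        printf("%d\n", apicv_refresh_needed(&reasons, 2, true));  /* 1: non-0 -> 0 */
        return 0;
}

Note that in the hunk itself the KVM_REQ_APICV_UPDATE broadcast is issued before the new mask is written, and both happen under apicv_update_lock, so a vCPU handling the request in kvm_vcpu_update_apicv() always observes the final mask.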
@@ -9386,6 +9393,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        }
 
        if (kvm_request_pending(vcpu)) {
+               if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) {
+                       r = -EIO;
+                       goto out;
+               }
                if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
                        if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
                                r = 0;
@@ -9599,8 +9610,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                set_debugreg(vcpu->arch.eff_db[1], 1);
                set_debugreg(vcpu->arch.eff_db[2], 2);
                set_debugreg(vcpu->arch.eff_db[3], 3);
-               set_debugreg(vcpu->arch.dr6, 6);
-               vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
+       } else if (unlikely(hw_breakpoint_active())) {
+               set_debugreg(0, 7);
        }
 
        for (;;) {
@@ -9628,7 +9639,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
                kvm_update_dr0123(vcpu);
                kvm_update_dr7(vcpu);
-               vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
 
        /*
@@ -9965,7 +9975,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                goto out;
        }
 
-       if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+       if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
+           (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
                r = -EINVAL;
                goto out;
        }
@@ -10570,9 +10581,6 @@ static void store_regs(struct kvm_vcpu *vcpu)
 
 static int sync_regs(struct kvm_vcpu *vcpu)
 {
-       if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
-               return -EINVAL;
-
        if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
                __set_regs(vcpu, &vcpu->run->s.regs.regs);
                vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
@@ -10788,6 +10796,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        unsigned long old_cr0 = kvm_read_cr0(vcpu);
+       unsigned long new_cr0;
+       u32 eax, dummy;
 
        kvm_lapic_reset(vcpu, init_event);
 
@@ -10854,10 +10864,41 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        vcpu->arch.regs_avail = ~0;
        vcpu->arch.regs_dirty = ~0;
 
+       /*
+        * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
+        * if no CPUID match is found.  Note, it's impossible to get a match at
+        * RESET since KVM emulates RESET before exposing the vCPU to userspace,
+        * i.e. it's impossible for kvm_cpuid() to find a valid entry on RESET.
+        * But, go through the motions in case that's ever remedied.
+        */
+       eax = 1;
+       if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true))
+               eax = 0x600;
+       kvm_rdx_write(vcpu, eax);
+
        vcpu->arch.ia32_xss = 0;
 
        static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
 
+       kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+       kvm_rip_write(vcpu, 0xfff0);
+
+       /*
+        * CR0.CD/NW are set on RESET, preserved on INIT.  Note, some versions
+        * of Intel's SDM list CD/NW as being set on INIT, but they contradict
+        * (or qualify) that with a footnote stating that CD/NW are preserved.
+        */
+       new_cr0 = X86_CR0_ET;
+       if (init_event)
+               new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD));
+       else
+               new_cr0 |= X86_CR0_NW | X86_CR0_CD;
+
+       static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
+       static_call(kvm_x86_set_cr4)(vcpu, 0);
+       static_call(kvm_x86_set_efer)(vcpu, 0);
+       static_call(kvm_x86_update_exception_bitmap)(vcpu);
+
        /*
         * Reset the MMU context if paging was enabled prior to INIT (which is
         * implied if CR0.PG=1 as CR0 will be '0' prior to RESET).  Unlike the
@@ -10868,7 +10909,20 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
         */
        if (old_cr0 & X86_CR0_PG)
                kvm_mmu_reset_context(vcpu);
+
+       /*
+        * Intel's SDM states that all TLB entries are flushed on INIT.  AMD's
+        * APM states the TLBs are untouched by INIT, but it also states that
+        * the TLBs are flushed on "External initialization of the processor."
+        * Flush the guest TLB regardless of vendor, there is no meaningful
+        * benefit in relying on the guest to flush the TLB immediately after
+        * INIT.  A spurious TLB flush is benign and likely negligible from a
+        * performance perspective.
+        */
+       if (init_event)
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_reset);
 
 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
 {
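
Two of the RESET defaults set above are worth unpacking. The 0x600 fallback written to RDX follows the standard CPUID.01H:EAX signature layout, and the CR0 logic sets CD/NW on RESET while preserving them across INIT. Decoding the fallback value (plain C, standalone):

#include <stdio.h>

int main(void)
{
        unsigned int eax = 0x600;       /* KVM's fallback RESET value */

        /* Standard CPUID signature layout; the extended family/model
         * fields (bits 27:20 and 19:16) are zero here. */
        unsigned int stepping = eax & 0xf;              /* 0 */
        unsigned int model    = (eax >> 4) & 0xf;       /* 0 */
        unsigned int family   = (eax >> 8) & 0xf;       /* 6: P6/Athlon class */

        printf("family %u, model %u, stepping %u\n", family, model, stepping);
        return 0;
}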
@@ -10985,9 +11039,6 @@ int kvm_arch_hardware_setup(void *opaque)
        int r;
 
        rdmsrl_safe(MSR_EFER, &host_efer);
-       if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) &&
-                        !(host_efer & EFER_NX)))
-               return -EIO;
 
        if (boot_cpu_has(X86_FEATURE_XSAVES))
                rdmsrl(MSR_IA32_XSS, host_xss);
@@ -11115,6 +11166,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm_hv_init_vm(kvm);
        kvm_page_track_init(kvm);
        kvm_mmu_init_vm(kvm);
+       kvm_xen_init_vm(kvm);
 
        return static_call(kvm_x86_vm_init)(kvm);
 }
@@ -11304,8 +11356,7 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
 
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                int level = i + 1;
-               int lpages = gfn_to_index(slot->base_gfn + npages - 1,
-                                         slot->base_gfn, level) + 1;
+               int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
 
                WARN_ON(slot->arch.rmap[i]);
 
@@ -11388,8 +11439,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
                int lpages;
                int level = i + 1;
 
-               lpages = gfn_to_index(slot->base_gfn + npages - 1,
-                                     slot->base_gfn, level) + 1;
+               lpages = __kvm_mmu_slot_lpages(slot, npages, level);
 
                linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
                if (!linfo)
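
Both hunks above replace the open-coded "gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1" with the new __kvm_mmu_slot_lpages() helper. The quantity being computed is the number of level-sized entries a slot spans, which depends on the slot's alignment as well as its size. A standalone version of the same arithmetic, assuming the usual x86 geometry of 9 GFN bits per level (4KiB, 2MiB, 1GiB pages):

#include <stdio.h>

/* GFN bits consumed below a given large-page level (level 1 = 4KiB). */
#define HPAGE_GFN_SHIFT(level)  (((level) - 1) * 9)

/* Number of level-sized entries covering npages 4KiB pages at base_gfn;
 * mirrors gfn_to_index(base + npages - 1, base, level) + 1. */
static unsigned long slot_lpages(unsigned long base_gfn,
                                 unsigned long npages, int level)
{
        unsigned long last_gfn = base_gfn + npages - 1;

        return (last_gfn >> HPAGE_GFN_SHIFT(level)) -
               (base_gfn >> HPAGE_GFN_SHIFT(level)) + 1;
}

int main(void)
{
        /* A 1GiB slot (262144 pages) needs one more 2MiB-level entry
         * when its base GFN is misaligned by a single page. */
        printf("%lu\n", slot_lpages(0, 262144, 2));     /* 512 */
        printf("%lu\n", slot_lpages(1, 262144, 2));     /* 513 */
        return 0;
}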
@@ -11473,7 +11523,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
 
 static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                                     struct kvm_memory_slot *old,
-                                    struct kvm_memory_slot *new,
+                                    const struct kvm_memory_slot *new,
                                     enum kvm_mr_change change)
 {
        bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
@@ -11553,10 +11603,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                kvm_mmu_change_mmu_pages(kvm,
                                kvm_mmu_calculate_default_mmu_pages(kvm));
 
-       /*
-        * FIXME: const-ify all uses of struct kvm_memory_slot.
-        */
-       kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
+       kvm_mmu_slot_apply_flags(kvm, old, new, change);
 
        /* Free the arrays associated with the old memslot. */
        if (change == KVM_MR_MOVE)