Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ffe5801..47d9f03 100644
@@ -1399,22 +1399,19 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 }
 EXPORT_SYMBOL_GPL(kvm_set_dr);
 
-void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
 {
        size_t size = ARRAY_SIZE(vcpu->arch.db);
 
        switch (dr) {
        case 0 ... 3:
-               *val = vcpu->arch.db[array_index_nospec(dr, size)];
-               break;
+               return vcpu->arch.db[array_index_nospec(dr, size)];
        case 4:
        case 6:
-               *val = vcpu->arch.dr6;
-               break;
+               return vcpu->arch.dr6;
        case 5:
        default: /* 7 */
-               *val = vcpu->arch.dr7;
-               break;
+               return vcpu->arch.dr7;
        }
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
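
With kvm_get_dr() returning the value directly, call sites shrink from an
out-parameter dance to a plain assignment. A hypothetical caller, before
and after:

	/* before */
	unsigned long val;
	kvm_get_dr(vcpu, 6, &val);

	/* after */
	unsigned long val = kvm_get_dr(vcpu, 6);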
@@ -2860,7 +2857,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
        return v * clock->mult;
 }
 
-static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
+/*
+ * As with get_kvmclock_base_ns(), this counts from boot time, at the
+ * frequency of CLOCK_MONOTONIC_RAW (hence adding gtod->offs_boot).
+ */
+static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp)
 {
        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
        unsigned long seq;
@@ -2879,6 +2880,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
        return mode;
 }
 
+/*
+ * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
+ * no boot time offset.
+ */
+static int do_monotonic(s64 *t, u64 *tsc_timestamp)
+{
+       struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+       unsigned long seq;
+       int mode;
+       u64 ns;
+
+       do {
+               seq = read_seqcount_begin(&gtod->seq);
+               ns = gtod->clock.base_cycles;
+               ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+               ns >>= gtod->clock.shift;
+               ns += ktime_to_ns(gtod->clock.offset);
+       } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+       *t = ns;
+
+       return mode;
+}
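
Taken together, the TSC-based readers differ only in the basis and final
offset applied to the shifted TSC delta. A summary sketch:

	/*
	 * do_kvmclock_base()  - boot-based ns at CLOCK_MONOTONIC_RAW frequency
	 *                       (folds in gtod->offs_boot, per its comment)
	 * do_monotonic()      - CLOCK_MONOTONIC ns, clock.offset only
	 * do_realtime()       - CLOCK_REALTIME as a timespec64
	 * Each returns the clocksource mode sampled under gtod->seq.
	 */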
+
 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
 {
        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
@@ -2900,18 +2924,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
        return mode;
 }
 
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
+ * reports the TSC value from which it did so. Returns true if host is
+ * using TSC based clocksource.
+ */
 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
 {
        /* checked again under seqlock below */
        if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
                return false;
 
-       return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
-                                                     tsc_timestamp));
+       return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
+                                                    tsc_timestamp));
 }
 
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ */
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+       /* checked again under seqlock below */
+       if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+               return false;
+
+       return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
+                                                tsc_timestamp));
+}
+
+/*
+ * Calculates CLOCK_REALTIME and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ *
+ * DO NOT USE this for anything related to migration. You want CLOCK_TAI
+ * for that.
+ */
 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
                                           u64 *tsc_timestamp)
 {
@@ -3158,7 +3206,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
 
        guest_hv_clock->version = ++vcpu->hv_clock.version;
 
-       mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+       kvm_gpc_mark_dirty_in_slot(gpc);
        read_unlock_irqrestore(&gpc->lock, flags);
 
        trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
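
kvm_gpc_mark_dirty_in_slot() folds the old open-coded call into the
pfn-cache API, using the slot and gpa the cache already tracks. Assuming
the helper mirrors the line it replaces, it is roughly:

	static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc)
	{
		lockdep_assert_held(&gpc->lock);
		if (gpc->memslot)
			mark_page_dirty_in_slot(gpc->kvm, gpc->memslot,
						gpc->gpa >> PAGE_SHIFT);
	}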
@@ -4680,7 +4728,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                    KVM_XEN_HVM_CONFIG_SHARED_INFO |
                    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
                    KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
-                   KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+                   KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE |
+                   KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA;
                if (sched_info_on())
                        r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
                             KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
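
The new KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA bit advertises that the Xen
shared-info page may be supplied as a host virtual address rather than a
GFN. A VMM would probe it the usual way (sketch; xen_use_hva_shinfo() is
a hypothetical callback):

	int caps = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);

	if (caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA)
		xen_use_hva_shinfo();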
@@ -5064,8 +5113,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
        int idx;
 
        if (vcpu->preempted) {
-               if (!vcpu->arch.guest_state_protected)
-                       vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+               vcpu->arch.preempted_in_kernel = kvm_arch_vcpu_in_kernel(vcpu);
 
                /*
                 * Take the srcu lock as memslots will be accessed to check the gfn
@@ -5512,18 +5560,23 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
                                             struct kvm_debugregs *dbgregs)
 {
-       unsigned long val;
+       unsigned int i;
 
        memset(dbgregs, 0, sizeof(*dbgregs));
-       memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
-       kvm_get_dr(vcpu, 6, &val);
-       dbgregs->dr6 = val;
+
+       BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
+       for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+               dbgregs->db[i] = vcpu->arch.db[i];
+
+       dbgregs->dr6 = vcpu->arch.dr6;
        dbgregs->dr7 = vcpu->arch.dr7;
 }
 
 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
                                            struct kvm_debugregs *dbgregs)
 {
+       unsigned int i;
+
        if (dbgregs->flags)
                return -EINVAL;
 
@@ -5532,7 +5585,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
        if (!kvm_dr7_valid(dbgregs->dr7))
                return -EINVAL;
 
-       memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+       for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+               vcpu->arch.db[i] = dbgregs->db[i];
+
        kvm_update_dr0123(vcpu);
        vcpu->arch.dr6 = dbgregs->dr6;
        vcpu->arch.dr7 = dbgregs->dr7;
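
The element-wise copies (with a BUILD_BUG_ON() size check on the get
side) replace memcpy() because the arrays differ in element type:
vcpu->arch.db is unsigned long[] while the UAPI struct uses __u64[]. A
byte-wise memcpy is only correct while both happen to be 8 bytes wide;
the loop stays correct regardless. The pattern in miniature (hypothetical
arrays):

	unsigned long internal[4];
	__u64 uapi[4];
	unsigned int i;

	BUILD_BUG_ON(ARRAY_SIZE(internal) != ARRAY_SIZE(uapi));
	for (i = 0; i < ARRAY_SIZE(internal); i++)
		uapi[i] = internal[i];	/* width-safe, per element */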
@@ -8180,10 +8235,9 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
        kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
 }
 
-static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
-                           unsigned long *dest)
+static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr)
 {
-       kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+       return kvm_get_dr(emul_to_vcpu(ctxt), dr);
 }
 
 static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
@@ -8405,12 +8459,9 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
        return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
 }
 
-static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
-                             u32 pmc)
+static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc)
 {
-       if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
-               return 0;
-       return -EINVAL;
+       return kvm_pmu_check_rdpmc_early(emul_to_vcpu(ctxt), pmc);
 }
 
 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -8542,7 +8593,7 @@ static const struct x86_emulate_ops emulate_ops = {
        .set_msr_with_filter = emulator_set_msr_with_filter,
        .get_msr_with_filter = emulator_get_msr_with_filter,
        .get_msr             = emulator_get_msr,
-       .check_pmc           = emulator_check_pmc,
+       .check_rdpmc_early   = emulator_check_rdpmc_early,
        .read_pmc            = emulator_read_pmc,
        .halt                = emulator_halt,
        .wbinvd              = emulator_wbinvd,
@@ -8803,31 +8854,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 
        kvm_release_pfn_clean(pfn);
 
-       /* The instructions are well-emulated on direct mmu. */
-       if (vcpu->arch.mmu->root_role.direct) {
-               unsigned int indirect_shadow_pages;
-
-               write_lock(&vcpu->kvm->mmu_lock);
-               indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
-               write_unlock(&vcpu->kvm->mmu_lock);
-
-               if (indirect_shadow_pages)
-                       kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
-
-               return true;
-       }
-
        /*
-        * if emulation was due to access to shadowed page table
-        * and it failed try to unshadow page and re-enter the
-        * guest to let CPU execute the instruction.
+        * If emulation may have been triggered by a write to a shadowed page
+        * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
+        * guest to let the CPU re-execute the instruction in the hope that the
+        * CPU can cleanly execute the instruction that KVM failed to emulate.
         */
-       kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+       if (vcpu->kvm->arch.indirect_shadow_pages)
+               kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 
        /*
-        * If the access faults on its page table, it can not
-        * be fixed by unprotecting shadow page and it should
-        * be reported to userspace.
+        * If the failed instruction faulted on an access to page tables that
+        * are used to translate any part of the instruction, KVM can't resolve
+        * the issue by unprotecting the gfn, as zapping the shadow page will
+        * result in the instruction taking a !PRESENT page fault and thus put
+        * the vCPU into an infinite loop of page faults.  E.g. KVM will create
+        * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
+        * then zap the SPTE to unprotect the gfn, and then do it all over
+        * again.  Report the error to userspace.
         */
        return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
 }
@@ -8922,7 +8966,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
        if (unlikely(!r))
                return 0;
 
-       kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+       kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
 
        /*
         * rflags is the old, "raw" value of the flags.  The new value has
@@ -9235,9 +9279,9 @@ writeback:
                 */
                if (!ctxt->have_exception ||
                    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
-                       kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+                       kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
                        if (ctxt->is_branch)
-                               kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+                               kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
                        kvm_rip_write(vcpu, ctxt->eip);
                        if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
                                r = kvm_vcpu_do_singlestep(vcpu);
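
kvm_pmu_trigger_event() now takes a raw architectural event selector
instead of a generic perf event id, drawn from a table of pre-computed
selectors. The backing definition is presumably along the lines of:

	struct kvm_pmu_emulated_event_selectors {
		u64 INSTRUCTIONS_RETIRED;
		u64 BRANCH_INSTRUCTIONS_RETIRED;
	};
	extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel;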
@@ -9648,11 +9692,13 @@ static void kvm_x86_check_cpu_compat(void *ret)
        *(int *)ret = kvm_x86_check_processor_compatibility();
 }
 
-static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 {
        u64 host_pat;
        int r, cpu;
 
+       guard(mutex)(&vendor_module_lock);
+
        if (kvm_x86_ops.hardware_enable) {
                pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
                return -EEXIST;
@@ -9782,17 +9828,6 @@ out_free_x86_emulator_cache:
        kmem_cache_destroy(x86_emulator_cache);
        return r;
 }
-
-int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
-{
-       int r;
-
-       mutex_lock(&vendor_module_lock);
-       r = __kvm_x86_vendor_init(ops);
-       mutex_unlock(&vendor_module_lock);
-
-       return r;
-}
 EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
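
guard(mutex)(&vendor_module_lock) is the scope-based locking helper from
<linux/cleanup.h>: the mutex is dropped automatically on every return
path, which is what allows the __kvm_x86_vendor_init() wrapper to be
folded away. The idiom in miniature (hypothetical lock and condition):

	#include <linux/cleanup.h>
	#include <linux/mutex.h>

	static DEFINE_MUTEX(demo_lock);

	static int demo_init(bool already_loaded)
	{
		guard(mutex)(&demo_lock);	/* unlocked at each return below */

		if (already_loaded)
			return -EEXIST;		/* no explicit mutex_unlock() */

		return 0;
	}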
 
 void kvm_x86_vendor_exit(void)
@@ -10689,12 +10724,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
        static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
 }
 
-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
-       smp_send_reschedule(vcpu->cpu);
-}
-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
-
 /*
  * Called within kvm->srcu read side.
  * Returns 1 to let vcpu_run() continue the guest execution loop without
@@ -10944,10 +10973,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                goto cancel_injection;
        }
 
-       if (req_immediate_exit) {
+       if (req_immediate_exit)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
-               static_call(kvm_x86_request_immediate_exit)(vcpu);
-       }
 
        fpregs_assert_state_consistent();
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -10978,7 +11005,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
                             (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
 
-               exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
+               exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
                if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
                        break;
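
Rather than sending a self-IPI through the old __kvm_request_immediate_exit()
hook (removed above), the immediate-exit request is now passed directly
into the vendor run hook so VMX/SVM can arm the exit themselves. The
vendor-side signature presumably becomes something like:

	static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu,
				       bool force_immediate_exit);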
 
@@ -12065,7 +12092,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        vcpu->arch.regs_avail = ~0;
        vcpu->arch.regs_dirty = ~0;
 
-       kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
+       kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
 
        if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -12076,27 +12103,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        if (r < 0)
                return r;
 
-       if (irqchip_in_kernel(vcpu->kvm)) {
-               r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
-               if (r < 0)
-                       goto fail_mmu_destroy;
-
-               /*
-                * Defer evaluating inhibits until the vCPU is first run, as
-                * this vCPU will not get notified of any changes until this
-                * vCPU is visible to other vCPUs (marked online and added to
-                * the set of vCPUs).  Opportunistically mark APICv active as
-                * VMX in particularly is highly unlikely to have inhibits.
-                * Ignore the current per-VM APICv state so that vCPU creation
-                * is guaranteed to run with a deterministic value, the request
-                * will ensure the vCPU gets the correct state before VM-Entry.
-                */
-               if (enable_apicv) {
-                       vcpu->arch.apic->apicv_active = true;
-                       kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
-               }
-       } else
-               static_branch_inc(&kvm_has_noapic_vcpu);
+       r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+       if (r < 0)
+               goto fail_mmu_destroy;
 
        r = -ENOMEM;
 
@@ -12217,8 +12226,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        free_page((unsigned long)vcpu->arch.pio_data);
        kvfree(vcpu->arch.cpuid_entries);
-       if (!lapic_in_kernel(vcpu))
-               static_branch_dec(&kvm_has_noapic_vcpu);
 }
 
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -12495,9 +12502,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
        return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
 }
 
-__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
-EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
-
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -13100,11 +13104,13 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 
 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 {
-       if (kvm_vcpu_apicv_active(vcpu) &&
-           static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
-               return true;
+       return kvm_vcpu_apicv_active(vcpu) &&
+              static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu);
+}
 
-       return false;
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.preempted_in_kernel;
 }
 
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
@@ -13127,9 +13133,6 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
        if (vcpu->arch.guest_state_protected)
                return true;
 
-       if (vcpu != kvm_get_running_vcpu())
-               return vcpu->arch.preempted_in_kernel;
-
        return static_call(kvm_x86_get_cpl)(vcpu) == 0;
 }
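
With this split, kvm_arch_vcpu_in_kernel() only ever inspects the current
vCPU's CPL, and remote queries go through the snapshot taken at
preemption time. Generic code would then use the new helper along these
lines (a sketch of a kvm_vcpu_on_spin()-style caller):

	if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
	    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
		continue;	/* skip vCPUs preempted in user mode */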
 
@@ -13924,9 +13927,6 @@ module_init(kvm_x86_init);
 
 static void __exit kvm_x86_exit(void)
 {
-       /*
-        * If module_init() is implemented, module_exit() must also be
-        * implemented to allow module unload.
-        */
+       WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu));
 }
 module_exit(kvm_x86_exit);
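
The WARN_ON_ONCE() sanity-checks that every static_branch_inc() of
kvm_has_noapic_vcpu was balanced by a static_branch_dec() before unload;
with the inc/dec removed from x86.c above, that bookkeeping presumably
now lives alongside the APIC code, roughly:

	/* e.g. in kvm_create_lapic() (sketch): */
	if (!irqchip_in_kernel(vcpu->kvm)) {
		static_branch_inc(&kvm_has_noapic_vcpu);
		return 0;
	}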