Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ffe5801..47d9f03 100644
@@ -1399,22 +1399,19 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 }
 EXPORT_SYMBOL_GPL(kvm_set_dr);
 
-void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
 {
        size_t size = ARRAY_SIZE(vcpu->arch.db);
 
        switch (dr) {
        case 0 ... 3:
-               *val = vcpu->arch.db[array_index_nospec(dr, size)];
-               break;
+               return vcpu->arch.db[array_index_nospec(dr, size)];
        case 4:
        case 6:
-               *val = vcpu->arch.dr6;
-               break;
+               return vcpu->arch.dr6;
        case 5:
        default: /* 7 */
-               *val = vcpu->arch.dr7;
-               break;
+               return vcpu->arch.dr7;
        }
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
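
With kvm_get_dr() returning the value directly, call sites shrink from an
out-parameter dance to a plain assignment. A hypothetical caller, before
and after:

	/* before */
	unsigned long val;
	kvm_get_dr(vcpu, 6, &val);

	/* after */
	unsigned long val = kvm_get_dr(vcpu, 6);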
@@ -2860,7 +2857,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
        return v * clock->mult;
 }
 
-static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
+/*
+ * As with get_kvmclock_base_ns(), this counts from boot time, at the
+ * frequency of CLOCK_MONOTONIC_RAW (hence adding gtod->offs_boot).
+ */
+static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp)
 {
        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
        unsigned long seq;
@@ -2879,6 +2880,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
        return mode;
 }
 
+/*
+ * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
+ * no boot time offset.
+ */
+static int do_monotonic(s64 *t, u64 *tsc_timestamp)
+{
+       struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+       unsigned long seq;
+       int mode;
+       u64 ns;
+
+       do {
+               seq = read_seqcount_begin(&gtod->seq);
+               ns = gtod->clock.base_cycles;
+               ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+               ns >>= gtod->clock.shift;
+               ns += ktime_to_ns(gtod->clock.offset);
+       } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+       *t = ns;
+
+       return mode;
+}
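
Taken together, the TSC-based readers differ only in the basis and final
offset applied to the shifted TSC delta. A summary sketch:

	/*
	 * do_kvmclock_base()  - boot-based ns at CLOCK_MONOTONIC_RAW frequency
	 *                       (folds in gtod->offs_boot, per its comment)
	 * do_monotonic()      - CLOCK_MONOTONIC ns, clock.offset only
	 * do_realtime()       - CLOCK_REALTIME as a timespec64
	 * Each returns the clocksource mode sampled under gtod->seq.
	 */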
+
 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
 {
        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
@@ -2900,18 +2924,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
        return mode;
 }
 
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
+ * reports the TSC value from which it did so. Returns true if host is
+ * using TSC based clocksource.
+ */
 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
 {
        /* checked again under seqlock below */
        if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
                return false;
 
-       return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
-                                                     tsc_timestamp));
+       return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
+                                                    tsc_timestamp));
 }
 
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ */
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+       /* checked again under seqlock below */
+       if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+               return false;
+
+       return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
+                                                tsc_timestamp));
+}
+
+/*
+ * Calculates CLOCK_REALTIME and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ *
+ * DO NOT USE this for anything related to migration. You want CLOCK_TAI
+ * for that.
+ */
 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
                                           u64 *tsc_timestamp)
 {
@@ -3158,7 +3206,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
 
        guest_hv_clock->version = ++vcpu->hv_clock.version;
 
-       mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+       kvm_gpc_mark_dirty_in_slot(gpc);
        read_unlock_irqrestore(&gpc->lock, flags);
 
        trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
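
kvm_gpc_mark_dirty_in_slot() folds the old open-coded call into the
pfn-cache API, using the slot and gpa the cache already tracks. Assuming
the helper mirrors the line it replaces, it is roughly:

	static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc)
	{
		lockdep_assert_held(&gpc->lock);
		if (gpc->memslot)
			mark_page_dirty_in_slot(gpc->kvm, gpc->memslot,
						gpc->gpa >> PAGE_SHIFT);
	}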
@@ -4680,7 +4728,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                    KVM_XEN_HVM_CONFIG_SHARED_INFO |
                    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
                    KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
-                   KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+                   KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE |
+                   KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA;
                if (sched_info_on())
                        r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
                             KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
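
The new KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA bit advertises that the Xen
shared-info page may be supplied as a host virtual address rather than a
GFN. A VMM would probe it the usual way (sketch; xen_use_hva_shinfo() is
a hypothetical callback):

	int caps = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);

	if (caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA)
		xen_use_hva_shinfo();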
@@ -5064,8 +5113,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
        int idx;
 
        if (vcpu->preempted) {
-               if (!vcpu->arch.guest_state_protected)
-                       vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+               vcpu->arch.preempted_in_kernel = kvm_arch_vcpu_in_kernel(vcpu);
 
                /*
                 * Take the srcu lock as memslots will be accessed to check the gfn
@@ -5512,18 +5560,23 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
                                             struct kvm_debugregs *dbgregs)
 {
-       unsigned long val;
+       unsigned int i;
 
        memset(dbgregs, 0, sizeof(*dbgregs));
-       memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
-       kvm_get_dr(vcpu, 6, &val);
-       dbgregs->dr6 = val;
+
+       BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
+       for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+               dbgregs->db[i] = vcpu->arch.db[i];
+
+       dbgregs->dr6 = vcpu->arch.dr6;
        dbgregs->dr7 = vcpu->arch.dr7;
 }
 
 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
                                            struct kvm_debugregs *dbgregs)
 {
+       unsigned int i;
+
        if (dbgregs->flags)
                return -EINVAL;
 
@@ -5532,7 +5585,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
        if (!kvm_dr7_valid(dbgregs->dr7))
                return -EINVAL;
 
-       memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+       for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+               vcpu->arch.db[i] = dbgregs->db[i];
+
        kvm_update_dr0123(vcpu);
        vcpu->arch.dr6 = dbgregs->dr6;
        vcpu->arch.dr7 = dbgregs->dr7;
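
The element-wise copies (with a BUILD_BUG_ON() size check on the get
side) replace memcpy() because the arrays differ in element type:
vcpu->arch.db is unsigned long[] while the UAPI struct uses __u64[]. A
byte-wise memcpy is only correct while both happen to be 8 bytes wide;
the loop stays correct regardless. The pattern in miniature (hypothetical
arrays):

	unsigned long internal[4];
	__u64 uapi[4];
	unsigned int i;

	BUILD_BUG_ON(ARRAY_SIZE(internal) != ARRAY_SIZE(uapi));
	for (i = 0; i < ARRAY_SIZE(internal); i++)
		uapi[i] = internal[i];	/* width-safe, per element */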
@@ -8180,10 +8235,9 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
        kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
 }
 
-static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
-                           unsigned long *dest)
+static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr)
 {
-       kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+       return kvm_get_dr(emul_to_vcpu(ctxt), dr);
 }
 
 static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
@@ -8405,12 +8459,9 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
        return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
 }
 
-static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
-                             u32 pmc)
+static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc)
 {
-       if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
-               return 0;
-       return -EINVAL;
+       return kvm_pmu_check_rdpmc_early(emul_to_vcpu(ctxt), pmc);
 }
 
 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -8542,7 +8593,7 @@ static const struct x86_emulate_ops emulate_ops = {
        .set_msr_with_filter = emulator_set_msr_with_filter,
        .get_msr_with_filter = emulator_get_msr_with_filter,
        .get_msr             = emulator_get_msr,
-       .check_pmc           = emulator_check_pmc,
+       .check_rdpmc_early   = emulator_check_rdpmc_early,
        .read_pmc            = emulator_read_pmc,
        .halt                = emulator_halt,
        .wbinvd              = emulator_wbinvd,
@@ -8803,31 +8854,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 
        kvm_release_pfn_clean(pfn);
 
-       /* The instructions are well-emulated on direct mmu. */
-       if (vcpu->arch.mmu->root_role.direct) {
-               unsigned int indirect_shadow_pages;
-
-               write_lock(&vcpu->kvm->mmu_lock);
-               indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
-               write_unlock(&vcpu->kvm->mmu_lock);
-
-               if (indirect_shadow_pages)
-                       kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
-
-               return true;
-       }
-
        /*
-        * if emulation was due to access to shadowed page table
-        * and it failed try to unshadow page and re-enter the
-        * guest to let CPU execute the instruction.
+        * If emulation may have been triggered by a write to a shadowed page
+        * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
+        * guest to let the CPU re-execute the instruction in the hope that the
+        * CPU can cleanly execute the instruction that KVM failed to emulate.
         */
-       kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+       if (vcpu->kvm->arch.indirect_shadow_pages)
+               kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 
        /*
-        * If the access faults on its page table, it can not
-        * be fixed by unprotecting shadow page and it should
-        * be reported to userspace.
+        * If the failed instruction faulted on an access to page tables that
+        * are used to translate any part of the instruction, KVM can't resolve
+        * the issue by unprotecting the gfn, as zapping the shadow page will
+        * result in the instruction taking a !PRESENT page fault and thus put
+        * the vCPU into an infinite loop of page faults.  E.g. KVM will create
+        * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
+        * then zap the SPTE to unprotect the gfn, and then do it all over
+        * again.  Report the error to userspace.
         */
        return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
 }
@@ -8922,7 +8966,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
        if (unlikely(!r))
                return 0;
 
-       kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+       kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
 
        /*
         * rflags is the old, "raw" value of the flags.  The new value has
@@ -9235,9 +9279,9 @@ writeback:
                 */
                if (!ctxt->have_exception ||
                    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
-                       kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+                       kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
                        if (ctxt->is_branch)
-                               kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+                               kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
                        kvm_rip_write(vcpu, ctxt->eip);
                        if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
                                r = kvm_vcpu_do_singlestep(vcpu);
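
kvm_pmu_trigger_event() now takes a raw architectural event selector
instead of a generic perf event id, drawn from a table of pre-computed
selectors. The backing definition is presumably along the lines of:

	struct kvm_pmu_emulated_event_selectors {
		u64 INSTRUCTIONS_RETIRED;
		u64 BRANCH_INSTRUCTIONS_RETIRED;
	};
	extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel;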
@@ -9648,11 +9692,13 @@ static void kvm_x86_check_cpu_compat(void *ret)
        *(int *)ret = kvm_x86_check_processor_compatibility();
 }
 
-static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 {
        u64 host_pat;
        int r, cpu;
 
+       guard(mutex)(&vendor_module_lock);
+
        if (kvm_x86_ops.hardware_enable) {
                pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
                return -EEXIST;
@@ -9782,17 +9828,6 @@ out_free_x86_emulator_cache:
        kmem_cache_destroy(x86_emulator_cache);
        return r;
 }
-
-int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
-{
-       int r;
-
-       mutex_lock(&vendor_module_lock);
-       r = __kvm_x86_vendor_init(ops);
-       mutex_unlock(&vendor_module_lock);
-
-       return r;
-}
 EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
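
guard(mutex)(&vendor_module_lock) is the scope-based locking helper from
<linux/cleanup.h>: the mutex is dropped automatically on every return
path, which is what allows the __kvm_x86_vendor_init() wrapper to be
folded away. The idiom in miniature (hypothetical lock and condition):

	#include <linux/cleanup.h>
	#include <linux/mutex.h>

	static DEFINE_MUTEX(demo_lock);

	static int demo_init(bool already_loaded)
	{
		guard(mutex)(&demo_lock);	/* unlocked at each return below */

		if (already_loaded)
			return -EEXIST;		/* no explicit mutex_unlock() */

		return 0;
	}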
 
 void kvm_x86_vendor_exit(void)
@@ -10689,12 +10724,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
        static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
 }
 
-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
-       smp_send_reschedule(vcpu->cpu);
-}
-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
-
 /*
  * Called within kvm->srcu read side.
  * Returns 1 to let vcpu_run() continue the guest execution loop without
@@ -10944,10 +10973,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                goto cancel_injection;
        }
 
-       if (req_immediate_exit) {
+       if (req_immediate_exit)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
-               static_call(kvm_x86_request_immediate_exit)(vcpu);
-       }
 
        fpregs_assert_state_consistent();
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -10978,7 +11005,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
                             (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
 
-               exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
+               exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
                if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
                        break;
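
Rather than sending a self-IPI through the old __kvm_request_immediate_exit()
hook (removed above), the immediate-exit request is now passed directly
into the vendor run hook so VMX/SVM can arm the exit themselves. The
vendor-side signature presumably becomes something like:

	static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu,
				       bool force_immediate_exit);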
 
@@ -12065,7 +12092,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        vcpu->arch.regs_avail = ~0;
        vcpu->arch.regs_dirty = ~0;
 
-       kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
+       kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
 
        if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -12076,27 +12103,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        if (r < 0)
                return r;
 
-       if (irqchip_in_kernel(vcpu->kvm)) {
-               r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
-               if (r < 0)
-                       goto fail_mmu_destroy;
-
-               /*
-                * Defer evaluating inhibits until the vCPU is first run, as
-                * this vCPU will not get notified of any changes until this
-                * vCPU is visible to other vCPUs (marked online and added to
-                * the set of vCPUs).  Opportunistically mark APICv active as
-                * VMX in particularly is highly unlikely to have inhibits.
-                * Ignore the current per-VM APICv state so that vCPU creation
-                * is guaranteed to run with a deterministic value, the request
-                * will ensure the vCPU gets the correct state before VM-Entry.
-                */
-               if (enable_apicv) {
-                       vcpu->arch.apic->apicv_active = true;
-                       kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
-               }
-       } else
-               static_branch_inc(&kvm_has_noapic_vcpu);
+       r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+       if (r < 0)
+               goto fail_mmu_destroy;
 
        r = -ENOMEM;
 
@@ -12217,8 +12226,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        free_page((unsigned long)vcpu->arch.pio_data);
        kvfree(vcpu->arch.cpuid_entries);
-       if (!lapic_in_kernel(vcpu))
-               static_branch_dec(&kvm_has_noapic_vcpu);
 }
 
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -12495,9 +12502,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
        return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
 }
 
-__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
-EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
-
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -13100,11 +13104,13 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 
 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 {
-       if (kvm_vcpu_apicv_active(vcpu) &&
-           static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
-               return true;
+       return kvm_vcpu_apicv_active(vcpu) &&
+              static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu);
+}
 
-       return false;
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.preempted_in_kernel;
 }
 
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
@@ -13127,9 +13133,6 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
        if (vcpu->arch.guest_state_protected)
                return true;
 
-       if (vcpu != kvm_get_running_vcpu())
-               return vcpu->arch.preempted_in_kernel;
-
        return static_call(kvm_x86_get_cpl)(vcpu) == 0;
 }
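
With this split, kvm_arch_vcpu_in_kernel() only ever inspects the current
vCPU's CPL, and remote queries go through the snapshot taken at
preemption time. Generic code would then use the new helper along these
lines (a sketch of a kvm_vcpu_on_spin()-style caller):

	if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
	    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
		continue;	/* skip vCPUs preempted in user mode */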
 
@@ -13924,9 +13927,6 @@ module_init(kvm_x86_init);
 
 static void __exit kvm_x86_exit(void)
 {
-       /*
-        * If module_init() is implemented, module_exit() must also be
-        * implemented to allow module unload.
-        */
+       WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu));
 }
 module_exit(kvm_x86_exit);
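
The WARN_ON_ONCE() sanity-checks that every static_branch_inc() of
kvm_has_noapic_vcpu was balanced by a static_branch_dec() before unload;
with the inc/dec removed from x86.c above, that bookkeeping presumably
now lives alongside the APIC code, roughly:

	/* e.g. in kvm_create_lapic() (sketch): */
	if (!irqchip_in_kernel(vcpu->kvm)) {
		static_branch_inc(&kvm_has_noapic_vcpu);
		return 0;
	}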