KVM: x86/pmu: Reprogram PEBS event to emulate guest PEBS counter
author Like Xu <likexu@tencent.com>
Mon, 11 Apr 2022 10:19:37 +0000 (18:19 +0800)
committer Paolo Bonzini <pbonzini@redhat.com>
Wed, 8 Jun 2022 08:47:58 +0000 (04:47 -0400)
When a guest counter is configured as a PEBS counter through
IA32_PEBS_ENABLE, a guest PEBS event will be reprogrammed by
configuring a non-zero precision level in the perf_event_attr.

The guest PEBS overflow PMI bit would be set in the guest
GLOBAL_STATUS MSR when the PEBS facility generates a PEBS
overflow PMI based on the guest IA32_DS_AREA MSR.

Even with the same counter index and the same event code and
mask, guest PEBS events will not be reused for non-PEBS events.

Originally-by: Andi Kleen <ak@linux.intel.com>
Co-developed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Like Xu <likexu@tencent.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Message-Id: <20220411101946.20262-9-likexu@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
arch/x86/kvm/pmu.c

index 3f868fe..cdefcb0 100644 (file)
@@ -86,15 +86,22 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
 static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
 {
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+       bool skip_pmi = false;
 
        /* Ignore counters that have been reprogrammed already. */
        if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
                return;
 
-       __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+       if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
+               /* Indicate PEBS overflow PMI to guest. */
+               skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
+                                             (unsigned long *)&pmu->global_status);
+       } else {
+               __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+       }
        kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
 
-       if (!pmc->intr)
+       if (!pmc->intr || skip_pmi)
                return;
 
        /*
@@ -124,6 +131,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
                                  u64 config, bool exclude_user,
                                  bool exclude_kernel, bool intr)
 {
+       struct kvm_pmu *pmu = pmc_to_pmu(pmc);
        struct perf_event *event;
        struct perf_event_attr attr = {
                .type = type,
@@ -135,6 +143,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
                .exclude_kernel = exclude_kernel,
                .config = config,
        };
+       bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
 
        if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
                return;
@@ -150,6 +159,23 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
                 */
                attr.sample_period = 0;
        }
+       if (pebs) {
+               /*
+                * The non-zero precision level of guest event makes the ordinary
+                * guest event become a guest PEBS event and triggers the host
+                * PEBS PMI handler to determine whether the PEBS overflow PMI
+                * comes from the host counters or the guest.
+                *
+                * For most PEBS hardware events, the difference in the software
+                * precision levels of guest and host PEBS events will not affect
+                * the accuracy of the PEBS profiling result, because the "event IP"
+                * in the PEBS record is calibrated on the guest side.
+                *
+                * On Icelake everything is fine. Other hardware (GLC+, TNT+) that
+                * could possibly care here is unsupported and needs changes.
+                */
+               attr.precise_ip = 1;
+       }
 
        event = perf_event_create_kernel_counter(&attr, -1, current,
                                                 kvm_perf_overflow, pmc);
@@ -163,7 +189,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
        pmc_to_pmu(pmc)->event_count++;
        clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
        pmc->is_paused = false;
-       pmc->intr = intr;
+       pmc->intr = intr || pebs;
 }
 
 static void pmc_pause_counter(struct kvm_pmc *pmc)
@@ -189,6 +215,10 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
                              get_sample_period(pmc, pmc->counter)))
                return false;
 
+       if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) &&
+           pmc->perf_event->attr.precise_ip)
+               return false;
+
        /* reuse perf_event to serve as pmc_reprogram_counter() does*/
        perf_event_enable(pmc->perf_event);
        pmc->is_paused = false;