Merge tag 'docs-5.5a' of git://git.lwn.net/linux
[linux-2.6-microblaze.git] / drivers / gpu / drm / i915 / i915_perf.c
index e42b868..65d7c2e 100644 (file)
 #include <linux/uuid.h>
 
 #include "gem/i915_gem_context.h"
-#include "gem/i915_gem_pm.h"
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
+#include "gt/intel_gt.h"
 #include "gt/intel_lrc_reg.h"
+#include "gt/intel_ring.h"
 
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "oa/i915_oa_cflgt3.h"
 #include "oa/i915_oa_cnl.h"
 #include "oa/i915_oa_icl.h"
+#include "oa/i915_oa_tgl.h"
 
 /* HW requires this to be a power of two, between 128k and 16M, though driver
  * is currently generally designed assuming the largest 16M size is used such
@@ -291,6 +295,7 @@ static u32 i915_perf_stream_paranoid = true;
 
 /* On Gen8+ automatically triggered OA reports include a 'reason' field... */
 #define OAREPORT_REASON_MASK           0x3f
+#define OAREPORT_REASON_MASK_EXTENDED  0x7f
 #define OAREPORT_REASON_SHIFT          19
 #define OAREPORT_REASON_TIMER          (1<<0)
 #define OAREPORT_REASON_CTX_SWITCH     (1<<3)
@@ -336,17 +341,24 @@ static const struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
        [I915_OA_FORMAT_C4_B8]              = { 7, 64 },
 };
 
+static const struct i915_oa_format gen12_oa_formats[I915_OA_FORMAT_MAX] = {
+       [I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
+};
+
 #define SAMPLE_OA_REPORT      (1<<0)
 
 /**
  * struct perf_open_properties - for validated properties given to open a stream
  * @sample_flags: `DRM_I915_PERF_PROP_SAMPLE_*` properties are tracked as flags
  * @single_context: Whether a single or all gpu contexts should be monitored
+ * @hold_preemption: Whether the preemption is disabled for the filtered
+ *                   context
  * @ctx_handle: A gem ctx handle for use with @single_context
  * @metrics_set: An ID for an OA unit metric set advertised via sysfs
  * @oa_format: An OA unit HW report format
  * @oa_periodic: Whether to enable periodic OA unit sampling
  * @oa_period_exponent: The OA unit sampling period is derived from this
+ * @engine: The engine (typically rcs0) being monitored by the OA unit
  *
  * As read_properties_unlocked() enumerates and validates the properties given
  * to open a stream of metrics the configuration is built up in the structure
@@ -356,6 +368,7 @@ struct perf_open_properties {
        u32 sample_flags;
 
        u64 single_context:1;
+       u64 hold_preemption:1;
        u64 ctx_handle;
 
        /* OA sampling state */
@@ -363,69 +376,74 @@ struct perf_open_properties {
        int oa_format;
        bool oa_periodic;
        int oa_period_exponent;
+
+       struct intel_engine_cs *engine;
+};
+
+struct i915_oa_config_bo {
+       struct llist_node node;
+
+       struct i915_oa_config *oa_config;
+       struct i915_vma *vma;
 };
 
 static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer);
 
-static void free_oa_config(struct drm_i915_private *dev_priv,
-                          struct i915_oa_config *oa_config)
+void i915_oa_config_release(struct kref *ref)
 {
-       if (!PTR_ERR(oa_config->flex_regs))
-               kfree(oa_config->flex_regs);
-       if (!PTR_ERR(oa_config->b_counter_regs))
-               kfree(oa_config->b_counter_regs);
-       if (!PTR_ERR(oa_config->mux_regs))
-               kfree(oa_config->mux_regs);
-       kfree(oa_config);
-}
+       struct i915_oa_config *oa_config =
+               container_of(ref, typeof(*oa_config), ref);
 
-static void put_oa_config(struct drm_i915_private *dev_priv,
-                         struct i915_oa_config *oa_config)
-{
-       if (!atomic_dec_and_test(&oa_config->ref_count))
-               return;
+       kfree(oa_config->flex_regs);
+       kfree(oa_config->b_counter_regs);
+       kfree(oa_config->mux_regs);
 
-       free_oa_config(dev_priv, oa_config);
+       kfree_rcu(oa_config, rcu);
 }
 
-static int get_oa_config(struct drm_i915_private *dev_priv,
-                        int metrics_set,
-                        struct i915_oa_config **out_config)
+struct i915_oa_config *
+i915_perf_get_oa_config(struct i915_perf *perf, int metrics_set)
 {
-       int ret;
+       struct i915_oa_config *oa_config;
 
-       if (metrics_set == 1) {
-               *out_config = &dev_priv->perf.test_config;
-               atomic_inc(&dev_priv->perf.test_config.ref_count);
-               return 0;
-       }
+       rcu_read_lock();
+       if (metrics_set == 1)
+               oa_config = &perf->test_config;
+       else
+               oa_config = idr_find(&perf->metrics_idr, metrics_set);
+       if (oa_config)
+               oa_config = i915_oa_config_get(oa_config);
+       rcu_read_unlock();
 
-       ret = mutex_lock_interruptible(&dev_priv->perf.metrics_lock);
-       if (ret)
-               return ret;
+       return oa_config;
+}
 
-       *out_config = idr_find(&dev_priv->perf.metrics_idr, metrics_set);
-       if (!*out_config)
-               ret = -EINVAL;
-       else
-               atomic_inc(&(*out_config)->ref_count);
+static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo)
+{
+       i915_oa_config_put(oa_bo->oa_config);
+       i915_vma_put(oa_bo->vma);
+       kfree(oa_bo);
+}
 
-       mutex_unlock(&dev_priv->perf.metrics_lock);
+static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream)
+{
+       struct intel_uncore *uncore = stream->uncore;
 
-       return ret;
+       return intel_uncore_read(uncore, GEN12_OAG_OATAILPTR) &
+              GEN12_OAG_OATAILPTR_MASK;
 }
 
 static u32 gen8_oa_hw_tail_read(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct intel_uncore *uncore = stream->uncore;
 
-       return I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
+       return intel_uncore_read(uncore, GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
 }
 
 static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
-       u32 oastatus1 = I915_READ(GEN7_OASTATUS1);
+       struct intel_uncore *uncore = stream->uncore;
+       u32 oastatus1 = intel_uncore_read(uncore, GEN7_OASTATUS1);
 
        return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
 }
@@ -456,7 +474,6 @@ static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream)
  */
 static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
        int report_size = stream->oa_buffer.format_size;
        unsigned long flags;
        unsigned int aged_idx;
@@ -479,7 +496,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
        aged_tail = stream->oa_buffer.tails[aged_idx].offset;
        aging_tail = stream->oa_buffer.tails[!aged_idx].offset;
 
-       hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream);
+       hw_tail = stream->perf->ops.oa_hw_tail_read(stream);
 
        /* The tail pointer increases in 64 byte increments,
         * not in report_size steps...
@@ -536,7 +553,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
                                aging_tail = hw_tail;
                        stream->oa_buffer.aging_timestamp = now;
                } else {
-                       DRM_ERROR("Ignoring spurious out of range OA buffer tail pointer = %u\n",
+                       DRM_ERROR("Ignoring spurious out of range OA buffer tail pointer = %x\n",
                                  hw_tail);
                }
        }
@@ -655,7 +672,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
                                  size_t count,
                                  size_t *offset)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct intel_uncore *uncore = stream->uncore;
        int report_size = stream->oa_buffer.format_size;
        u8 *oa_buf_base = stream->oa_buffer.vaddr;
        u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
@@ -738,9 +755,11 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
                 * it to userspace...
                 */
                reason = ((report32[0] >> OAREPORT_REASON_SHIFT) &
-                         OAREPORT_REASON_MASK);
+                         (IS_GEN(stream->perf->i915, 12) ?
+                          OAREPORT_REASON_MASK_EXTENDED :
+                          OAREPORT_REASON_MASK));
                if (reason == 0) {
-                       if (__ratelimit(&dev_priv->perf.spurious_report_rs))
+                       if (__ratelimit(&stream->perf->spurious_report_rs))
                                DRM_NOTE("Skipping spurious, invalid OA report\n");
                        continue;
                }
@@ -755,7 +774,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
                 * Note: that we don't clear the valid_ctx_bit so userspace can
                 * understand that the ID has been squashed by the kernel.
                 */
-               if (!(report32[0] & dev_priv->perf.gen8_valid_ctx_bit))
+               if (!(report32[0] & stream->perf->gen8_valid_ctx_bit) &&
+                   INTEL_GEN(stream->perf->i915) <= 11)
                        ctx_id = report32[2] = INVALID_CTX_ID;
 
                /*
@@ -789,7 +809,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
                 * switches since it's not-uncommon for periodic samples to
                 * identify a switch before any 'context switch' report.
                 */
-               if (!dev_priv->perf.exclusive_stream->ctx ||
+               if (!stream->perf->exclusive_stream->ctx ||
                    stream->specific_ctx_id == ctx_id ||
                    stream->oa_buffer.last_ctx_id == stream->specific_ctx_id ||
                    reason & OAREPORT_REASON_CTX_SWITCH) {
@@ -798,7 +818,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
                         * While filtering for a single context we avoid
                         * leaking the IDs of other contexts.
                         */
-                       if (dev_priv->perf.exclusive_stream->ctx &&
+                       if (stream->perf->exclusive_stream->ctx &&
                            stream->specific_ctx_id != ctx_id) {
                                report32[2] = INVALID_CTX_ID;
                        }
@@ -822,6 +842,11 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
        }
 
        if (start_offset != *offset) {
+               i915_reg_t oaheadptr;
+
+               oaheadptr = IS_GEN(stream->perf->i915, 12) ?
+                           GEN12_OAG_OAHEADPTR : GEN8_OAHEADPTR;
+
                spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
 
                /*
@@ -829,8 +854,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
                 * relative to oa_buf_base so put back here...
                 */
                head += gtt_offset;
-
-               I915_WRITE(GEN8_OAHEADPTR, head & GEN8_OAHEADPTR_MASK);
+               intel_uncore_write(uncore, oaheadptr,
+                                  head & GEN12_OAG_OAHEADPTR_MASK);
                stream->oa_buffer.head = head;
 
                spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
@@ -864,14 +889,18 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
                        size_t count,
                        size_t *offset)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct intel_uncore *uncore = stream->uncore;
        u32 oastatus;
+       i915_reg_t oastatus_reg;
        int ret;
 
        if (WARN_ON(!stream->oa_buffer.vaddr))
                return -EIO;
 
-       oastatus = I915_READ(GEN8_OASTATUS);
+       oastatus_reg = IS_GEN(stream->perf->i915, 12) ?
+                      GEN12_OAG_OASTATUS : GEN8_OASTATUS;
+
+       oastatus = intel_uncore_read(uncore, oastatus_reg);
 
        /*
         * We treat OABUFFER_OVERFLOW as a significant error:
@@ -896,14 +925,14 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
                DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
                          stream->period_exponent);
 
-               dev_priv->perf.ops.oa_disable(stream);
-               dev_priv->perf.ops.oa_enable(stream);
+               stream->perf->ops.oa_disable(stream);
+               stream->perf->ops.oa_enable(stream);
 
                /*
                 * Note: .oa_enable() is expected to re-init the oabuffer and
                 * reset GEN8_OASTATUS for us
                 */
-               oastatus = I915_READ(GEN8_OASTATUS);
+               oastatus = intel_uncore_read(uncore, oastatus_reg);
        }
 
        if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
@@ -911,8 +940,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
                                       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
                if (ret)
                        return ret;
-               I915_WRITE(GEN8_OASTATUS,
-                          oastatus & ~GEN8_OASTATUS_REPORT_LOST);
+               intel_uncore_write(uncore, oastatus_reg,
+                                  oastatus & ~GEN8_OASTATUS_REPORT_LOST);
        }
 
        return gen8_append_oa_reports(stream, buf, count, offset);
@@ -943,7 +972,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
                                  size_t count,
                                  size_t *offset)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct intel_uncore *uncore = stream->uncore;
        int report_size = stream->oa_buffer.format_size;
        u8 *oa_buf_base = stream->oa_buffer.vaddr;
        u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
@@ -1017,7 +1046,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
                 * copying it to userspace...
                 */
                if (report32[0] == 0) {
-                       if (__ratelimit(&dev_priv->perf.spurious_report_rs))
+                       if (__ratelimit(&stream->perf->spurious_report_rs))
                                DRM_NOTE("Skipping spurious, invalid OA report\n");
                        continue;
                }
@@ -1043,9 +1072,9 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
                 */
                head += gtt_offset;
 
-               I915_WRITE(GEN7_OASTATUS2,
-                          ((head & GEN7_OASTATUS2_HEAD_MASK) |
-                           GEN7_OASTATUS2_MEM_SELECT_GGTT));
+               intel_uncore_write(uncore, GEN7_OASTATUS2,
+                                  (head & GEN7_OASTATUS2_HEAD_MASK) |
+                                  GEN7_OASTATUS2_MEM_SELECT_GGTT);
                stream->oa_buffer.head = head;
 
                spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
@@ -1075,21 +1104,21 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
                        size_t count,
                        size_t *offset)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct intel_uncore *uncore = stream->uncore;
        u32 oastatus1;
        int ret;
 
        if (WARN_ON(!stream->oa_buffer.vaddr))
                return -EIO;
 
-       oastatus1 = I915_READ(GEN7_OASTATUS1);
+       oastatus1 = intel_uncore_read(uncore, GEN7_OASTATUS1);
 
        /* XXX: On Haswell we don't have a safe way to clear oastatus1
         * bits while the OA unit is enabled (while the tail pointer
         * may be updated asynchronously) so we ignore status bits
         * that have already been reported to userspace.
         */
-       oastatus1 &= ~dev_priv->perf.gen7_latched_oastatus1;
+       oastatus1 &= ~stream->perf->gen7_latched_oastatus1;
 
        /* We treat OABUFFER_OVERFLOW as a significant error:
         *
@@ -1120,10 +1149,10 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
                DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
                          stream->period_exponent);
 
-               dev_priv->perf.ops.oa_disable(stream);
-               dev_priv->perf.ops.oa_enable(stream);
+               stream->perf->ops.oa_disable(stream);
+               stream->perf->ops.oa_enable(stream);
 
-               oastatus1 = I915_READ(GEN7_OASTATUS1);
+               oastatus1 = intel_uncore_read(uncore, GEN7_OASTATUS1);
        }
 
        if (unlikely(oastatus1 & GEN7_OASTATUS1_REPORT_LOST)) {
@@ -1131,7 +1160,7 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
                                       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
                if (ret)
                        return ret;
-               dev_priv->perf.gen7_latched_oastatus1 |=
+               stream->perf->gen7_latched_oastatus1 |=
                        GEN7_OASTATUS1_REPORT_LOST;
        }
 
@@ -1196,25 +1225,18 @@ static int i915_oa_read(struct i915_perf_stream *stream,
                        size_t count,
                        size_t *offset)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
-
-       return dev_priv->perf.ops.read(stream, buf, count, offset);
+       return stream->perf->ops.read(stream, buf, count, offset);
 }
 
 static struct intel_context *oa_pin_context(struct i915_perf_stream *stream)
 {
        struct i915_gem_engines_iter it;
-       struct drm_i915_private *i915 = stream->dev_priv;
        struct i915_gem_context *ctx = stream->ctx;
        struct intel_context *ce;
        int err;
 
-       err = i915_mutex_lock_interruptible(&i915->drm);
-       if (err)
-               return ERR_PTR(err);
-
        for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
-               if (ce->engine->class != RENDER_CLASS)
+               if (ce->engine != stream->engine) /* first match! */
                        continue;
 
                /*
@@ -1229,10 +1251,6 @@ static struct intel_context *oa_pin_context(struct i915_perf_stream *stream)
        }
        i915_gem_context_unlock_engines(ctx);
 
-       mutex_unlock(&i915->drm.struct_mutex);
-       if (err)
-               return ERR_PTR(err);
-
        return stream->pinned_ctx;
 }
 
@@ -1248,14 +1266,13 @@ static struct intel_context *oa_pin_context(struct i915_perf_stream *stream)
  */
 static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *i915 = stream->dev_priv;
        struct intel_context *ce;
 
        ce = oa_pin_context(stream);
        if (IS_ERR(ce))
                return PTR_ERR(ce);
 
-       switch (INTEL_GEN(i915)) {
+       switch (INTEL_GEN(ce->engine->i915)) {
        case 7: {
                /*
                 * On Haswell we don't do any post processing of the reports
@@ -1269,7 +1286,11 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
        case 8:
        case 9:
        case 10:
-               if (USES_GUC_SUBMISSION(i915)) {
+               if (intel_engine_in_execlists_submission_mode(ce->engine)) {
+                       stream->specific_ctx_id_mask =
+                               (1U << GEN8_CTX_ID_WIDTH) - 1;
+                       stream->specific_ctx_id = stream->specific_ctx_id_mask;
+               } else {
                        /*
                         * When using GuC, the context descriptor we write in
                         * i915 is read by GuC and rewritten before it's
@@ -1289,31 +1310,23 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
                         */
                        stream->specific_ctx_id_mask =
                                (1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
-               } else {
-                       stream->specific_ctx_id_mask =
-                               (1U << GEN8_CTX_ID_WIDTH) - 1;
-                       stream->specific_ctx_id =
-                               upper_32_bits(ce->lrc_desc);
-                       stream->specific_ctx_id &=
-                               stream->specific_ctx_id_mask;
                }
                break;
 
-       case 11: {
+       case 11:
+       case 12: {
                stream->specific_ctx_id_mask =
-                       ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32) |
-                       ((1U << GEN11_ENGINE_INSTANCE_WIDTH) - 1) << (GEN11_ENGINE_INSTANCE_SHIFT - 32) |
-                       ((1 << GEN11_ENGINE_CLASS_WIDTH) - 1) << (GEN11_ENGINE_CLASS_SHIFT - 32);
-               stream->specific_ctx_id = upper_32_bits(ce->lrc_desc);
-               stream->specific_ctx_id &=
-                       stream->specific_ctx_id_mask;
+                       ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32);
+               stream->specific_ctx_id = stream->specific_ctx_id_mask;
                break;
        }
 
        default:
-               MISSING_CASE(INTEL_GEN(i915));
+               MISSING_CASE(INTEL_GEN(ce->engine->i915));
        }
 
+       ce->tag = stream->specific_ctx_id_mask;
+
        DRM_DEBUG_DRIVER("filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
                         stream->specific_ctx_id,
                         stream->specific_ctx_id_mask);
@@ -1330,69 +1343,76 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
  */
 static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
        struct intel_context *ce;
 
-       stream->specific_ctx_id = INVALID_CTX_ID;
-       stream->specific_ctx_id_mask = 0;
-
        ce = fetch_and_zero(&stream->pinned_ctx);
        if (ce) {
-               mutex_lock(&dev_priv->drm.struct_mutex);
+               ce->tag = 0; /* recomputed on next submission after parking */
                intel_context_unpin(ce);
-               mutex_unlock(&dev_priv->drm.struct_mutex);
        }
+
+       stream->specific_ctx_id = INVALID_CTX_ID;
+       stream->specific_ctx_id_mask = 0;
 }
 
 static void
 free_oa_buffer(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *i915 = stream->dev_priv;
-
-       mutex_lock(&i915->drm.struct_mutex);
-
        i915_vma_unpin_and_release(&stream->oa_buffer.vma,
                                   I915_VMA_RELEASE_MAP);
 
-       mutex_unlock(&i915->drm.struct_mutex);
-
        stream->oa_buffer.vaddr = NULL;
 }
 
+static void
+free_oa_configs(struct i915_perf_stream *stream)
+{
+       struct i915_oa_config_bo *oa_bo, *tmp;
+
+       i915_oa_config_put(stream->oa_config);
+       llist_for_each_entry_safe(oa_bo, tmp, stream->oa_config_bos.first, node)
+               free_oa_config_bo(oa_bo);
+}
+
+static void
+free_noa_wait(struct i915_perf_stream *stream)
+{
+       i915_vma_unpin_and_release(&stream->noa_wait, 0);
+}
+
 static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct i915_perf *perf = stream->perf;
 
-       BUG_ON(stream != dev_priv->perf.exclusive_stream);
+       BUG_ON(stream != perf->exclusive_stream);
 
        /*
         * Unset exclusive_stream first, it will be checked while disabling
         * the metric set on gen8+.
         */
-       mutex_lock(&dev_priv->drm.struct_mutex);
-       dev_priv->perf.exclusive_stream = NULL;
-       dev_priv->perf.ops.disable_metric_set(stream);
-       mutex_unlock(&dev_priv->drm.struct_mutex);
+       perf->exclusive_stream = NULL;
+       perf->ops.disable_metric_set(stream);
 
        free_oa_buffer(stream);
 
-       intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
-       intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref);
+       intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
+       intel_engine_pm_put(stream->engine);
 
        if (stream->ctx)
                oa_put_render_ctx_id(stream);
 
-       put_oa_config(dev_priv, stream->oa_config);
+       free_oa_configs(stream);
+       free_noa_wait(stream);
 
-       if (dev_priv->perf.spurious_report_rs.missed) {
+       if (perf->spurious_report_rs.missed) {
                DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n",
-                        dev_priv->perf.spurious_report_rs.missed);
+                        perf->spurious_report_rs.missed);
        }
 }
 
 static void gen7_init_oa_buffer(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct intel_uncore *uncore = stream->uncore;
        u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
        unsigned long flags;
 
@@ -1401,13 +1421,14 @@ static void gen7_init_oa_buffer(struct i915_perf_stream *stream)
        /* Pre-DevBDW: OABUFFER must be set with counters off,
         * before OASTATUS1, but after OASTATUS2
         */
-       I915_WRITE(GEN7_OASTATUS2,
-                  gtt_offset | GEN7_OASTATUS2_MEM_SELECT_GGTT); /* head */
+       intel_uncore_write(uncore, GEN7_OASTATUS2, /* head */
+                          gtt_offset | GEN7_OASTATUS2_MEM_SELECT_GGTT);
        stream->oa_buffer.head = gtt_offset;
 
-       I915_WRITE(GEN7_OABUFFER, gtt_offset);
+       intel_uncore_write(uncore, GEN7_OABUFFER, gtt_offset);
 
-       I915_WRITE(GEN7_OASTATUS1, gtt_offset | OABUFFER_SIZE_16M); /* tail */
+       intel_uncore_write(uncore, GEN7_OASTATUS1, /* tail */
+                          gtt_offset | OABUFFER_SIZE_16M);
 
        /* Mark that we need updated tail pointers to read from... */
        stream->oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
@@ -1419,7 +1440,7 @@ static void gen7_init_oa_buffer(struct i915_perf_stream *stream)
         * already seen since they can't be cleared while periodic
         * sampling is enabled.
         */
-       dev_priv->perf.gen7_latched_oastatus1 = 0;
+       stream->perf->gen7_latched_oastatus1 = 0;
 
        /* NB: although the OA buffer will initially be allocated
         * zeroed via shmfs (and so this memset is redundant when
@@ -1434,25 +1455,22 @@ static void gen7_init_oa_buffer(struct i915_perf_stream *stream)
         */
        memset(stream->oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
 
-       /* Maybe make ->pollin per-stream state if we support multiple
-        * concurrent streams in the future.
-        */
        stream->pollin = false;
 }
 
 static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct intel_uncore *uncore = stream->uncore;
        u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
        unsigned long flags;
 
        spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
 
-       I915_WRITE(GEN8_OASTATUS, 0);
-       I915_WRITE(GEN8_OAHEADPTR, gtt_offset);
+       intel_uncore_write(uncore, GEN8_OASTATUS, 0);
+       intel_uncore_write(uncore, GEN8_OAHEADPTR, gtt_offset);
        stream->oa_buffer.head = gtt_offset;
 
-       I915_WRITE(GEN8_OABUFFER_UDW, 0);
+       intel_uncore_write(uncore, GEN8_OABUFFER_UDW, 0);
 
        /*
         * PRM says:
@@ -1462,9 +1480,9 @@ static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
         *  to enable proper functionality of the overflow
         *  bit."
         */
-       I915_WRITE(GEN8_OABUFFER, gtt_offset |
+       intel_uncore_write(uncore, GEN8_OABUFFER, gtt_offset |
                   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
-       I915_WRITE(GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
+       intel_uncore_write(uncore, GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
 
        /* Mark that we need updated tail pointers to read from... */
        stream->oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
@@ -1493,35 +1511,82 @@ static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
         */
        memset(stream->oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
 
+       stream->pollin = false;
+}
+
+static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
+{
+       struct intel_uncore *uncore = stream->uncore;
+       u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
+       unsigned long flags;
+
+       spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
+
+       intel_uncore_write(uncore, GEN12_OAG_OASTATUS, 0);
+       intel_uncore_write(uncore, GEN12_OAG_OAHEADPTR,
+                          gtt_offset & GEN12_OAG_OAHEADPTR_MASK);
+       stream->oa_buffer.head = gtt_offset;
+
+       /*
+        * PRM says:
+        *
+        *  "This MMIO must be set before the OATAILPTR
+        *  register and after the OAHEADPTR register. This is
+        *  to enable proper functionality of the overflow
+        *  bit."
+        */
+       intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
+                          OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
+       intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
+                          gtt_offset & GEN12_OAG_OATAILPTR_MASK);
+
+       /* Mark that we need updated tail pointers to read from... */
+       stream->oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
+       stream->oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
+
+       /*
+        * Reset state used to recognise context switches, affecting which
+        * reports we will forward to userspace while filtering for a single
+        * context.
+        */
+       stream->oa_buffer.last_ctx_id = INVALID_CTX_ID;
+
+       spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
+
        /*
-        * Maybe make ->pollin per-stream state if we support multiple
-        * concurrent streams in the future.
+        * NB: although the OA buffer will initially be allocated
+        * zeroed via shmfs (and so this memset is redundant when
+        * first allocating), we may re-init the OA buffer, either
+        * when re-enabling a stream or in error/reset paths.
+        *
+        * The reason we clear the buffer for each re-init is for the
+        * sanity check in gen8_append_oa_reports() that looks at the
+        * reason field to make sure it's non-zero which relies on
+        * the assumption that new reports are being written to zeroed
+        * memory...
         */
+       memset(stream->oa_buffer.vaddr, 0,
+              stream->oa_buffer.vma->size);
+
        stream->pollin = false;
 }
 
 static int alloc_oa_buffer(struct i915_perf_stream *stream)
 {
        struct drm_i915_gem_object *bo;
-       struct drm_i915_private *dev_priv = stream->dev_priv;
        struct i915_vma *vma;
        int ret;
 
        if (WARN_ON(stream->oa_buffer.vma))
                return -ENODEV;
 
-       ret = i915_mutex_lock_interruptible(&dev_priv->drm);
-       if (ret)
-               return ret;
-
        BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
        BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
 
-       bo = i915_gem_object_create_shmem(dev_priv, OA_BUFFER_SIZE);
+       bo = i915_gem_object_create_shmem(stream->perf->i915, OA_BUFFER_SIZE);
        if (IS_ERR(bo)) {
                DRM_ERROR("Failed to allocate OA buffer\n");
-               ret = PTR_ERR(bo);
-               goto unlock;
+               return PTR_ERR(bo);
        }
 
        i915_gem_object_set_cache_coherency(bo, I915_CACHE_LLC);
@@ -1541,11 +1606,7 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream)
                goto err_unpin;
        }
 
-       DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p\n",
-                        i915_ggtt_offset(stream->oa_buffer.vma),
-                        stream->oa_buffer.vaddr);
-
-       goto unlock;
+       return 0;
 
 err_unpin:
        __i915_vma_unpin(vma);
@@ -1556,55 +1617,389 @@ err_unref:
        stream->oa_buffer.vaddr = NULL;
        stream->oa_buffer.vma = NULL;
 
-unlock:
-       mutex_unlock(&dev_priv->drm.struct_mutex);
        return ret;
 }
 
-static void config_oa_regs(struct drm_i915_private *dev_priv,
-                          const struct i915_oa_reg *regs,
-                          u32 n_regs)
+static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs,
+                                 bool save, i915_reg_t reg, u32 offset,
+                                 u32 dword_count)
+{
+       u32 cmd;
+       u32 d;
+
+       cmd = save ? MI_STORE_REGISTER_MEM : MI_LOAD_REGISTER_MEM;
+       if (INTEL_GEN(stream->perf->i915) >= 8)
+               cmd++;
+
+       for (d = 0; d < dword_count; d++) {
+               *cs++ = cmd;
+               *cs++ = i915_mmio_reg_offset(reg) + 4 * d;
+               *cs++ = intel_gt_scratch_offset(stream->engine->gt,
+                                               offset) + 4 * d;
+               *cs++ = 0;
+       }
+
+       return cs;
+}
+
+static int alloc_noa_wait(struct i915_perf_stream *stream)
+{
+       struct drm_i915_private *i915 = stream->perf->i915;
+       struct drm_i915_gem_object *bo;
+       struct i915_vma *vma;
+       const u64 delay_ticks = 0xffffffffffffffff -
+               DIV64_U64_ROUND_UP(
+                       atomic64_read(&stream->perf->noa_programming_delay) *
+                       RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
+                       1000000ull);
+       const u32 base = stream->engine->mmio_base;
+#define CS_GPR(x) GEN8_RING_CS_GPR(base, x)
+       u32 *batch, *ts0, *cs, *jump;
+       int ret, i;
+       enum {
+               START_TS,
+               NOW_TS,
+               DELTA_TS,
+               JUMP_PREDICATE,
+               DELTA_TARGET,
+               N_CS_GPR
+       };
+
+       bo = i915_gem_object_create_internal(i915, 4096);
+       if (IS_ERR(bo)) {
+               DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
+               return PTR_ERR(bo);
+       }
+
+       /*
+        * We pin this buffer in the GGTT because multiple OA config BOs
+        * will contain a jump to this address, so it needs to stay fixed
+        * for the lifetime of the i915/perf stream.
+        */
+       vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 0, PIN_HIGH);
+       if (IS_ERR(vma)) {
+               ret = PTR_ERR(vma);
+               goto err_unref;
+       }
+
+       batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
+       if (IS_ERR(batch)) {
+               ret = PTR_ERR(batch);
+               goto err_unpin;
+       }
+
+       /* Save registers. */
+       for (i = 0; i < N_CS_GPR; i++)
+               cs = save_restore_register(
+                       stream, cs, true /* save */, CS_GPR(i),
+                       INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
+       cs = save_restore_register(
+               stream, cs, true /* save */, MI_PREDICATE_RESULT_1,
+               INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
+
+       /* First timestamp snapshot location. */
+       ts0 = cs;
+
+       /*
+        * Initial snapshot of the timestamp register to implement the wait.
+        * We work with 32 bit values, so clear out the top 32 bits of the
+        * register because the ALU works on 64 bits.
+        */
+       *cs++ = MI_LOAD_REGISTER_IMM(1);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(START_TS)) + 4;
+       *cs++ = 0;
+       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+       *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
+       *cs++ = i915_mmio_reg_offset(CS_GPR(START_TS));
+
+       /*
+        * This is the location we're going to jump back into until the
+        * required amount of time has passed.
+        */
+       jump = cs;
+
+       /*
+        * Take another snapshot of the timestamp register. Take care to clear
+        * up the top 32bits of CS_GPR(1) as we're using it for other
+        * operations below.
+        */
+       *cs++ = MI_LOAD_REGISTER_IMM(1);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS)) + 4;
+       *cs++ = 0;
+       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+       *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
+       *cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS));
+
+       /*
+        * Do a diff between the 2 timestamps and store the result back into
+        * CS_GPR(1).
+        */
+       *cs++ = MI_MATH(5);
+       *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+       *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+       *cs++ = MI_MATH_SUB;
+       *cs++ = MI_MATH_STORE(MI_MATH_REG(DELTA_TS), MI_MATH_REG_ACCU);
+       *cs++ = MI_MATH_STORE(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
+
+       /*
+        * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
+        * timestamps have rolled over the 32 bits) into the predicate
+        * register to be used for the predicated jump.
+        */
+       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
+       *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
+
+       /* Restart from the beginning if we had timestamps roll over. */
+       *cs++ = (INTEL_GEN(i915) < 8 ?
+                MI_BATCH_BUFFER_START :
+                MI_BATCH_BUFFER_START_GEN8) |
+               MI_BATCH_PREDICATE;
+       *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4;
+       *cs++ = 0;
+
+       /*
+        * Now take the difference between the two previous timestamps and
+        * add it to:
+        *      ((1 << 64) - 1) - delay_ticks
+        *
+        * When the Carry Flag contains 1 this means the elapsed time is
+        * longer than the expected delay, and we can exit the wait loop.
+        */
+       *cs++ = MI_LOAD_REGISTER_IMM(2);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET));
+       *cs++ = lower_32_bits(delay_ticks);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET)) + 4;
+       *cs++ = upper_32_bits(delay_ticks);
+
+       *cs++ = MI_MATH(4);
+       *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(DELTA_TS));
+       *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(DELTA_TARGET));
+       *cs++ = MI_MATH_ADD;
+       *cs++ = MI_MATH_STOREINV(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
+
+       /*
+        * Transfer the result into the predicate register to be used for the
+        * predicated jump.
+        */
+       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
+       *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
+
+       /* Predicate the jump.  */
+       *cs++ = (INTEL_GEN(i915) < 8 ?
+                MI_BATCH_BUFFER_START :
+                MI_BATCH_BUFFER_START_GEN8) |
+               MI_BATCH_PREDICATE;
+       *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4;
+       *cs++ = 0;
+
+       /* Restore registers. */
+       for (i = 0; i < N_CS_GPR; i++)
+               cs = save_restore_register(
+                       stream, cs, false /* restore */, CS_GPR(i),
+                       INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
+       cs = save_restore_register(
+               stream, cs, false /* restore */, MI_PREDICATE_RESULT_1,
+               INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
+
+       /* And return to the ring. */
+       *cs++ = MI_BATCH_BUFFER_END;
+
+       GEM_BUG_ON(cs - batch > PAGE_SIZE / sizeof(*batch));
+
+       i915_gem_object_flush_map(bo);
+       i915_gem_object_unpin_map(bo);
+
+       stream->noa_wait = vma;
+       return 0;
+
+err_unpin:
+       i915_vma_unpin_and_release(&vma, 0);
+err_unref:
+       i915_gem_object_put(bo);
+       return ret;
+}
+
+static u32 *write_cs_mi_lri(u32 *cs,
+                           const struct i915_oa_reg *reg_data,
+                           u32 n_regs)
 {
        u32 i;
 
        for (i = 0; i < n_regs; i++) {
-               const struct i915_oa_reg *reg = regs + i;
+               if ((i % MI_LOAD_REGISTER_IMM_MAX_REGS) == 0) {
+                       u32 n_lri = min_t(u32,
+                                         n_regs - i,
+                                         MI_LOAD_REGISTER_IMM_MAX_REGS);
 
-               I915_WRITE(reg->addr, reg->value);
+                       *cs++ = MI_LOAD_REGISTER_IMM(n_lri);
+               }
+               *cs++ = i915_mmio_reg_offset(reg_data[i].addr);
+               *cs++ = reg_data[i].value;
        }
+
+       return cs;
 }
 
-static void delay_after_mux(void)
+static int num_lri_dwords(int num_regs)
 {
+       int count = 0;
+
+       if (num_regs > 0) {
+               count += DIV_ROUND_UP(num_regs, MI_LOAD_REGISTER_IMM_MAX_REGS);
+               count += num_regs * 2;
+       }
+
+       return count;
+}
+
+static struct i915_oa_config_bo *
+alloc_oa_config_buffer(struct i915_perf_stream *stream,
+                      struct i915_oa_config *oa_config)
+{
+       struct drm_i915_gem_object *obj;
+       struct i915_oa_config_bo *oa_bo;
+       size_t config_length = 0;
+       u32 *cs;
+       int err;
+
+       oa_bo = kzalloc(sizeof(*oa_bo), GFP_KERNEL);
+       if (!oa_bo)
+               return ERR_PTR(-ENOMEM);
+
+       config_length += num_lri_dwords(oa_config->mux_regs_len);
+       config_length += num_lri_dwords(oa_config->b_counter_regs_len);
+       config_length += num_lri_dwords(oa_config->flex_regs_len);
+       config_length += 3; /* MI_BATCH_BUFFER_START */
+       config_length = ALIGN(sizeof(u32) * config_length, I915_GTT_PAGE_SIZE);
+
+       obj = i915_gem_object_create_shmem(stream->perf->i915, config_length);
+       if (IS_ERR(obj)) {
+               err = PTR_ERR(obj);
+               goto err_free;
+       }
+
+       cs = i915_gem_object_pin_map(obj, I915_MAP_WB);
+       if (IS_ERR(cs)) {
+               err = PTR_ERR(cs);
+               goto err_oa_bo;
+       }
+
+       cs = write_cs_mi_lri(cs,
+                            oa_config->mux_regs,
+                            oa_config->mux_regs_len);
+       cs = write_cs_mi_lri(cs,
+                            oa_config->b_counter_regs,
+                            oa_config->b_counter_regs_len);
+       cs = write_cs_mi_lri(cs,
+                            oa_config->flex_regs,
+                            oa_config->flex_regs_len);
+
+       /* Jump into the active wait. */
+       *cs++ = (INTEL_GEN(stream->perf->i915) < 8 ?
+                MI_BATCH_BUFFER_START :
+                MI_BATCH_BUFFER_START_GEN8);
+       *cs++ = i915_ggtt_offset(stream->noa_wait);
+       *cs++ = 0;
+
+       i915_gem_object_flush_map(obj);
+       i915_gem_object_unpin_map(obj);
+
+       oa_bo->vma = i915_vma_instance(obj,
+                                      &stream->engine->gt->ggtt->vm,
+                                      NULL);
+       if (IS_ERR(oa_bo->vma)) {
+               err = PTR_ERR(oa_bo->vma);
+               goto err_oa_bo;
+       }
+
+       oa_bo->oa_config = i915_oa_config_get(oa_config);
+       llist_add(&oa_bo->node, &stream->oa_config_bos);
+
+       return oa_bo;
+
+err_oa_bo:
+       i915_gem_object_put(obj);
+err_free:
+       kfree(oa_bo);
+       return ERR_PTR(err);
+}
+
+static struct i915_vma *
+get_oa_vma(struct i915_perf_stream *stream, struct i915_oa_config *oa_config)
+{
+       struct i915_oa_config_bo *oa_bo;
+
        /*
-        * It apparently takes a fairly long time for a new MUX
-        * configuration to be be applied after these register writes.
-        * This delay duration was derived empirically based on the
-        * render_basic config but hopefully it covers the maximum
-        * configuration latency.
-        *
-        * As a fallback, the checks in _append_oa_reports() to skip
-        * invalid OA reports do also seem to work to discard reports
-        * generated before this config has completed - albeit not
-        * silently.
-        *
-        * Unfortunately this is essentially a magic number, since we
-        * don't currently know of a reliable mechanism for predicting
-        * how long the MUX config will take to apply and besides
-        * seeing invalid reports we don't know of a reliable way to
-        * explicitly check that the MUX config has landed.
-        *
-        * It's even possible we've miss characterized the underlying
-        * problem - it just seems like the simplest explanation why
-        * a delay at this location would mitigate any invalid reports.
+        * Look for the buffer in the already allocated BOs attached
+        * to the stream.
         */
-       usleep_range(15000, 20000);
+       llist_for_each_entry(oa_bo, stream->oa_config_bos.first, node) {
+               if (oa_bo->oa_config == oa_config &&
+                   memcmp(oa_bo->oa_config->uuid,
+                          oa_config->uuid,
+                          sizeof(oa_config->uuid)) == 0)
+                       goto out;
+       }
+
+       oa_bo = alloc_oa_config_buffer(stream, oa_config);
+       if (IS_ERR(oa_bo))
+               return ERR_CAST(oa_bo);
+
+out:
+       return i915_vma_get(oa_bo->vma);
+}
+
+static int emit_oa_config(struct i915_perf_stream *stream,
+                         struct i915_oa_config *oa_config,
+                         struct intel_context *ce)
+{
+       struct i915_request *rq;
+       struct i915_vma *vma;
+       int err;
+
+       vma = get_oa_vma(stream, oa_config);
+       if (IS_ERR(vma))
+               return PTR_ERR(vma);
+
+       err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
+       if (err)
+               goto err_vma_put;
+
+       rq = i915_request_create(ce);
+       if (IS_ERR(rq)) {
+               err = PTR_ERR(rq);
+               goto err_vma_unpin;
+       }
+
+       i915_vma_lock(vma);
+       err = i915_request_await_object(rq, vma->obj, 0);
+       if (!err)
+               err = i915_vma_move_to_active(vma, rq, 0);
+       i915_vma_unlock(vma);
+       if (err)
+               goto err_add_request;
+
+       err = rq->engine->emit_bb_start(rq,
+                                       vma->node.start, 0,
+                                       I915_DISPATCH_SECURE);
+err_add_request:
+       i915_request_add(rq);
+err_vma_unpin:
+       i915_vma_unpin(vma);
+err_vma_put:
+       i915_vma_put(vma);
+       return err;
+}
+
+static struct intel_context *oa_context(struct i915_perf_stream *stream)
+{
+       return stream->pinned_ctx ?: stream->engine->kernel_context;
 }
 
 static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
-       const struct i915_oa_config *oa_config = stream->oa_config;
+       struct intel_uncore *uncore = stream->uncore;
 
        /*
         * PRM:
@@ -1616,31 +2011,24 @@ static int hsw_enable_metric_set(struct i915_perf_stream *stream)
         * count the events from non-render domain. Unit level clock
         * gating for RCS should also be disabled.
         */
-       I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
-                                   ~GEN7_DOP_CLOCK_GATE_ENABLE));
-       I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) |
-                                 GEN6_CSUNIT_CLOCK_GATE_DISABLE));
-
-       config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
-       delay_after_mux();
-
-       config_oa_regs(dev_priv, oa_config->b_counter_regs,
-                      oa_config->b_counter_regs_len);
+       intel_uncore_rmw(uncore, GEN7_MISCCPCTL,
+                        GEN7_DOP_CLOCK_GATE_ENABLE, 0);
+       intel_uncore_rmw(uncore, GEN6_UCGCTL1,
+                        0, GEN6_CSUNIT_CLOCK_GATE_DISABLE);
 
-       return 0;
+       return emit_oa_config(stream, stream->oa_config, oa_context(stream));
 }
 
 static void hsw_disable_metric_set(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct intel_uncore *uncore = stream->uncore;
 
-       I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) &
-                                 ~GEN6_CSUNIT_CLOCK_GATE_DISABLE));
-       I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) |
-                                   GEN7_DOP_CLOCK_GATE_ENABLE));
+       intel_uncore_rmw(uncore, GEN6_UCGCTL1,
+                        GEN6_CSUNIT_CLOCK_GATE_DISABLE, 0);
+       intel_uncore_rmw(uncore, GEN7_MISCCPCTL,
+                        0, GEN7_DOP_CLOCK_GATE_ENABLE);
 
-       I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
-                                     ~GT_NOA_ENABLE));
+       intel_uncore_rmw(uncore, GDT_CHICKEN_BITS, GT_NOA_ENABLE, 0);
 }
 
 static u32 oa_config_flex_reg(const struct i915_oa_config *oa_config,
@@ -1672,14 +2060,11 @@ static u32 oa_config_flex_reg(const struct i915_oa_config *oa_config,
  * in the case that the OA unit has been disabled.
  */
 static void
-gen8_update_reg_state_unlocked(struct i915_perf_stream *stream,
-                              struct intel_context *ce,
-                              u32 *reg_state,
-                              const struct i915_oa_config *oa_config)
-{
-       struct drm_i915_private *i915 = ce->engine->i915;
-       u32 ctx_oactxctrl = i915->perf.ctx_oactxctrl_offset;
-       u32 ctx_flexeu0 = i915->perf.ctx_flexeu0_offset;
+gen8_update_reg_state_unlocked(const struct intel_context *ce,
+                              const struct i915_perf_stream *stream)
+{
+       u32 ctx_oactxctrl = stream->perf->ctx_oactxctrl_offset;
+       u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset;
        /* The MMIO offsets for Flex EU registers aren't contiguous */
        i915_reg_t flex_regs[] = {
                EU_PERF_CNTL0,
@@ -1690,21 +2075,28 @@ gen8_update_reg_state_unlocked(struct i915_perf_stream *stream,
                EU_PERF_CNTL5,
                EU_PERF_CNTL6,
        };
+       u32 *reg_state = ce->lrc_reg_state;
        int i;
 
-       CTX_REG(reg_state, ctx_oactxctrl, GEN8_OACTXCONTROL,
-               (stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
-               (stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
-               GEN8_OA_COUNTER_RESUME);
+       if (IS_GEN(stream->perf->i915, 12)) {
+               u32 format = stream->oa_buffer.format;
 
-       for (i = 0; i < ARRAY_SIZE(flex_regs); i++) {
-               CTX_REG(reg_state, ctx_flexeu0 + i * 2, flex_regs[i],
-                       oa_config_flex_reg(oa_config, flex_regs[i]));
+               reg_state[ctx_oactxctrl + 1] =
+                       (format << GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT) |
+                       (stream->oa_config ? GEN12_OAR_OACONTROL_COUNTER_ENABLE : 0);
+       } else {
+               reg_state[ctx_oactxctrl + 1] =
+                       (stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
+                       (stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
+                       GEN8_OA_COUNTER_RESUME;
        }
 
-       CTX_REG(reg_state,
-               CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
-               intel_sseu_make_rpcs(i915, &ce->sseu));
+       for (i = 0; !!ctx_flexeu0 && i < ARRAY_SIZE(flex_regs); i++)
+               reg_state[ctx_flexeu0 + i * 2 + 1] =
+                       oa_config_flex_reg(stream->oa_config, flex_regs[i]);
+
+       reg_state[CTX_R_PWR_CLK_STATE] =
+               intel_sseu_make_rpcs(ce->engine->i915, &ce->sseu);
 }
 
 struct flex {
@@ -1728,7 +2120,7 @@ gen8_store_flex(struct i915_request *rq,
        offset = i915_ggtt_offset(ce->state) + LRC_STATE_PN * PAGE_SIZE;
        do {
                *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
-               *cs++ = offset + (flex->offset + 1) * sizeof(u32);
+               *cs++ = offset + flex->offset * sizeof(u32);
                *cs++ = 0;
                *cs++ = flex->value;
        } while (flex++, --count);
@@ -1832,6 +2224,36 @@ static int gen8_configure_context(struct i915_gem_context *ctx,
        return err;
 }
 
+static int gen12_emit_oar_config(struct intel_context *ce, bool enable)
+{
+       struct i915_request *rq;
+       u32 *cs;
+       int err = 0;
+
+       rq = i915_request_create(ce);
+       if (IS_ERR(rq))
+               return PTR_ERR(rq);
+
+       cs = intel_ring_begin(rq, 4);
+       if (IS_ERR(cs)) {
+               err = PTR_ERR(cs);
+               goto out;
+       }
+
+       *cs++ = MI_LOAD_REGISTER_IMM(1);
+       *cs++ = i915_mmio_reg_offset(RING_CONTEXT_CONTROL(ce->engine->mmio_base));
+       *cs++ = _MASKED_FIELD(GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE,
+                             enable ? GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE : 0);
+       *cs++ = MI_NOOP;
+
+       intel_ring_advance(rq, cs);
+
+out:
+       i915_request_add(rq);
+
+       return err;
+}
+
 /*
  * Manages updating the per-context aspects of the OA stream
  * configuration across all contexts.
@@ -1856,24 +2278,22 @@ static int gen8_configure_context(struct i915_gem_context *ctx,
  *
  * Note: it's only the RCS/Render context that has any OA state.
  */
-static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
-                                      const struct i915_oa_config *oa_config)
+static int lrc_configure_all_contexts(struct i915_perf_stream *stream,
+                                     const struct i915_oa_config *oa_config)
 {
-       struct drm_i915_private *i915 = stream->dev_priv;
+       struct drm_i915_private *i915 = stream->perf->i915;
        /* The MMIO offsets for Flex EU registers aren't contiguous */
-       const u32 ctx_flexeu0 = i915->perf.ctx_flexeu0_offset;
-#define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N))
+       const u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset;
+#define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N) + 1)
        struct flex regs[] = {
                {
                        GEN8_R_PWR_CLK_STATE,
                        CTX_R_PWR_CLK_STATE,
                },
                {
-                       GEN8_OACTXCONTROL,
-                       i915->perf.ctx_oactxctrl_offset,
-                       ((stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
-                        (stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
-                        GEN8_OA_COUNTER_RESUME)
+                       IS_GEN(i915, 12) ?
+                       GEN12_OAR_OACONTROL : GEN8_OACTXCONTROL,
+                       stream->perf->ctx_oactxctrl_offset + 1,
                },
                { EU_PERF_CNTL0, ctx_flexeuN(0) },
                { EU_PERF_CNTL1, ctx_flexeuN(1) },
@@ -1885,13 +2305,27 @@ static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
        };
 #undef ctx_flexeuN
        struct intel_engine_cs *engine;
-       struct i915_gem_context *ctx;
-       int i;
+       struct i915_gem_context *ctx, *cn;
+       size_t array_size = IS_GEN(i915, 12) ? 2 : ARRAY_SIZE(regs);
+       int i, err;
 
-       for (i = 2; i < ARRAY_SIZE(regs); i++)
+       if (IS_GEN(i915, 12)) {
+               u32 format = stream->oa_buffer.format;
+
+               regs[1].value =
+                       (format << GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT) |
+                       (oa_config ? GEN12_OAR_OACONTROL_COUNTER_ENABLE : 0);
+       } else {
+               regs[1].value =
+                       (stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
+                       (stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
+                       GEN8_OA_COUNTER_RESUME;
+       }
+
+       for (i = 2; !!ctx_flexeu0 && i < array_size; i++)
                regs[i].value = oa_config_flex_reg(oa_config, regs[i].reg);
 
-       lockdep_assert_held(&i915->drm.struct_mutex);
+       lockdep_assert_held(&stream->perf->lock);
 
        /*
         * The OA register config is setup through the context image. This image
@@ -1909,16 +2343,27 @@ static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
         * context. Contexts idle at the time of reconfiguration are not
         * trapped behind the barrier.
         */
-       list_for_each_entry(ctx, &i915->contexts.list, link) {
-               int err;
-
+       spin_lock(&i915->gem.contexts.lock);
+       list_for_each_entry_safe(ctx, cn, &i915->gem.contexts.list, link) {
                if (ctx == i915->kernel_context)
                        continue;
 
-               err = gen8_configure_context(ctx, regs, ARRAY_SIZE(regs));
-               if (err)
+               if (!kref_get_unless_zero(&ctx->ref))
+                       continue;
+
+               spin_unlock(&i915->gem.contexts.lock);
+
+               err = gen8_configure_context(ctx, regs, array_size);
+               if (err) {
+                       i915_gem_context_put(ctx);
                        return err;
+               }
+
+               spin_lock(&i915->gem.contexts.lock);
+               list_safe_reset_next(ctx, cn, link);
+               i915_gem_context_put(ctx);
        }
+       spin_unlock(&i915->gem.contexts.lock);
 
        /*
         * After updating all other contexts, we need to modify ourselves.
@@ -1927,14 +2372,13 @@ static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
         */
        for_each_uabi_engine(engine, i915) {
                struct intel_context *ce = engine->kernel_context;
-               int err;
 
                if (engine->class != RENDER_CLASS)
                        continue;
 
                regs[0].value = intel_sseu_make_rpcs(i915, &ce->sseu);
 
-               err = gen8_modify_self(ce, regs, ARRAY_SIZE(regs));
+               err = gen8_modify_self(ce, regs, array_size);
                if (err)
                        return err;
        }
@@ -1944,8 +2388,8 @@ static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
 
 static int gen8_enable_metric_set(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
-       const struct i915_oa_config *oa_config = stream->oa_config;
+       struct intel_uncore *uncore = stream->uncore;
+       struct i915_oa_config *oa_config = stream->oa_config;
        int ret;
 
        /*
@@ -1971,10 +2415,10 @@ static int gen8_enable_metric_set(struct i915_perf_stream *stream)
         * be read back from automatically triggered reports, as part of the
         * RPT_ID field.
         */
-       if (IS_GEN_RANGE(dev_priv, 9, 11)) {
-               I915_WRITE(GEN8_OA_DEBUG,
-                          _MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
-                                             GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
+       if (IS_GEN_RANGE(stream->perf->i915, 9, 11)) {
+               intel_uncore_write(uncore, GEN8_OA_DEBUG,
+                                  _MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
+                                                     GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
        }
 
        /*
@@ -1982,45 +2426,102 @@ static int gen8_enable_metric_set(struct i915_perf_stream *stream)
         * to make sure all slices/subslices are ON before writing to NOA
         * registers.
         */
-       ret = gen8_configure_all_contexts(stream, oa_config);
+       ret = lrc_configure_all_contexts(stream, oa_config);
        if (ret)
                return ret;
 
-       config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
-       delay_after_mux();
+       return emit_oa_config(stream, oa_config, oa_context(stream));
+}
+
+static int gen12_enable_metric_set(struct i915_perf_stream *stream)
+{
+       struct intel_uncore *uncore = stream->uncore;
+       struct i915_oa_config *oa_config = stream->oa_config;
+       bool periodic = stream->periodic;
+       u32 period_exponent = stream->period_exponent;
+       int ret;
 
-       config_oa_regs(dev_priv, oa_config->b_counter_regs,
-                      oa_config->b_counter_regs_len);
+       intel_uncore_write(uncore, GEN12_OAG_OA_DEBUG,
+                          /* Disable clk ratio reports, like previous Gens. */
+                          _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
+                                             GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) |
+                          /*
+                           * If the user didn't require OA reports, instruct the
+                           * hardware not to emit ctx switch reports.
+                           */
+                          !(stream->sample_flags & SAMPLE_OA_REPORT) ?
+                          _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS) :
+                          _MASKED_BIT_DISABLE(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS));
+
+       intel_uncore_write(uncore, GEN12_OAG_OAGLBCTXCTRL, periodic ?
+                          (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME |
+                           GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE |
+                           (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
+                           : 0);
 
-       return 0;
+       /*
+        * Update all contexts prior to writing the mux configurations as we need
+        * to make sure all slices/subslices are ON before writing to NOA
+        * registers.
+        */
+       ret = lrc_configure_all_contexts(stream, oa_config);
+       if (ret)
+               return ret;
+
+       /*
+        * For Gen12, performance counters are context
+        * saved/restored. Only enable it for the context that
+        * requested this.
+        */
+       if (stream->ctx) {
+               ret = gen12_emit_oar_config(stream->pinned_ctx,
+                                           oa_config != NULL);
+               if (ret)
+                       return ret;
+       }
+
+       return emit_oa_config(stream, oa_config, oa_context(stream));
 }
 
 static void gen8_disable_metric_set(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct intel_uncore *uncore = stream->uncore;
 
        /* Reset all contexts' slices/subslices configurations. */
-       gen8_configure_all_contexts(stream, NULL);
+       lrc_configure_all_contexts(stream, NULL);
 
-       I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
-                                     ~GT_NOA_ENABLE));
+       intel_uncore_rmw(uncore, GDT_CHICKEN_BITS, GT_NOA_ENABLE, 0);
 }
 
 static void gen10_disable_metric_set(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct intel_uncore *uncore = stream->uncore;
+
+       /* Reset all contexts' slices/subslices configurations. */
+       lrc_configure_all_contexts(stream, NULL);
+
+       /* Make sure we disable noa to save power. */
+       intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0);
+}
+
+static void gen12_disable_metric_set(struct i915_perf_stream *stream)
+{
+       struct intel_uncore *uncore = stream->uncore;
 
        /* Reset all contexts' slices/subslices configurations. */
-       gen8_configure_all_contexts(stream, NULL);
+       lrc_configure_all_contexts(stream, NULL);
+
+       /* disable the context save/restore or OAR counters */
+       if (stream->ctx)
+               gen12_emit_oar_config(stream->pinned_ctx, false);
 
        /* Make sure we disable noa to save power. */
-       I915_WRITE(RPM_CONFIG1,
-                  I915_READ(RPM_CONFIG1) & ~GEN10_GT_NOA_ENABLE);
+       intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0);
 }
 
 static void gen7_oa_enable(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct intel_uncore *uncore = stream->uncore;
        struct i915_gem_context *ctx = stream->ctx;
        u32 ctx_id = stream->specific_ctx_id;
        bool periodic = stream->periodic;
@@ -2038,19 +2539,19 @@ static void gen7_oa_enable(struct i915_perf_stream *stream)
         */
        gen7_init_oa_buffer(stream);
 
-       I915_WRITE(GEN7_OACONTROL,
-                  (ctx_id & GEN7_OACONTROL_CTX_MASK) |
-                  (period_exponent <<
-                   GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
-                  (periodic ? GEN7_OACONTROL_TIMER_ENABLE : 0) |
-                  (report_format << GEN7_OACONTROL_FORMAT_SHIFT) |
-                  (ctx ? GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
-                  GEN7_OACONTROL_ENABLE);
+       intel_uncore_write(uncore, GEN7_OACONTROL,
+                          (ctx_id & GEN7_OACONTROL_CTX_MASK) |
+                          (period_exponent <<
+                           GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
+                          (periodic ? GEN7_OACONTROL_TIMER_ENABLE : 0) |
+                          (report_format << GEN7_OACONTROL_FORMAT_SHIFT) |
+                          (ctx ? GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
+                          GEN7_OACONTROL_ENABLE);
 }
 
 static void gen8_oa_enable(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct intel_uncore *uncore = stream->uncore;
        u32 report_format = stream->oa_buffer.format;
 
        /*
@@ -2069,9 +2570,28 @@ static void gen8_oa_enable(struct i915_perf_stream *stream)
         * filtering and instead filter on the cpu based on the context-id
         * field of reports
         */
-       I915_WRITE(GEN8_OACONTROL, (report_format <<
-                                   GEN8_OA_REPORT_FORMAT_SHIFT) |
-                                  GEN8_OA_COUNTER_ENABLE);
+       intel_uncore_write(uncore, GEN8_OACONTROL,
+                          (report_format << GEN8_OA_REPORT_FORMAT_SHIFT) |
+                          GEN8_OA_COUNTER_ENABLE);
+}
+
+static void gen12_oa_enable(struct i915_perf_stream *stream)
+{
+       struct intel_uncore *uncore = stream->uncore;
+       u32 report_format = stream->oa_buffer.format;
+
+       /*
+        * If we don't want OA reports from the OA buffer, then we don't even
+        * need to program the OAG unit.
+        */
+       if (!(stream->sample_flags & SAMPLE_OA_REPORT))
+               return;
+
+       gen12_init_oa_buffer(stream);
+
+       intel_uncore_write(uncore, GEN12_OAG_OACONTROL,
+                          (report_format << GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT) |
+                          GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE);
 }
 
 /**
@@ -2085,9 +2605,7 @@ static void gen8_oa_enable(struct i915_perf_stream *stream)
  */
 static void i915_oa_stream_enable(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
-
-       dev_priv->perf.ops.oa_enable(stream);
+       stream->perf->ops.oa_enable(stream);
 
        if (stream->periodic)
                hrtimer_start(&stream->poll_check_timer,
@@ -2097,7 +2615,7 @@ static void i915_oa_stream_enable(struct i915_perf_stream *stream)
 
 static void gen7_oa_disable(struct i915_perf_stream *stream)
 {
-       struct intel_uncore *uncore = &stream->dev_priv->uncore;
+       struct intel_uncore *uncore = stream->uncore;
 
        intel_uncore_write(uncore, GEN7_OACONTROL, 0);
        if (intel_wait_for_register(uncore,
@@ -2108,7 +2626,7 @@ static void gen7_oa_disable(struct i915_perf_stream *stream)
 
 static void gen8_oa_disable(struct i915_perf_stream *stream)
 {
-       struct intel_uncore *uncore = &stream->dev_priv->uncore;
+       struct intel_uncore *uncore = stream->uncore;
 
        intel_uncore_write(uncore, GEN8_OACONTROL, 0);
        if (intel_wait_for_register(uncore,
@@ -2117,6 +2635,18 @@ static void gen8_oa_disable(struct i915_perf_stream *stream)
                DRM_ERROR("wait for OA to be disabled timed out\n");
 }
 
+static void gen12_oa_disable(struct i915_perf_stream *stream)
+{
+       struct intel_uncore *uncore = stream->uncore;
+
+       intel_uncore_write(uncore, GEN12_OAG_OACONTROL, 0);
+       if (intel_wait_for_register(uncore,
+                                   GEN12_OAG_OACONTROL,
+                                   GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0,
+                                   50))
+               DRM_ERROR("wait for OA to be disabled timed out\n");
+}
+
 /**
  * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
  * @stream: An i915 perf stream opened for OA metrics
@@ -2127,9 +2657,7 @@ static void gen8_oa_disable(struct i915_perf_stream *stream)
  */
 static void i915_oa_stream_disable(struct i915_perf_stream *stream)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
-
-       dev_priv->perf.ops.oa_disable(stream);
+       stream->perf->ops.oa_disable(stream);
 
        if (stream->periodic)
                hrtimer_cancel(&stream->poll_check_timer);
@@ -2166,15 +2694,21 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
                               struct drm_i915_perf_open_param *param,
                               struct perf_open_properties *props)
 {
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct i915_perf *perf = stream->perf;
        int format_size;
        int ret;
 
-       /* If the sysfs metrics/ directory wasn't registered for some
+       if (!props->engine) {
+               DRM_DEBUG("OA engine not specified\n");
+               return -EINVAL;
+       }
+
+       /*
+        * If the sysfs metrics/ directory wasn't registered for some
         * reason then don't let userspace try their luck with config
         * IDs
         */
-       if (!dev_priv->perf.metrics_kobj) {
+       if (!perf->metrics_kobj) {
                DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
                return -EINVAL;
        }
@@ -2184,16 +2718,17 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
                return -EINVAL;
        }
 
-       if (!dev_priv->perf.ops.enable_metric_set) {
+       if (!perf->ops.enable_metric_set) {
                DRM_DEBUG("OA unit not supported\n");
                return -ENODEV;
        }
 
-       /* To avoid the complexity of having to accurately filter
+       /*
+        * To avoid the complexity of having to accurately filter
         * counter reports and marshal to the appropriate client
         * we currently only allow exclusive access
         */
-       if (dev_priv->perf.exclusive_stream) {
+       if (perf->exclusive_stream) {
                DRM_DEBUG("OA unit already in use\n");
                return -EBUSY;
        }
@@ -2203,9 +2738,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
                return -EINVAL;
        }
 
+       stream->engine = props->engine;
+       stream->uncore = stream->engine->gt->uncore;
+
        stream->sample_size = sizeof(struct drm_i915_perf_record_header);
 
-       format_size = dev_priv->perf.oa_formats[props->oa_format].size;
+       format_size = perf->oa_formats[props->oa_format].size;
 
        stream->sample_flags |= SAMPLE_OA_REPORT;
        stream->sample_size += format_size;
@@ -2214,8 +2752,10 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
        if (WARN_ON(stream->oa_buffer.format_size == 0))
                return -EINVAL;
 
+       stream->hold_preemption = props->hold_preemption;
+
        stream->oa_buffer.format =
-               dev_priv->perf.oa_formats[props->oa_format].format;
+               perf->oa_formats[props->oa_format].format;
 
        stream->periodic = props->oa_periodic;
        if (stream->periodic)
@@ -2229,9 +2769,16 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
                }
        }
 
-       ret = get_oa_config(dev_priv, props->metrics_set, &stream->oa_config);
+       ret = alloc_noa_wait(stream);
        if (ret) {
+               DRM_DEBUG("Unable to allocate NOA wait batch buffer\n");
+               goto err_noa_wait_alloc;
+       }
+
+       stream->oa_config = i915_perf_get_oa_config(perf, props->metrics_set);
+       if (!stream->oa_config) {
                DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
+               ret = -EINVAL;
                goto err_config;
        }
 
@@ -2247,27 +2794,24 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
         *   In our case we are expecting that taking pm + FORCEWAKE
         *   references will effectively disable RC6.
         */
-       stream->wakeref = intel_runtime_pm_get(&dev_priv->runtime_pm);
-       intel_uncore_forcewake_get(&dev_priv->uncore, FORCEWAKE_ALL);
+       intel_engine_pm_get(stream->engine);
+       intel_uncore_forcewake_get(stream->uncore, FORCEWAKE_ALL);
 
        ret = alloc_oa_buffer(stream);
        if (ret)
                goto err_oa_buf_alloc;
 
-       ret = i915_mutex_lock_interruptible(&dev_priv->drm);
-       if (ret)
-               goto err_lock;
-
        stream->ops = &i915_oa_stream_ops;
-       dev_priv->perf.exclusive_stream = stream;
+       perf->exclusive_stream = stream;
 
-       ret = dev_priv->perf.ops.enable_metric_set(stream);
+       ret = perf->ops.enable_metric_set(stream);
        if (ret) {
                DRM_DEBUG("Unable to enable metric set\n");
                goto err_enable;
        }
 
-       mutex_unlock(&dev_priv->drm.struct_mutex);
+       DRM_DEBUG("opening stream oa config uuid=%s\n",
+                 stream->oa_config->uuid);
 
        hrtimer_init(&stream->poll_check_timer,
                     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -2278,38 +2822,40 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
        return 0;
 
 err_enable:
-       dev_priv->perf.exclusive_stream = NULL;
-       dev_priv->perf.ops.disable_metric_set(stream);
-       mutex_unlock(&dev_priv->drm.struct_mutex);
+       perf->exclusive_stream = NULL;
+       perf->ops.disable_metric_set(stream);
 
-err_lock:
        free_oa_buffer(stream);
 
 err_oa_buf_alloc:
-       put_oa_config(dev_priv, stream->oa_config);
+       free_oa_configs(stream);
 
-       intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
-       intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref);
+       intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
+       intel_engine_pm_put(stream->engine);
 
 err_config:
+       free_noa_wait(stream);
+
+err_noa_wait_alloc:
        if (stream->ctx)
                oa_put_render_ctx_id(stream);
 
        return ret;
 }
 
-void i915_oa_init_reg_state(struct intel_engine_cs *engine,
-                           struct intel_context *ce,
-                           u32 *regs)
+void i915_oa_init_reg_state(const struct intel_context *ce,
+                           const struct intel_engine_cs *engine)
 {
        struct i915_perf_stream *stream;
 
+       /* perf.exclusive_stream serialised by lrc_configure_all_contexts() */
+
        if (engine->class != RENDER_CLASS)
                return;
 
        stream = engine->i915->perf.exclusive_stream;
        if (stream)
-               gen8_update_reg_state_unlocked(stream, ce, regs, stream->oa_config);
+               gen8_update_reg_state_unlocked(ce, stream);
 }
 
 /**
@@ -2379,7 +2925,7 @@ static ssize_t i915_perf_read(struct file *file,
                              loff_t *ppos)
 {
        struct i915_perf_stream *stream = file->private_data;
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct i915_perf *perf = stream->perf;
        ssize_t ret;
 
        /* To ensure it's handled consistently we simply treat all reads of a
@@ -2402,15 +2948,15 @@ static ssize_t i915_perf_read(struct file *file,
                        if (ret)
                                return ret;
 
-                       mutex_lock(&dev_priv->perf.lock);
+                       mutex_lock(&perf->lock);
                        ret = i915_perf_read_locked(stream, file,
                                                    buf, count, ppos);
-                       mutex_unlock(&dev_priv->perf.lock);
+                       mutex_unlock(&perf->lock);
                } while (ret == -EAGAIN);
        } else {
-               mutex_lock(&dev_priv->perf.lock);
+               mutex_lock(&perf->lock);
                ret = i915_perf_read_locked(stream, file, buf, count, ppos);
-               mutex_unlock(&dev_priv->perf.lock);
+               mutex_unlock(&perf->lock);
        }
 
        /* We allow the poll checking to sometimes report false positive EPOLLIN
@@ -2448,7 +2994,6 @@ static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
 
 /**
  * i915_perf_poll_locked - poll_wait() with a suitable wait queue for stream
- * @dev_priv: i915 device instance
  * @stream: An i915 perf stream
  * @file: An i915 perf stream file
  * @wait: poll() state table
@@ -2457,15 +3002,14 @@ static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
  * &i915_perf_stream_ops->poll_wait to call poll_wait() with a wait queue that
  * will be woken for new stream data.
  *
- * Note: The &drm_i915_private->perf.lock mutex has been taken to serialize
+ * Note: The &perf->lock mutex has been taken to serialize
  * with any non-file-operation driver hooks.
  *
  * Returns: any poll events that are ready without sleeping
  */
-static __poll_t i915_perf_poll_locked(struct drm_i915_private *dev_priv,
-                                         struct i915_perf_stream *stream,
-                                         struct file *file,
-                                         poll_table *wait)
+static __poll_t i915_perf_poll_locked(struct i915_perf_stream *stream,
+                                     struct file *file,
+                                     poll_table *wait)
 {
        __poll_t events = 0;
 
@@ -2499,12 +3043,12 @@ static __poll_t i915_perf_poll_locked(struct drm_i915_private *dev_priv,
 static __poll_t i915_perf_poll(struct file *file, poll_table *wait)
 {
        struct i915_perf_stream *stream = file->private_data;
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct i915_perf *perf = stream->perf;
        __poll_t ret;
 
-       mutex_lock(&dev_priv->perf.lock);
-       ret = i915_perf_poll_locked(dev_priv, stream, file, wait);
-       mutex_unlock(&dev_priv->perf.lock);
+       mutex_lock(&perf->lock);
+       ret = i915_perf_poll_locked(stream, file, wait);
+       mutex_unlock(&perf->lock);
 
        return ret;
 }
@@ -2529,6 +3073,9 @@ static void i915_perf_enable_locked(struct i915_perf_stream *stream)
 
        if (stream->ops->enable)
                stream->ops->enable(stream);
+
+       if (stream->hold_preemption)
+               i915_gem_context_set_nopreempt(stream->ctx);
 }
 
 /**
@@ -2553,17 +3100,54 @@ static void i915_perf_disable_locked(struct i915_perf_stream *stream)
        /* Allow stream->ops->disable() to refer to this */
        stream->enabled = false;
 
+       if (stream->hold_preemption)
+               i915_gem_context_clear_nopreempt(stream->ctx);
+
        if (stream->ops->disable)
                stream->ops->disable(stream);
 }
 
+static long i915_perf_config_locked(struct i915_perf_stream *stream,
+                                   unsigned long metrics_set)
+{
+       struct i915_oa_config *config;
+       long ret = stream->oa_config->id;
+
+       config = i915_perf_get_oa_config(stream->perf, metrics_set);
+       if (!config)
+               return -EINVAL;
+
+       if (config != stream->oa_config) {
+               int err;
+
+               /*
+                * If OA is bound to a specific context, emit the
+                * reconfiguration inline from that context. The update
+                * will then be ordered with respect to submission on that
+                * context.
+                *
+                * When set globally, we use a low priority kernel context,
+                * so it will effectively take effect when idle.
+                */
+               err = emit_oa_config(stream, config, oa_context(stream));
+               if (err == 0)
+                       config = xchg(&stream->oa_config, config);
+               else
+                       ret = err;
+       }
+
+       i915_oa_config_put(config);
+
+       return ret;
+}
+
 /**
  * i915_perf_ioctl - support ioctl() usage with i915 perf stream FDs
  * @stream: An i915 perf stream
  * @cmd: the ioctl request
  * @arg: the ioctl data
  *
- * Note: The &drm_i915_private->perf.lock mutex has been taken to serialize
+ * Note: The &perf->lock mutex has been taken to serialize
  * with any non-file-operation driver hooks.
  *
  * Returns: zero on success or a negative error code. Returns -EINVAL for
@@ -2580,6 +3164,8 @@ static long i915_perf_ioctl_locked(struct i915_perf_stream *stream,
        case I915_PERF_IOCTL_DISABLE:
                i915_perf_disable_locked(stream);
                return 0;
+       case I915_PERF_IOCTL_CONFIG:
+               return i915_perf_config_locked(stream, arg);
        }
 
        return -EINVAL;
@@ -2601,12 +3187,12 @@ static long i915_perf_ioctl(struct file *file,
                            unsigned long arg)
 {
        struct i915_perf_stream *stream = file->private_data;
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct i915_perf *perf = stream->perf;
        long ret;
 
-       mutex_lock(&dev_priv->perf.lock);
+       mutex_lock(&perf->lock);
        ret = i915_perf_ioctl_locked(stream, cmd, arg);
-       mutex_unlock(&dev_priv->perf.lock);
+       mutex_unlock(&perf->lock);
 
        return ret;
 }
@@ -2618,7 +3204,7 @@ static long i915_perf_ioctl(struct file *file,
  * Frees all resources associated with the given i915 perf @stream, disabling
  * any associated data capture in the process.
  *
- * Note: The &drm_i915_private->perf.lock mutex has been taken to serialize
+ * Note: The &perf->lock mutex has been taken to serialize
  * with any non-file-operation driver hooks.
  */
 static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
@@ -2629,8 +3215,6 @@ static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
        if (stream->ops->destroy)
                stream->ops->destroy(stream);
 
-       list_del(&stream->link);
-
        if (stream->ctx)
                i915_gem_context_put(stream->ctx);
 
@@ -2651,14 +3235,14 @@ static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
 static int i915_perf_release(struct inode *inode, struct file *file)
 {
        struct i915_perf_stream *stream = file->private_data;
-       struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct i915_perf *perf = stream->perf;
 
-       mutex_lock(&dev_priv->perf.lock);
+       mutex_lock(&perf->lock);
        i915_perf_destroy_locked(stream);
-       mutex_unlock(&dev_priv->perf.lock);
+       mutex_unlock(&perf->lock);
 
        /* Release the reference the perf stream kept on the driver. */
-       drm_dev_put(&dev_priv->drm);
+       drm_dev_put(&perf->i915->drm);
 
        return 0;
 }
@@ -2680,7 +3264,7 @@ static const struct file_operations fops = {
 
 /**
  * i915_perf_open_ioctl_locked - DRM ioctl() for userspace to open a stream FD
- * @dev_priv: i915 device instance
+ * @perf: i915 perf instance
  * @param: The open parameters passed to 'DRM_I915_PERF_OPEN`
  * @props: individually validated u64 property value pairs
  * @file: drm file
@@ -2688,7 +3272,7 @@ static const struct file_operations fops = {
  * See i915_perf_ioctl_open() for interface details.
  *
  * Implements further stream config validation and stream initialization on
- * behalf of i915_perf_open_ioctl() with the &drm_i915_private->perf.lock mutex
+ * behalf of i915_perf_open_ioctl() with the &perf->lock mutex
  * taken to serialize with any non-file-operation driver hooks.
  *
  * Note: at this point the @props have only been validated in isolation and
@@ -2703,7 +3287,7 @@ static const struct file_operations fops = {
  * Returns: zero on success or a negative error code.
  */
 static int
-i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
+i915_perf_open_ioctl_locked(struct i915_perf *perf,
                            struct drm_i915_perf_open_param *param,
                            struct perf_open_properties *props,
                            struct drm_file *file)
@@ -2734,17 +3318,34 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
         * rest of the system, which we consider acceptable for a
         * non-privileged client.
         *
-        * For Gen8+ the OA unit no longer supports clock gating off for a
+        * For Gen8->11 the OA unit no longer supports clock gating off for a
         * specific context and the kernel can't securely stop the counters
         * from updating as system-wide / global values. Even though we can
         * filter reports based on the included context ID we can't block
         * clients from seeing the raw / global counter values via
         * MI_REPORT_PERF_COUNT commands and so consider it a privileged op to
         * enable the OA unit by default.
+        *
+        * For Gen12+ we gain a new OAR unit that only monitors the RCS on a
+        * per context basis. So we can relax requirements there if the user
+        * doesn't request global stream access (i.e. query based sampling
+        * using MI_REPORT_PERF_COUNT).
         */
-       if (IS_HASWELL(dev_priv) && specific_ctx)
+       if (IS_HASWELL(perf->i915) && specific_ctx)
+               privileged_op = false;
+       else if (IS_GEN(perf->i915, 12) && specific_ctx &&
+                (props->sample_flags & SAMPLE_OA_REPORT) == 0)
                privileged_op = false;
 
+       if (props->hold_preemption) {
+               if (!props->single_context) {
+                       DRM_DEBUG("preemption disable with no context\n");
+                       ret = -EINVAL;
+                       goto err;
+               }
+               privileged_op = true;
+       }
+
        /* Similar to perf's kernel.perf_paranoid_cpu sysctl option
         * we check a dev.i915.perf_stream_paranoid sysctl option
         * to determine if it's ok to access system wide OA counters
@@ -2752,7 +3353,7 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
         */
        if (privileged_op &&
            i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
-               DRM_DEBUG("Insufficient privileges to open system-wide i915 perf stream\n");
+               DRM_DEBUG("Insufficient privileges to open i915 perf stream\n");
                ret = -EACCES;
                goto err_ctx;
        }
@@ -2763,7 +3364,7 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
                goto err_ctx;
        }
 
-       stream->dev_priv = dev_priv;
+       stream->perf = perf;
        stream->ctx = specific_ctx;
 
        ret = i915_oa_stream_init(stream, param, props);
@@ -2779,8 +3380,6 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
                goto err_flags;
        }
 
-       list_add(&stream->link, &dev_priv->perf.streams);
-
        if (param->flags & I915_PERF_FLAG_FD_CLOEXEC)
                f_flags |= O_CLOEXEC;
        if (param->flags & I915_PERF_FLAG_FD_NONBLOCK)
@@ -2789,7 +3388,7 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
        stream_fd = anon_inode_getfd("[i915_perf]", &fops, stream, f_flags);
        if (stream_fd < 0) {
                ret = stream_fd;
-               goto err_open;
+               goto err_flags;
        }
 
        if (!(param->flags & I915_PERF_FLAG_DISABLED))
@@ -2798,12 +3397,10 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
        /* Take a reference on the driver that will be kept with stream_fd
         * until its release.
         */
-       drm_dev_get(&dev_priv->drm);
+       drm_dev_get(&perf->i915->drm);
 
        return stream_fd;
 
-err_open:
-       list_del(&stream->link);
 err_flags:
        if (stream->ops->destroy)
                stream->ops->destroy(stream);
@@ -2816,15 +3413,15 @@ err:
        return ret;
 }
 
-static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
+static u64 oa_exponent_to_ns(struct i915_perf *perf, int exponent)
 {
        return div64_u64(1000000000ULL * (2ULL << exponent),
-                        1000ULL * RUNTIME_INFO(dev_priv)->cs_timestamp_frequency_khz);
+                        1000ULL * RUNTIME_INFO(perf->i915)->cs_timestamp_frequency_khz);
 }
 
 /**
  * read_properties_unlocked - validate + copy userspace stream open properties
- * @dev_priv: i915 device instance
+ * @perf: i915 perf instance
  * @uprops: The array of u64 key value pairs given by userspace
  * @n_props: The number of key value pairs expected in @uprops
  * @props: The stream configuration built up while validating properties
@@ -2837,7 +3434,7 @@ static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
  * we shouldn't validate or assume anything about ordering here. This doesn't
  * rule out defining new properties with ordering requirements in the future.
  */
-static int read_properties_unlocked(struct drm_i915_private *dev_priv,
+static int read_properties_unlocked(struct i915_perf *perf,
                                    u64 __user *uprops,
                                    u32 n_props,
                                    struct perf_open_properties *props)
@@ -2852,6 +3449,15 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
                return -EINVAL;
        }
 
+       /* At the moment we only support using i915-perf on the RCS. */
+       props->engine = intel_engine_lookup_user(perf->i915,
+                                                I915_ENGINE_CLASS_RENDER,
+                                                0);
+       if (!props->engine) {
+               DRM_DEBUG("No RENDER-capable engines\n");
+               return -EINVAL;
+       }
+
        /* Considering that ID = 0 is reserved and assuming that we don't
         * (currently) expect any configurations to ever specify duplicate
         * values for a particular property ID then the last _PROP_MAX value is
@@ -2903,7 +3509,7 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
                                          value);
                                return -EINVAL;
                        }
-                       if (!dev_priv->perf.oa_formats[value].size) {
+                       if (!perf->oa_formats[value].size) {
                                DRM_DEBUG("Unsupported OA report format %llu\n",
                                          value);
                                return -EINVAL;
@@ -2924,7 +3530,7 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
                         */
 
                        BUILD_BUG_ON(sizeof(oa_period) != 8);
-                       oa_period = oa_exponent_to_ns(dev_priv, value);
+                       oa_period = oa_exponent_to_ns(perf, value);
 
                        /* This check is primarily to ensure that oa_period <=
                         * UINT32_MAX (before passing to do_div which only
@@ -2949,6 +3555,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
                        props->oa_periodic = true;
                        props->oa_period_exponent = value;
                        break;
+               case DRM_I915_PERF_PROP_HOLD_PREEMPTION:
+                       props->hold_preemption = !!value;
+                       break;
                case DRM_I915_PERF_PROP_MAX:
                        MISSING_CASE(id);
                        return -EINVAL;
@@ -2978,7 +3587,7 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
  * mutex to avoid an awkward lockdep with mmap_sem.
  *
  * Most of the implementation details are handled by
- * i915_perf_open_ioctl_locked() after taking the &drm_i915_private->perf.lock
+ * i915_perf_open_ioctl_locked() after taking the &perf->lock
  * mutex for serializing with any non-file-operation driver hooks.
  *
  * Return: A newly opened i915 Perf stream file descriptor or negative
@@ -2987,13 +3596,13 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 int i915_perf_open_ioctl(struct drm_device *dev, void *data,
                         struct drm_file *file)
 {
-       struct drm_i915_private *dev_priv = dev->dev_private;
+       struct i915_perf *perf = &to_i915(dev)->perf;
        struct drm_i915_perf_open_param *param = data;
        struct perf_open_properties props;
        u32 known_open_flags;
        int ret;
 
-       if (!dev_priv->perf.initialized) {
+       if (!perf->i915) {
                DRM_DEBUG("i915 perf interface not available for this system\n");
                return -ENOTSUPP;
        }
@@ -3006,124 +3615,130 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data,
                return -EINVAL;
        }
 
-       ret = read_properties_unlocked(dev_priv,
+       ret = read_properties_unlocked(perf,
                                       u64_to_user_ptr(param->properties_ptr),
                                       param->num_properties,
                                       &props);
        if (ret)
                return ret;
 
-       mutex_lock(&dev_priv->perf.lock);
-       ret = i915_perf_open_ioctl_locked(dev_priv, param, &props, file);
-       mutex_unlock(&dev_priv->perf.lock);
+       mutex_lock(&perf->lock);
+       ret = i915_perf_open_ioctl_locked(perf, param, &props, file);
+       mutex_unlock(&perf->lock);
 
        return ret;
 }
 
 /**
  * i915_perf_register - exposes i915-perf to userspace
- * @dev_priv: i915 device instance
+ * @i915: i915 device instance
  *
  * In particular OA metric sets are advertised under a sysfs metrics/
  * directory allowing userspace to enumerate valid IDs that can be
  * used to open an i915-perf stream.
  */
-void i915_perf_register(struct drm_i915_private *dev_priv)
+void i915_perf_register(struct drm_i915_private *i915)
 {
+       struct i915_perf *perf = &i915->perf;
        int ret;
 
-       if (!dev_priv->perf.initialized)
+       if (!perf->i915)
                return;
 
        /* To be sure we're synchronized with an attempted
         * i915_perf_open_ioctl(); considering that we register after
         * being exposed to userspace.
         */
-       mutex_lock(&dev_priv->perf.lock);
+       mutex_lock(&perf->lock);
 
-       dev_priv->perf.metrics_kobj =
+       perf->metrics_kobj =
                kobject_create_and_add("metrics",
-                                      &dev_priv->drm.primary->kdev->kobj);
-       if (!dev_priv->perf.metrics_kobj)
+                                      &i915->drm.primary->kdev->kobj);
+       if (!perf->metrics_kobj)
                goto exit;
 
-       sysfs_attr_init(&dev_priv->perf.test_config.sysfs_metric_id.attr);
-
-       if (INTEL_GEN(dev_priv) >= 11) {
-               i915_perf_load_test_config_icl(dev_priv);
-       } else if (IS_CANNONLAKE(dev_priv)) {
-               i915_perf_load_test_config_cnl(dev_priv);
-       } else if (IS_COFFEELAKE(dev_priv)) {
-               if (IS_CFL_GT2(dev_priv))
-                       i915_perf_load_test_config_cflgt2(dev_priv);
-               if (IS_CFL_GT3(dev_priv))
-                       i915_perf_load_test_config_cflgt3(dev_priv);
-       } else if (IS_GEMINILAKE(dev_priv)) {
-               i915_perf_load_test_config_glk(dev_priv);
-       } else if (IS_KABYLAKE(dev_priv)) {
-               if (IS_KBL_GT2(dev_priv))
-                       i915_perf_load_test_config_kblgt2(dev_priv);
-               else if (IS_KBL_GT3(dev_priv))
-                       i915_perf_load_test_config_kblgt3(dev_priv);
-       } else if (IS_BROXTON(dev_priv)) {
-               i915_perf_load_test_config_bxt(dev_priv);
-       } else if (IS_SKYLAKE(dev_priv)) {
-               if (IS_SKL_GT2(dev_priv))
-                       i915_perf_load_test_config_sklgt2(dev_priv);
-               else if (IS_SKL_GT3(dev_priv))
-                       i915_perf_load_test_config_sklgt3(dev_priv);
-               else if (IS_SKL_GT4(dev_priv))
-                       i915_perf_load_test_config_sklgt4(dev_priv);
-       } else if (IS_CHERRYVIEW(dev_priv)) {
-               i915_perf_load_test_config_chv(dev_priv);
-       } else if (IS_BROADWELL(dev_priv)) {
-               i915_perf_load_test_config_bdw(dev_priv);
-       } else if (IS_HASWELL(dev_priv)) {
-               i915_perf_load_test_config_hsw(dev_priv);
-}
-
-       if (dev_priv->perf.test_config.id == 0)
+       sysfs_attr_init(&perf->test_config.sysfs_metric_id.attr);
+
+       if (IS_TIGERLAKE(i915)) {
+               i915_perf_load_test_config_tgl(i915);
+       } else if (INTEL_GEN(i915) >= 11) {
+               i915_perf_load_test_config_icl(i915);
+       } else if (IS_CANNONLAKE(i915)) {
+               i915_perf_load_test_config_cnl(i915);
+       } else if (IS_COFFEELAKE(i915)) {
+               if (IS_CFL_GT2(i915))
+                       i915_perf_load_test_config_cflgt2(i915);
+               if (IS_CFL_GT3(i915))
+                       i915_perf_load_test_config_cflgt3(i915);
+       } else if (IS_GEMINILAKE(i915)) {
+               i915_perf_load_test_config_glk(i915);
+       } else if (IS_KABYLAKE(i915)) {
+               if (IS_KBL_GT2(i915))
+                       i915_perf_load_test_config_kblgt2(i915);
+               else if (IS_KBL_GT3(i915))
+                       i915_perf_load_test_config_kblgt3(i915);
+       } else if (IS_BROXTON(i915)) {
+               i915_perf_load_test_config_bxt(i915);
+       } else if (IS_SKYLAKE(i915)) {
+               if (IS_SKL_GT2(i915))
+                       i915_perf_load_test_config_sklgt2(i915);
+               else if (IS_SKL_GT3(i915))
+                       i915_perf_load_test_config_sklgt3(i915);
+               else if (IS_SKL_GT4(i915))
+                       i915_perf_load_test_config_sklgt4(i915);
+       } else if (IS_CHERRYVIEW(i915)) {
+               i915_perf_load_test_config_chv(i915);
+       } else if (IS_BROADWELL(i915)) {
+               i915_perf_load_test_config_bdw(i915);
+       } else if (IS_HASWELL(i915)) {
+               i915_perf_load_test_config_hsw(i915);
+       }
+
+       if (perf->test_config.id == 0)
                goto sysfs_error;
 
-       ret = sysfs_create_group(dev_priv->perf.metrics_kobj,
-                                &dev_priv->perf.test_config.sysfs_metric);
+       ret = sysfs_create_group(perf->metrics_kobj,
+                                &perf->test_config.sysfs_metric);
        if (ret)
                goto sysfs_error;
 
-       atomic_set(&dev_priv->perf.test_config.ref_count, 1);
+       perf->test_config.perf = perf;
+       kref_init(&perf->test_config.ref);
 
        goto exit;
 
 sysfs_error:
-       kobject_put(dev_priv->perf.metrics_kobj);
-       dev_priv->perf.metrics_kobj = NULL;
+       kobject_put(perf->metrics_kobj);
+       perf->metrics_kobj = NULL;
 
 exit:
-       mutex_unlock(&dev_priv->perf.lock);
+       mutex_unlock(&perf->lock);
 }
 
 /**
  * i915_perf_unregister - hide i915-perf from userspace
- * @dev_priv: i915 device instance
+ * @i915: i915 device instance
  *
  * i915-perf state cleanup is split up into an 'unregister' and
  * 'deinit' phase where the interface is first hidden from
  * userspace by i915_perf_unregister() before cleaning up
  * remaining state in i915_perf_fini().
  */
-void i915_perf_unregister(struct drm_i915_private *dev_priv)
+void i915_perf_unregister(struct drm_i915_private *i915)
 {
-       if (!dev_priv->perf.metrics_kobj)
+       struct i915_perf *perf = &i915->perf;
+
+       if (!perf->metrics_kobj)
                return;
 
-       sysfs_remove_group(dev_priv->perf.metrics_kobj,
-                          &dev_priv->perf.test_config.sysfs_metric);
+       sysfs_remove_group(perf->metrics_kobj,
+                          &perf->test_config.sysfs_metric);
 
-       kobject_put(dev_priv->perf.metrics_kobj);
-       dev_priv->perf.metrics_kobj = NULL;
+       kobject_put(perf->metrics_kobj);
+       perf->metrics_kobj = NULL;
 }
 
-static bool gen8_is_valid_flex_addr(struct drm_i915_private *dev_priv, u32 addr)
+static bool gen8_is_valid_flex_addr(struct i915_perf *perf, u32 addr)
 {
        static const i915_reg_t flex_eu_regs[] = {
                EU_PERF_CNTL0,
@@ -3143,56 +3758,80 @@ static bool gen8_is_valid_flex_addr(struct drm_i915_private *dev_priv, u32 addr)
        return false;
 }
 
-static bool gen7_is_valid_b_counter_addr(struct drm_i915_private *dev_priv, u32 addr)
+#define ADDR_IN_RANGE(addr, start, end) \
+       ((addr) >= (start) && \
+        (addr) <= (end))
+
+#define REG_IN_RANGE(addr, start, end) \
+       ((addr) >= i915_mmio_reg_offset(start) && \
+        (addr) <= i915_mmio_reg_offset(end))
+
+#define REG_EQUAL(addr, mmio) \
+       ((addr) == i915_mmio_reg_offset(mmio))
+
+static bool gen7_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
+{
+       return REG_IN_RANGE(addr, OASTARTTRIG1, OASTARTTRIG8) ||
+              REG_IN_RANGE(addr, OAREPORTTRIG1, OAREPORTTRIG8) ||
+              REG_IN_RANGE(addr, OACEC0_0, OACEC7_1);
+}
+
+static bool gen7_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
+{
+       return REG_EQUAL(addr, HALF_SLICE_CHICKEN2) ||
+              REG_IN_RANGE(addr, MICRO_BP0_0, NOA_WRITE) ||
+              REG_IN_RANGE(addr, OA_PERFCNT1_LO, OA_PERFCNT2_HI) ||
+              REG_IN_RANGE(addr, OA_PERFMATRIX_LO, OA_PERFMATRIX_HI);
+}
+
+static bool gen8_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
-       return (addr >= i915_mmio_reg_offset(OASTARTTRIG1) &&
-               addr <= i915_mmio_reg_offset(OASTARTTRIG8)) ||
-               (addr >= i915_mmio_reg_offset(OAREPORTTRIG1) &&
-                addr <= i915_mmio_reg_offset(OAREPORTTRIG8)) ||
-               (addr >= i915_mmio_reg_offset(OACEC0_0) &&
-                addr <= i915_mmio_reg_offset(OACEC7_1));
+       return gen7_is_valid_mux_addr(perf, addr) ||
+              REG_EQUAL(addr, WAIT_FOR_RC6_EXIT) ||
+              REG_IN_RANGE(addr, RPM_CONFIG0, NOA_CONFIG(8));
 }
 
-static bool gen7_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
+static bool gen10_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
-       return addr == i915_mmio_reg_offset(HALF_SLICE_CHICKEN2) ||
-               (addr >= i915_mmio_reg_offset(MICRO_BP0_0) &&
-                addr <= i915_mmio_reg_offset(NOA_WRITE)) ||
-               (addr >= i915_mmio_reg_offset(OA_PERFCNT1_LO) &&
-                addr <= i915_mmio_reg_offset(OA_PERFCNT2_HI)) ||
-               (addr >= i915_mmio_reg_offset(OA_PERFMATRIX_LO) &&
-                addr <= i915_mmio_reg_offset(OA_PERFMATRIX_HI));
+       return gen8_is_valid_mux_addr(perf, addr) ||
+              REG_EQUAL(addr, GEN10_NOA_WRITE_HIGH) ||
+              REG_IN_RANGE(addr, OA_PERFCNT3_LO, OA_PERFCNT4_HI);
 }
 
-static bool gen8_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
+static bool hsw_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
-       return gen7_is_valid_mux_addr(dev_priv, addr) ||
-               addr == i915_mmio_reg_offset(WAIT_FOR_RC6_EXIT) ||
-               (addr >= i915_mmio_reg_offset(RPM_CONFIG0) &&
-                addr <= i915_mmio_reg_offset(NOA_CONFIG(8)));
+       return gen7_is_valid_mux_addr(perf, addr) ||
+              ADDR_IN_RANGE(addr, 0x25100, 0x2FF90) ||
+              REG_IN_RANGE(addr, HSW_MBVID2_NOA0, HSW_MBVID2_NOA9) ||
+              REG_EQUAL(addr, HSW_MBVID2_MISR0);
 }
 
-static bool gen10_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
+static bool chv_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
-       return gen8_is_valid_mux_addr(dev_priv, addr) ||
-               addr == i915_mmio_reg_offset(GEN10_NOA_WRITE_HIGH) ||
-               (addr >= i915_mmio_reg_offset(OA_PERFCNT3_LO) &&
-                addr <= i915_mmio_reg_offset(OA_PERFCNT4_HI));
+       return gen7_is_valid_mux_addr(perf, addr) ||
+              ADDR_IN_RANGE(addr, 0x182300, 0x1823A4);
 }
 
-static bool hsw_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
+static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
 {
-       return gen7_is_valid_mux_addr(dev_priv, addr) ||
-               (addr >= 0x25100 && addr <= 0x2FF90) ||
-               (addr >= i915_mmio_reg_offset(HSW_MBVID2_NOA0) &&
-                addr <= i915_mmio_reg_offset(HSW_MBVID2_NOA9)) ||
-               addr == i915_mmio_reg_offset(HSW_MBVID2_MISR0);
+       return REG_IN_RANGE(addr, GEN12_OAG_OASTARTTRIG1, GEN12_OAG_OASTARTTRIG8) ||
+              REG_IN_RANGE(addr, GEN12_OAG_OAREPORTTRIG1, GEN12_OAG_OAREPORTTRIG8) ||
+              REG_IN_RANGE(addr, GEN12_OAG_CEC0_0, GEN12_OAG_CEC7_1) ||
+              REG_IN_RANGE(addr, GEN12_OAG_SCEC0_0, GEN12_OAG_SCEC7_1) ||
+              REG_EQUAL(addr, GEN12_OAA_DBG_REG) ||
+              REG_EQUAL(addr, GEN12_OAG_OA_PESS) ||
+              REG_EQUAL(addr, GEN12_OAG_SPCTR_CNF);
 }
 
-static bool chv_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
+static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
-       return gen7_is_valid_mux_addr(dev_priv, addr) ||
-               (addr >= 0x182300 && addr <= 0x1823A4);
+       return REG_EQUAL(addr, NOA_WRITE) ||
+              REG_EQUAL(addr, GEN10_NOA_WRITE_HIGH) ||
+              REG_EQUAL(addr, GDT_CHICKEN_BITS) ||
+              REG_EQUAL(addr, WAIT_FOR_RC6_EXIT) ||
+              REG_EQUAL(addr, RPM_CONFIG0) ||
+              REG_EQUAL(addr, RPM_CONFIG1) ||
+              REG_IN_RANGE(addr, NOA_CONFIG(0), NOA_CONFIG(8));
 }
 
 static u32 mask_reg_value(u32 reg, u32 val)
@@ -3201,21 +3840,21 @@ static u32 mask_reg_value(u32 reg, u32 val)
         * WaDisableSTUnitPowerOptimization workaround. Make sure the value
         * programmed by userspace doesn't change this.
         */
-       if (i915_mmio_reg_offset(HALF_SLICE_CHICKEN2) == reg)
+       if (REG_EQUAL(reg, HALF_SLICE_CHICKEN2))
                val = val & ~_MASKED_BIT_ENABLE(GEN8_ST_PO_DISABLE);
 
        /* WAIT_FOR_RC6_EXIT has only one bit fullfilling the function
         * indicated by its name and a bunch of selection fields used by OA
         * configs.
         */
-       if (i915_mmio_reg_offset(WAIT_FOR_RC6_EXIT) == reg)
+       if (REG_EQUAL(reg, WAIT_FOR_RC6_EXIT))
                val = val & ~_MASKED_BIT_ENABLE(HSW_WAIT_FOR_RC6_EXIT_ENABLE);
 
        return val;
 }
 
-static struct i915_oa_reg *alloc_oa_regs(struct drm_i915_private *dev_priv,
-                                        bool (*is_valid)(struct drm_i915_private *dev_priv, u32 addr),
+static struct i915_oa_reg *alloc_oa_regs(struct i915_perf *perf,
+                                        bool (*is_valid)(struct i915_perf *perf, u32 addr),
                                         u32 __user *regs,
                                         u32 n_regs)
 {
@@ -3245,7 +3884,7 @@ static struct i915_oa_reg *alloc_oa_regs(struct drm_i915_private *dev_priv,
                if (err)
                        goto addr_err;
 
-               if (!is_valid(dev_priv, addr)) {
+               if (!is_valid(perf, addr)) {
                        DRM_DEBUG("Invalid oa_reg address: %X\n", addr);
                        err = -EINVAL;
                        goto addr_err;
@@ -3278,7 +3917,7 @@ static ssize_t show_dynamic_id(struct device *dev,
        return sprintf(buf, "%d\n", oa_config->id);
 }
 
-static int create_dynamic_oa_sysfs_entry(struct drm_i915_private *dev_priv,
+static int create_dynamic_oa_sysfs_entry(struct i915_perf *perf,
                                         struct i915_oa_config *oa_config)
 {
        sysfs_attr_init(&oa_config->sysfs_metric_id.attr);
@@ -3293,7 +3932,7 @@ static int create_dynamic_oa_sysfs_entry(struct drm_i915_private *dev_priv,
        oa_config->sysfs_metric.name = oa_config->uuid;
        oa_config->sysfs_metric.attrs = oa_config->attrs;
 
-       return sysfs_create_group(dev_priv->perf.metrics_kobj,
+       return sysfs_create_group(perf->metrics_kobj,
                                  &oa_config->sysfs_metric);
 }
 
@@ -3313,17 +3952,18 @@ static int create_dynamic_oa_sysfs_entry(struct drm_i915_private *dev_priv,
 int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
                               struct drm_file *file)
 {
-       struct drm_i915_private *dev_priv = dev->dev_private;
+       struct i915_perf *perf = &to_i915(dev)->perf;
        struct drm_i915_perf_oa_config *args = data;
        struct i915_oa_config *oa_config, *tmp;
+       struct i915_oa_reg *regs;
        int err, id;
 
-       if (!dev_priv->perf.initialized) {
+       if (!perf->i915) {
                DRM_DEBUG("i915 perf interface not available for this system\n");
                return -ENOTSUPP;
        }
 
-       if (!dev_priv->perf.metrics_kobj) {
+       if (!perf->metrics_kobj) {
                DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
                return -EINVAL;
        }
@@ -3346,7 +3986,8 @@ int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
                return -ENOMEM;
        }
 
-       atomic_set(&oa_config->ref_count, 1);
+       oa_config->perf = perf;
+       kref_init(&oa_config->ref);
 
        if (!uuid_is_valid(args->uuid)) {
                DRM_DEBUG("Invalid uuid format for OA config\n");
@@ -3360,59 +4001,59 @@ int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
        memcpy(oa_config->uuid, args->uuid, sizeof(args->uuid));
 
        oa_config->mux_regs_len = args->n_mux_regs;
-       oa_config->mux_regs =
-               alloc_oa_regs(dev_priv,
-                             dev_priv->perf.ops.is_valid_mux_reg,
-                             u64_to_user_ptr(args->mux_regs_ptr),
-                             args->n_mux_regs);
+       regs = alloc_oa_regs(perf,
+                            perf->ops.is_valid_mux_reg,
+                            u64_to_user_ptr(args->mux_regs_ptr),
+                            args->n_mux_regs);
 
-       if (IS_ERR(oa_config->mux_regs)) {
+       if (IS_ERR(regs)) {
                DRM_DEBUG("Failed to create OA config for mux_regs\n");
-               err = PTR_ERR(oa_config->mux_regs);
+               err = PTR_ERR(regs);
                goto reg_err;
        }
+       oa_config->mux_regs = regs;
 
        oa_config->b_counter_regs_len = args->n_boolean_regs;
-       oa_config->b_counter_regs =
-               alloc_oa_regs(dev_priv,
-                             dev_priv->perf.ops.is_valid_b_counter_reg,
-                             u64_to_user_ptr(args->boolean_regs_ptr),
-                             args->n_boolean_regs);
+       regs = alloc_oa_regs(perf,
+                            perf->ops.is_valid_b_counter_reg,
+                            u64_to_user_ptr(args->boolean_regs_ptr),
+                            args->n_boolean_regs);
 
-       if (IS_ERR(oa_config->b_counter_regs)) {
+       if (IS_ERR(regs)) {
                DRM_DEBUG("Failed to create OA config for b_counter_regs\n");
-               err = PTR_ERR(oa_config->b_counter_regs);
+               err = PTR_ERR(regs);
                goto reg_err;
        }
+       oa_config->b_counter_regs = regs;
 
-       if (INTEL_GEN(dev_priv) < 8) {
+       if (INTEL_GEN(perf->i915) < 8) {
                if (args->n_flex_regs != 0) {
                        err = -EINVAL;
                        goto reg_err;
                }
        } else {
                oa_config->flex_regs_len = args->n_flex_regs;
-               oa_config->flex_regs =
-                       alloc_oa_regs(dev_priv,
-                                     dev_priv->perf.ops.is_valid_flex_reg,
-                                     u64_to_user_ptr(args->flex_regs_ptr),
-                                     args->n_flex_regs);
+               regs = alloc_oa_regs(perf,
+                                    perf->ops.is_valid_flex_reg,
+                                    u64_to_user_ptr(args->flex_regs_ptr),
+                                    args->n_flex_regs);
 
-               if (IS_ERR(oa_config->flex_regs)) {
+               if (IS_ERR(regs)) {
                        DRM_DEBUG("Failed to create OA config for flex_regs\n");
-                       err = PTR_ERR(oa_config->flex_regs);
+                       err = PTR_ERR(regs);
                        goto reg_err;
                }
+               oa_config->flex_regs = regs;
        }
 
-       err = mutex_lock_interruptible(&dev_priv->perf.metrics_lock);
+       err = mutex_lock_interruptible(&perf->metrics_lock);
        if (err)
                goto reg_err;
 
        /* We shouldn't have too many configs, so this iteration shouldn't be
         * too costly.
         */
-       idr_for_each_entry(&dev_priv->perf.metrics_idr, tmp, id) {
+       idr_for_each_entry(&perf->metrics_idr, tmp, id) {
                if (!strcmp(tmp->uuid, oa_config->uuid)) {
                        DRM_DEBUG("OA config already exists with this uuid\n");
                        err = -EADDRINUSE;
@@ -3420,14 +4061,14 @@ int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
                }
        }
 
-       err = create_dynamic_oa_sysfs_entry(dev_priv, oa_config);
+       err = create_dynamic_oa_sysfs_entry(perf, oa_config);
        if (err) {
                DRM_DEBUG("Failed to create sysfs entry for OA config\n");
                goto sysfs_err;
        }
 
        /* Config id 0 is invalid, id 1 for kernel stored test config. */
-       oa_config->id = idr_alloc(&dev_priv->perf.metrics_idr,
+       oa_config->id = idr_alloc(&perf->metrics_idr,
                                  oa_config, 2,
                                  0, GFP_KERNEL);
        if (oa_config->id < 0) {
@@ -3436,16 +4077,16 @@ int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
                goto sysfs_err;
        }
 
-       mutex_unlock(&dev_priv->perf.metrics_lock);
+       mutex_unlock(&perf->metrics_lock);
 
        DRM_DEBUG("Added config %s id=%i\n", oa_config->uuid, oa_config->id);
 
        return oa_config->id;
 
 sysfs_err:
-       mutex_unlock(&dev_priv->perf.metrics_lock);
+       mutex_unlock(&perf->metrics_lock);
 reg_err:
-       put_oa_config(dev_priv, oa_config);
+       i915_oa_config_put(oa_config);
        DRM_DEBUG("Failed to add new OA config\n");
        return err;
 }
@@ -3464,12 +4105,12 @@ reg_err:
 int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
                                  struct drm_file *file)
 {
-       struct drm_i915_private *dev_priv = dev->dev_private;
+       struct i915_perf *perf = &to_i915(dev)->perf;
        u64 *arg = data;
        struct i915_oa_config *oa_config;
        int ret;
 
-       if (!dev_priv->perf.initialized) {
+       if (!perf->i915) {
                DRM_DEBUG("i915 perf interface not available for this system\n");
                return -ENOTSUPP;
        }
@@ -3479,31 +4120,33 @@ int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
                return -EACCES;
        }
 
-       ret = mutex_lock_interruptible(&dev_priv->perf.metrics_lock);
+       ret = mutex_lock_interruptible(&perf->metrics_lock);
        if (ret)
-               goto lock_err;
+               return ret;
 
-       oa_config = idr_find(&dev_priv->perf.metrics_idr, *arg);
+       oa_config = idr_find(&perf->metrics_idr, *arg);
        if (!oa_config) {
                DRM_DEBUG("Failed to remove unknown OA config\n");
                ret = -ENOENT;
-               goto config_err;
+               goto err_unlock;
        }
 
        GEM_BUG_ON(*arg != oa_config->id);
 
-       sysfs_remove_group(dev_priv->perf.metrics_kobj,
-                          &oa_config->sysfs_metric);
+       sysfs_remove_group(perf->metrics_kobj, &oa_config->sysfs_metric);
+
+       idr_remove(&perf->metrics_idr, *arg);
 
-       idr_remove(&dev_priv->perf.metrics_idr, *arg);
+       mutex_unlock(&perf->metrics_lock);
 
        DRM_DEBUG("Removed config %s id=%i\n", oa_config->uuid, oa_config->id);
 
-       put_oa_config(dev_priv, oa_config);
+       i915_oa_config_put(oa_config);
 
-config_err:
-       mutex_unlock(&dev_priv->perf.metrics_lock);
-lock_err:
+       return 0;
+
+err_unlock:
+       mutex_unlock(&perf->metrics_lock);
        return ret;
 }
 
@@ -3551,103 +4194,126 @@ static struct ctl_table dev_root[] = {
 
 /**
  * i915_perf_init - initialize i915-perf state on module load
- * @dev_priv: i915 device instance
+ * @i915: i915 device instance
  *
  * Initializes i915-perf state without exposing anything to userspace.
  *
  * Note: i915-perf initialization is split into an 'init' and 'register'
  * phase with the i915_perf_register() exposing state to userspace.
  */
-void i915_perf_init(struct drm_i915_private *dev_priv)
-{
-       if (IS_HASWELL(dev_priv)) {
-               dev_priv->perf.ops.is_valid_b_counter_reg =
-                       gen7_is_valid_b_counter_addr;
-               dev_priv->perf.ops.is_valid_mux_reg =
-                       hsw_is_valid_mux_addr;
-               dev_priv->perf.ops.is_valid_flex_reg = NULL;
-               dev_priv->perf.ops.enable_metric_set = hsw_enable_metric_set;
-               dev_priv->perf.ops.disable_metric_set = hsw_disable_metric_set;
-               dev_priv->perf.ops.oa_enable = gen7_oa_enable;
-               dev_priv->perf.ops.oa_disable = gen7_oa_disable;
-               dev_priv->perf.ops.read = gen7_oa_read;
-               dev_priv->perf.ops.oa_hw_tail_read =
-                       gen7_oa_hw_tail_read;
-
-               dev_priv->perf.oa_formats = hsw_oa_formats;
-       } else if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
+void i915_perf_init(struct drm_i915_private *i915)
+{
+       struct i915_perf *perf = &i915->perf;
+
+       /* XXX const struct i915_perf_ops! */
+
+       if (IS_HASWELL(i915)) {
+               perf->ops.is_valid_b_counter_reg = gen7_is_valid_b_counter_addr;
+               perf->ops.is_valid_mux_reg = hsw_is_valid_mux_addr;
+               perf->ops.is_valid_flex_reg = NULL;
+               perf->ops.enable_metric_set = hsw_enable_metric_set;
+               perf->ops.disable_metric_set = hsw_disable_metric_set;
+               perf->ops.oa_enable = gen7_oa_enable;
+               perf->ops.oa_disable = gen7_oa_disable;
+               perf->ops.read = gen7_oa_read;
+               perf->ops.oa_hw_tail_read = gen7_oa_hw_tail_read;
+
+               perf->oa_formats = hsw_oa_formats;
+       } else if (HAS_LOGICAL_RING_CONTEXTS(i915)) {
                /* Note: that although we could theoretically also support the
                 * legacy ringbuffer mode on BDW (and earlier iterations of
                 * this driver, before upstreaming did this) it didn't seem
                 * worth the complexity to maintain now that BDW+ enable
                 * execlist mode by default.
                 */
-               dev_priv->perf.oa_formats = gen8_plus_oa_formats;
+               perf->ops.read = gen8_oa_read;
 
-               dev_priv->perf.ops.oa_enable = gen8_oa_enable;
-               dev_priv->perf.ops.oa_disable = gen8_oa_disable;
-               dev_priv->perf.ops.read = gen8_oa_read;
-               dev_priv->perf.ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
+               if (IS_GEN_RANGE(i915, 8, 9)) {
+                       perf->oa_formats = gen8_plus_oa_formats;
 
-               if (IS_GEN_RANGE(dev_priv, 8, 9)) {
-                       dev_priv->perf.ops.is_valid_b_counter_reg =
+                       perf->ops.is_valid_b_counter_reg =
                                gen7_is_valid_b_counter_addr;
-                       dev_priv->perf.ops.is_valid_mux_reg =
+                       perf->ops.is_valid_mux_reg =
                                gen8_is_valid_mux_addr;
-                       dev_priv->perf.ops.is_valid_flex_reg =
+                       perf->ops.is_valid_flex_reg =
                                gen8_is_valid_flex_addr;
 
-                       if (IS_CHERRYVIEW(dev_priv)) {
-                               dev_priv->perf.ops.is_valid_mux_reg =
+                       if (IS_CHERRYVIEW(i915)) {
+                               perf->ops.is_valid_mux_reg =
                                        chv_is_valid_mux_addr;
                        }
 
-                       dev_priv->perf.ops.enable_metric_set = gen8_enable_metric_set;
-                       dev_priv->perf.ops.disable_metric_set = gen8_disable_metric_set;
+                       perf->ops.oa_enable = gen8_oa_enable;
+                       perf->ops.oa_disable = gen8_oa_disable;
+                       perf->ops.enable_metric_set = gen8_enable_metric_set;
+                       perf->ops.disable_metric_set = gen8_disable_metric_set;
+                       perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
 
-                       if (IS_GEN(dev_priv, 8)) {
-                               dev_priv->perf.ctx_oactxctrl_offset = 0x120;
-                               dev_priv->perf.ctx_flexeu0_offset = 0x2ce;
+                       if (IS_GEN(i915, 8)) {
+                               perf->ctx_oactxctrl_offset = 0x120;
+                               perf->ctx_flexeu0_offset = 0x2ce;
 
-                               dev_priv->perf.gen8_valid_ctx_bit = BIT(25);
+                               perf->gen8_valid_ctx_bit = BIT(25);
                        } else {
-                               dev_priv->perf.ctx_oactxctrl_offset = 0x128;
-                               dev_priv->perf.ctx_flexeu0_offset = 0x3de;
+                               perf->ctx_oactxctrl_offset = 0x128;
+                               perf->ctx_flexeu0_offset = 0x3de;
 
-                               dev_priv->perf.gen8_valid_ctx_bit = BIT(16);
+                               perf->gen8_valid_ctx_bit = BIT(16);
                        }
-               } else if (IS_GEN_RANGE(dev_priv, 10, 11)) {
-                       dev_priv->perf.ops.is_valid_b_counter_reg =
+               } else if (IS_GEN_RANGE(i915, 10, 11)) {
+                       perf->oa_formats = gen8_plus_oa_formats;
+
+                       perf->ops.is_valid_b_counter_reg =
                                gen7_is_valid_b_counter_addr;
-                       dev_priv->perf.ops.is_valid_mux_reg =
+                       perf->ops.is_valid_mux_reg =
                                gen10_is_valid_mux_addr;
-                       dev_priv->perf.ops.is_valid_flex_reg =
+                       perf->ops.is_valid_flex_reg =
                                gen8_is_valid_flex_addr;
 
-                       dev_priv->perf.ops.enable_metric_set = gen8_enable_metric_set;
-                       dev_priv->perf.ops.disable_metric_set = gen10_disable_metric_set;
+                       perf->ops.oa_enable = gen8_oa_enable;
+                       perf->ops.oa_disable = gen8_oa_disable;
+                       perf->ops.enable_metric_set = gen8_enable_metric_set;
+                       perf->ops.disable_metric_set = gen10_disable_metric_set;
+                       perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
 
-                       if (IS_GEN(dev_priv, 10)) {
-                               dev_priv->perf.ctx_oactxctrl_offset = 0x128;
-                               dev_priv->perf.ctx_flexeu0_offset = 0x3de;
+                       if (IS_GEN(i915, 10)) {
+                               perf->ctx_oactxctrl_offset = 0x128;
+                               perf->ctx_flexeu0_offset = 0x3de;
                        } else {
-                               dev_priv->perf.ctx_oactxctrl_offset = 0x124;
-                               dev_priv->perf.ctx_flexeu0_offset = 0x78e;
+                               perf->ctx_oactxctrl_offset = 0x124;
+                               perf->ctx_flexeu0_offset = 0x78e;
                        }
-                       dev_priv->perf.gen8_valid_ctx_bit = BIT(16);
+                       perf->gen8_valid_ctx_bit = BIT(16);
+               } else if (IS_GEN(i915, 12)) {
+                       perf->oa_formats = gen12_oa_formats;
+
+                       perf->ops.is_valid_b_counter_reg =
+                               gen12_is_valid_b_counter_addr;
+                       perf->ops.is_valid_mux_reg =
+                               gen12_is_valid_mux_addr;
+                       perf->ops.is_valid_flex_reg =
+                               gen8_is_valid_flex_addr;
+
+                       perf->ops.oa_enable = gen12_oa_enable;
+                       perf->ops.oa_disable = gen12_oa_disable;
+                       perf->ops.enable_metric_set = gen12_enable_metric_set;
+                       perf->ops.disable_metric_set = gen12_disable_metric_set;
+                       perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
+
+                       perf->ctx_flexeu0_offset = 0;
+                       perf->ctx_oactxctrl_offset = 0x144;
                }
        }
 
-       if (dev_priv->perf.ops.enable_metric_set) {
-               INIT_LIST_HEAD(&dev_priv->perf.streams);
-               mutex_init(&dev_priv->perf.lock);
+       if (perf->ops.enable_metric_set) {
+               mutex_init(&perf->lock);
 
                oa_sample_rate_hard_limit = 1000 *
-                       (RUNTIME_INFO(dev_priv)->cs_timestamp_frequency_khz / 2);
-               dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
+                       (RUNTIME_INFO(i915)->cs_timestamp_frequency_khz / 2);
+               perf->sysctl_header = register_sysctl_table(dev_root);
 
-               mutex_init(&dev_priv->perf.metrics_lock);
-               idr_init(&dev_priv->perf.metrics_idr);
+               mutex_init(&perf->metrics_lock);
+               idr_init(&perf->metrics_idr);
 
                /* We set up some ratelimit state to potentially throttle any
                 * _NOTES about spurious, invalid OA reports which we don't
@@ -3659,44 +4325,70 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
                 *
                 * Using the same limiting factors as printk_ratelimit()
                 */
-               ratelimit_state_init(&dev_priv->perf.spurious_report_rs,
-                                    5 * HZ, 10);
+               ratelimit_state_init(&perf->spurious_report_rs, 5 * HZ, 10);
                /* Since we use a DRM_NOTE for spurious reports it would be
                 * inconsistent to let __ratelimit() automatically print a
                 * warning for throttling.
                 */
-               ratelimit_set_flags(&dev_priv->perf.spurious_report_rs,
+               ratelimit_set_flags(&perf->spurious_report_rs,
                                    RATELIMIT_MSG_ON_RELEASE);
 
-               dev_priv->perf.initialized = true;
+               atomic64_set(&perf->noa_programming_delay,
+                            500 * 1000 /* 500us */);
+
+               perf->i915 = i915;
        }
 }
 
+/*
+ * idr_for_each() callback used at fini time: drops the perf core's
+ * reference on each user-registered OA config so it can be freed.
+ */
static int destroy_config(int id, void *p, void *data)
{
-       struct drm_i915_private *dev_priv = data;
-       struct i915_oa_config *oa_config = p;
-
-       put_oa_config(dev_priv, oa_config);
-
+       i915_oa_config_put(p);
+       /* @id/@data unused; returning 0 lets idr_for_each() keep iterating. */
        return 0;
 }
 
/**
 * i915_perf_fini - Counterpart to i915_perf_init()
- * @dev_priv: i915 device instance
+ * @i915: i915 device instance
+ *
+ * Releases all the OA configs registered on the metrics idr, unregisters
+ * the perf sysctl entries and clears the OA vfunc table. Safe to call even
+ * if i915_perf_init() never completed (early return below).
 */
-void i915_perf_fini(struct drm_i915_private *dev_priv)
+void i915_perf_fini(struct drm_i915_private *i915)
{
-       if (!dev_priv->perf.initialized)
+       struct i915_perf *perf = &i915->perf;
+
+       /* perf->i915 doubles as the "initialized" flag, set by i915_perf_init(). */
+       if (!perf->i915)
                return;
 
-       idr_for_each(&dev_priv->perf.metrics_idr, destroy_config, dev_priv);
-       idr_destroy(&dev_priv->perf.metrics_idr);
+       /* Drop every remaining user-registered OA config before killing the idr. */
+       idr_for_each(&perf->metrics_idr, destroy_config, perf);
+       idr_destroy(&perf->metrics_idr);
 
-       unregister_sysctl_table(dev_priv->perf.sysctl_header);
+       unregister_sysctl_table(perf->sysctl_header);
 
-       memset(&dev_priv->perf.ops, 0, sizeof(dev_priv->perf.ops));
+       memset(&perf->ops, 0, sizeof(perf->ops));
+       /* Mark the subsystem torn down (see the initialized check above). */
+       perf->i915 = NULL;
+}
 
-       dev_priv->perf.initialized = false;
+/**
+ * i915_perf_ioctl_version - Version of the i915-perf subsystem
+ *
+ * This version number is used by userspace to detect available features.
+ *
+ * Return: the current i915-perf interface version (see the history below).
+ */
+int i915_perf_ioctl_version(void)
+{
+       /*
+        * 1: Initial version
+        *   I915_PERF_IOCTL_ENABLE
+        *   I915_PERF_IOCTL_DISABLE
+        *
+        * 2: Added runtime modification of OA config.
+        *   I915_PERF_IOCTL_CONFIG
+        *
+        * 3: Add DRM_I915_PERF_PROP_HOLD_PREEMPTION parameter to hold
+        *    preemption on a particular context so that performance data is
+        *    accessible from a delta of MI_RPC reports without looking at the
+        *    OA buffer.
+        */
+       return 3;
 }
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftests/i915_perf.c"
+#endif