drm/i915: Use indirect ctx bb to mend CMD_BUF_CCTL
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 6fbad5e..c8014c2 100644
@@ -238,6 +238,112 @@ __execlists_update_reg_state(const struct intel_context *ce,
                             const struct intel_engine_cs *engine,
                             u32 head);
 
+static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
+{
+       if (INTEL_GEN(engine->i915) >= 12)
+               return 0x60;
+       else if (INTEL_GEN(engine->i915) >= 9)
+               return 0x54;
+       else if (engine->class == RENDER_CLASS)
+               return 0x58;
+       else
+               return -1;
+}
+
+static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
+{
+       if (INTEL_GEN(engine->i915) >= 12)
+               return 0x74;
+       else if (INTEL_GEN(engine->i915) >= 9)
+               return 0x68;
+       else if (engine->class == RENDER_CLASS)
+               return 0xd8;
+       else
+               return -1;
+}
+
+static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
+{
+       if (INTEL_GEN(engine->i915) >= 12)
+               return 0x12;
+       else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
+               return 0x18;
+       else
+               return -1;
+}
+
+static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
+{
+       int x;
+
+       x = lrc_ring_wa_bb_per_ctx(engine);
+       if (x < 0)
+               return x;
+
+       return x + 2;
+}
+
+static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
+{
+       int x;
+
+       x = lrc_ring_indirect_ptr(engine);
+       if (x < 0)
+               return x;
+
+       return x + 2;
+}
+
+static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
+{
+       if (engine->class != RENDER_CLASS)
+               return -1;
+
+       if (INTEL_GEN(engine->i915) >= 12)
+               return 0xb6;
+       else if (INTEL_GEN(engine->i915) >= 11)
+               return 0xaa;
+       else
+               return -1;
+}
+
+static u32
+lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
+{
+       switch (INTEL_GEN(engine->i915)) {
+       default:
+               MISSING_CASE(INTEL_GEN(engine->i915));
+               fallthrough;
+       case 12:
+               return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+       case 11:
+               return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+       case 10:
+               return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+       case 9:
+               return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+       case 8:
+               return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+       }
+}
+
+static void
+lrc_ring_setup_indirect_ctx(u32 *regs,
+                           const struct intel_engine_cs *engine,
+                           u32 ctx_bb_ggtt_addr,
+                           u32 size)
+{
+       GEM_BUG_ON(!size);
+       GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
+       GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
+       regs[lrc_ring_indirect_ptr(engine) + 1] =
+               ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
+
+       GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
+       regs[lrc_ring_indirect_offset(engine) + 1] =
+               lrc_ring_indirect_offset_default(engine) << 6;
+}
+
 static u32 intel_context_get_runtime(const struct intel_context *ce)
 {
        /*
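
A note on the convention above: each lrc_ring_*() helper returns the dword index, within the LRC register state, of a register's address slot in its MI_LOAD_REGISTER_IMM pair, so the programmed value always lives one dword later. A minimal sketch with the Gen12 values from the helpers (illustrative only, not part of the patch):

        int bb_per_ctx = lrc_ring_wa_bb_per_ctx(engine);   /* 0x12 on Gen12 */
        int ind_ptr = lrc_ring_indirect_ptr(engine);       /* 0x12 + 2 = 0x14 */
        int ind_off = lrc_ring_indirect_offset(engine);    /* 0x14 + 2 = 0x16 */

        /*
         * regs[ind_ptr] holds the INDIRECT_CTX register offset written by
         * the MI_LRI stream; regs[ind_ptr + 1] is its value slot, hence
         * the "+ 1" wherever these helpers are used.
         */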
@@ -514,7 +620,7 @@ static void set_offsets(u32 *regs,
 #define REG16(x) \
        (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
        (((x) >> 2) & 0x7f)
-#define END(x) 0, (x)
+#define END(total_state_size) 0, (total_state_size)
 {
        const u32 base = engine->mmio_base;
 
@@ -537,7 +643,7 @@ static void set_offsets(u32 *regs,
                if (flags & POSTED)
                        *regs |= MI_LRI_FORCE_POSTED;
                if (INTEL_GEN(engine->i915) >= 11)
-                       *regs |= MI_LRI_CS_MMIO;
+                       *regs |= MI_LRI_LRM_CS_MMIO;
                regs++;
 
                GEM_BUG_ON(!count);
@@ -922,8 +1028,63 @@ static const u8 gen12_rcs_offsets[] = {
        NOP(6),
        LRI(1, 0),
        REG(0x0c8),
+       NOP(3 + 9 + 1),
+
+       LRI(51, POSTED),
+       REG16(0x588),
+       REG16(0x588),
+       REG16(0x588),
+       REG16(0x588),
+       REG16(0x588),
+       REG16(0x588),
+       REG(0x028),
+       REG(0x09c),
+       REG(0x0c0),
+       REG(0x178),
+       REG(0x17c),
+       REG16(0x358),
+       REG(0x170),
+       REG(0x150),
+       REG(0x154),
+       REG(0x158),
+       REG16(0x41c),
+       REG16(0x600),
+       REG16(0x604),
+       REG16(0x608),
+       REG16(0x60c),
+       REG16(0x610),
+       REG16(0x614),
+       REG16(0x618),
+       REG16(0x61c),
+       REG16(0x620),
+       REG16(0x624),
+       REG16(0x628),
+       REG16(0x62c),
+       REG16(0x630),
+       REG16(0x634),
+       REG16(0x638),
+       REG16(0x63c),
+       REG16(0x640),
+       REG16(0x644),
+       REG16(0x648),
+       REG16(0x64c),
+       REG16(0x650),
+       REG16(0x654),
+       REG16(0x658),
+       REG16(0x65c),
+       REG16(0x660),
+       REG16(0x664),
+       REG16(0x668),
+       REG16(0x66c),
+       REG16(0x670),
+       REG16(0x674),
+       REG16(0x678),
+       REG16(0x67c),
+       REG(0x068),
+       REG(0x084),
+       NOP(1),
 
-       END(80)
+       END(192)
 };
 
 #undef END
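
As a worked example of the REG16() encoding used throughout the table above (the matching decoder lives in set_offsets(), shown only in part here): each byte carries seven bits of the dword offset, with BIT(7) acting as a continuation flag.

        u32 b0 = (0x588 >> 9) | BIT(7); /* 0x82: high bits, continuation set */
        u32 b1 = (0x588 >> 2) & 0x7f;   /* 0x62: low seven bits of the dword offset */

        /* The parser reassembles the offset relative to engine->mmio_base: */
        u32 offset = (((b0 & 0x7f) << 7) | b1) << 2;    /* == 0x588 */

Plain REG() fits offsets below 0x200 in a single byte.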
@@ -1102,18 +1263,6 @@ static void intel_engine_context_out(struct intel_engine_cs *engine)
        write_sequnlock_irqrestore(&engine->stats.lock, flags);
 }
 
-static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
-{
-       if (INTEL_GEN(engine->i915) >= 12)
-               return 0x60;
-       else if (INTEL_GEN(engine->i915) >= 9)
-               return 0x54;
-       else if (engine->class == RENDER_CLASS)
-               return 0x58;
-       else
-               return -1;
-}
-
 static void
 execlists_check_context(const struct intel_context *ce,
                        const struct intel_engine_cs *engine)
@@ -1161,7 +1310,7 @@ static void restore_default_state(struct intel_context *ce,
 
        if (engine->pinned_default_state)
                memcpy(regs, /* skip restoring the vanilla PPHWSP */
-                      engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
+                      engine->pinned_default_state + LRC_STATE_OFFSET,
                       engine->context_size - PAGE_SIZE);
 
        execlists_init_reg_state(regs, ce, engine, ce->ring, false);
@@ -2384,13 +2533,6 @@ gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
        return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
 }
 
-static inline void flush_hwsp(const struct i915_request *rq)
-{
-       mb();
-       clflush((void *)READ_ONCE(rq->hwsp_seqno));
-       mb();
-}
-
 static void process_csb(struct intel_engine_cs *engine)
 {
        struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -2498,7 +2640,11 @@ static void process_csb(struct intel_engine_cs *engine)
                         * We rely on the hardware being strongly
                         * ordered, that the breadcrumb write is
                         * coherent (visible from the CPU) before the
-                        * user interrupt and CSB is processed.
+                        * user interrupt is processed. One might assume
+                        * that the breadcrumb write, being ordered before
+                        * both the user interrupt and the CS event for the
+                        * context switch, would therefore be visible before
+                        * the CS event itself...
                         */
                        if (GEM_SHOW_DEBUG() &&
                            !i915_request_completed(*execlists->active)) {
@@ -2506,19 +2652,8 @@ static void process_csb(struct intel_engine_cs *engine)
                                const u32 *regs __maybe_unused =
                                        rq->context->lrc_reg_state;
 
-                               /*
-                                * Flush the breadcrumb before crying foul.
-                                *
-                                * Since we have hit this on icl and seen the
-                                * breadcrumb advance as we print out the debug
-                                * info (so the problem corrected itself without
-                                * lasting damage), and we know that icl suffers
-                                * from missing global observation points in
-                                * execlists, presume that affects even more
-                                * coherency.
-                                */
-                               flush_hwsp(rq);
-
+                               ENGINE_TRACE(engine,
+                                            "context completed before request!\n");
                                ENGINE_TRACE(engine,
                                             "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
                                             ENGINE_READ(engine, RING_START),
@@ -2538,11 +2673,6 @@ static void process_csb(struct intel_engine_cs *engine)
                                             regs[CTX_RING_START],
                                             regs[CTX_RING_HEAD],
                                             regs[CTX_RING_TAIL]);
-
-                               /* Still? Declare it caput! */
-                               if (!i915_request_completed(rq) &&
-                                   !reset_in_progress(execlists))
-                                       GEM_BUG_ON("context completed before request");
                        }
 
                        execlists_schedule_out(*execlists->active++);
@@ -3136,12 +3266,132 @@ check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
 
 static void execlists_context_unpin(struct intel_context *ce)
 {
-       check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
+       check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
                      ce->engine);
 
        i915_gem_object_unpin_map(ce->state->obj);
 }
 
+static u32 *
+gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
+{
+       *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
+               MI_SRM_LRM_GLOBAL_GTT |
+               MI_LRI_LRM_CS_MMIO;
+       *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
+       *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
+               CTX_TIMESTAMP * sizeof(u32);
+       *cs++ = 0;
+
+       *cs++ = MI_LOAD_REGISTER_REG |
+               MI_LRR_SOURCE_CS_MMIO |
+               MI_LRI_LRM_CS_MMIO;
+       *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
+       *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
+
+       *cs++ = MI_LOAD_REGISTER_REG |
+               MI_LRR_SOURCE_CS_MMIO |
+               MI_LRI_LRM_CS_MMIO;
+       *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
+       *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
+
+       return cs;
+}
+
+static u32 *
+gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
+{
+       GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
+
+       *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
+               MI_SRM_LRM_GLOBAL_GTT |
+               MI_LRI_LRM_CS_MMIO;
+       *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
+       *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
+               (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
+       *cs++ = 0;
+
+       return cs;
+}
+
+static u32 *
+gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
+{
+       GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
+
+       *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
+               MI_SRM_LRM_GLOBAL_GTT |
+               MI_LRI_LRM_CS_MMIO;
+       *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
+       *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
+               (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
+       *cs++ = 0;
+
+       *cs++ = MI_LOAD_REGISTER_REG |
+               MI_LRR_SOURCE_CS_MMIO |
+               MI_LRI_LRM_CS_MMIO;
+       *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
+       *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
+
+       return cs;
+}
+
+static u32 *
+gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
+{
+       cs = gen12_emit_timestamp_wa(ce, cs);
+       cs = gen12_emit_cmd_buf_wa(ce, cs);
+       cs = gen12_emit_restore_scratch(ce, cs);
+
+       return cs;
+}
+
+static u32 *
+gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
+{
+       cs = gen12_emit_timestamp_wa(ce, cs);
+       cs = gen12_emit_restore_scratch(ce, cs);
+
+       return cs;
+}
+
+static inline u32 context_wa_bb_offset(const struct intel_context *ce)
+{
+       return PAGE_SIZE * ce->wa_bb_page;
+}
+
+static u32 *context_indirect_bb(const struct intel_context *ce)
+{
+       void *ptr;
+
+       GEM_BUG_ON(!ce->wa_bb_page);
+
+       ptr = ce->lrc_reg_state;
+       ptr -= LRC_STATE_OFFSET; /* back to start of context image */
+       ptr += context_wa_bb_offset(ce);
+
+       return ptr;
+}
+
+static void
+setup_indirect_ctx_bb(const struct intel_context *ce,
+                     const struct intel_engine_cs *engine,
+                     u32 *(*emit)(const struct intel_context *, u32 *))
+{
+       u32 * const start = context_indirect_bb(ce);
+       u32 *cs;
+
+       cs = emit(ce, start);
+       GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
+       while ((unsigned long)cs % CACHELINE_BYTES)
+               *cs++ = MI_NOOP;
+
+       lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
+                                   i915_ggtt_offset(ce->state) +
+                                   context_wa_bb_offset(ce),
+                                   (cs - start) * sizeof(*cs));
+}
+
 static void
 __execlists_update_reg_state(const struct intel_context *ce,
                             const struct intel_engine_cs *engine,
@@ -3165,6 +3415,18 @@ __execlists_update_reg_state(const struct intel_context *ce,
 
                i915_oa_init_reg_state(ce, engine);
        }
+
+       if (ce->wa_bb_page) {
+               u32 *(*fn)(const struct intel_context *ce, u32 *cs);
+
+               fn = gen12_emit_indirect_ctx_xcs;
+               if (ce->engine->class == RENDER_CLASS)
+                       fn = gen12_emit_indirect_ctx_rcs;
+
+               /* Mutually exclusive wrt the global indirect bb */
+               GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
+               setup_indirect_ctx_bb(ce, engine, fn);
+       }
 }
 
 static int
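
To make the INDIRECT_CTX pointer encoding concrete: the wa bb address handed to lrc_ring_setup_indirect_ctx() is page aligned and the batch is padded with MI_NOOP up to a cacheline, so the low six bits of the pointer dword are free to carry the length in cachelines. A sketch with assumed values (the GGTT address and emitted size are made up for illustration):

        u32 bb_ggtt = 0x00fed000;       /* assumed GGTT address of the wa bb page */
        u32 size = 23 * sizeof(u32);    /* assume emit() produced 92 bytes */

        size = ALIGN(size, CACHELINE_BYTES);            /* padded to 128 (CACHELINE_BYTES is 64) */
        u32 ptr = bb_ggtt | (size / CACHELINE_BYTES);   /* 0x00fed002 */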
@@ -3183,7 +3445,7 @@ __execlists_context_pin(struct intel_context *ce,
                return PTR_ERR(vaddr);
 
        ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
-       ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
+       ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
        __execlists_update_reg_state(ce, engine, ce->ring->tail);
 
        return 0;
@@ -3615,6 +3877,62 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
        return ret;
 }
 
+static void reset_csb_pointers(struct intel_engine_cs *engine)
+{
+       struct intel_engine_execlists * const execlists = &engine->execlists;
+       const unsigned int reset_value = execlists->csb_size - 1;
+
+       ring_set_paused(engine, 0);
+
+       /*
+        * After a reset, the HW starts writing into CSB entry [0]. We
+        * therefore have to set our HEAD pointer back one entry so that
+        * the *first* entry we check is entry 0. To complicate this further,
+        * as we don't wait for the first interrupt after reset, we have to
+        * fake the HW write to point back to the last entry so that our
+        * inline comparison of our cached head position against the last HW
+        * write works even before the first interrupt.
+        */
+       execlists->csb_head = reset_value;
+       WRITE_ONCE(*execlists->csb_write, reset_value);
+       wmb(); /* Make sure this is visible to HW (paranoia?) */
+
+       /*
+        * Sometimes Icelake forgets to reset its pointers on a GPU reset.
+        * Bludgeon them with a mmio update to be sure.
+        */
+       ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
+                    reset_value << 8 | reset_value);
+       ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
+
+       invalidate_csb_entries(&execlists->csb_status[0],
+                              &execlists->csb_status[reset_value]);
+}
+
+static void execlists_sanitize(struct intel_engine_cs *engine)
+{
+       /*
+        * Poison residual state on resume, in case the suspend didn't!
+        *
+        * We have to assume that across suspend/resume (or other loss
+        * of control) the contents of our pinned buffers have been
+        * lost, replaced by garbage. Since this doesn't always happen,
+        * let's poison such state so that we more quickly spot when
+        * we falsely assume it has been preserved.
+        */
+       if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
+               memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
+
+       reset_csb_pointers(engine);
+
+       /*
+        * The kernel_context HWSP is stored in the status_page. As above,
+        * that may be lost on resume/initialisation, and so we need to
+        * reset the value in the HWSP.
+        */
+       intel_timeline_reset_seqno(engine->kernel_context->timeline);
+}
+
 static void enable_error_interrupt(struct intel_engine_cs *engine)
 {
        u32 status;
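
For context, the consumer this reset primes is process_csb(); condensed to just its pointer handling (barriers and event decode omitted), it behaves roughly as:

        u8 head = execlists->csb_head;                  /* csb_size - 1 after reset */
        u8 tail = READ_ONCE(*execlists->csb_write);     /* also csb_size - 1, so idle */

        while (head != tail) {
                if (++head == execlists->csb_size)
                        head = 0;       /* the first real entry consumed is [0] */
                /* ... decode execlists->csb_status[head] ... */
        }
        execlists->csb_head = head;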
@@ -3754,38 +4072,6 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
        intel_engine_stop_cs(engine);
 }
 
-static void reset_csb_pointers(struct intel_engine_cs *engine)
-{
-       struct intel_engine_execlists * const execlists = &engine->execlists;
-       const unsigned int reset_value = execlists->csb_size - 1;
-
-       ring_set_paused(engine, 0);
-
-       /*
-        * After a reset, the HW starts writing into CSB entry [0]. We
-        * therefore have to set our HEAD pointer back one entry so that
-        * the *first* entry we check is entry 0. To complicate this further,
-        * as we don't wait for the first interrupt after reset, we have to
-        * fake the HW write to point back to the last entry so that our
-        * inline comparison of our cached head position against the last HW
-        * write works even before the first interrupt.
-        */
-       execlists->csb_head = reset_value;
-       WRITE_ONCE(*execlists->csb_write, reset_value);
-       wmb(); /* Make sure this is visible to HW (paranoia?) */
-
-       /*
-        * Sometimes Icelake forgets to reset its pointers on a GPU reset.
-        * Bludgeon them with a mmio update to be sure.
-        */
-       ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
-                    reset_value << 8 | reset_value);
-       ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
-
-       invalidate_csb_entries(&execlists->csb_status[0],
-                              &execlists->csb_status[reset_value]);
-}
-
 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
 {
        int x;
@@ -4534,6 +4820,8 @@ static void execlists_shutdown(struct intel_engine_cs *engine)
 
 static void execlists_release(struct intel_engine_cs *engine)
 {
+       engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
+
        execlists_shutdown(engine);
 
        intel_engine_cleanup_common(engine);
@@ -4659,48 +4947,13 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine)
        else
                execlists->csb_size = GEN11_CSB_ENTRIES;
 
-       reset_csb_pointers(engine);
-
        /* Finally, take ownership and responsibility for cleanup! */
+       engine->sanitize = execlists_sanitize;
        engine->release = execlists_release;
 
        return 0;
 }
 
-static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
-{
-       u32 indirect_ctx_offset;
-
-       switch (INTEL_GEN(engine->i915)) {
-       default:
-               MISSING_CASE(INTEL_GEN(engine->i915));
-               /* fall through */
-       case 12:
-               indirect_ctx_offset =
-                       GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-               break;
-       case 11:
-               indirect_ctx_offset =
-                       GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-               break;
-       case 10:
-               indirect_ctx_offset =
-                       GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-               break;
-       case 9:
-               indirect_ctx_offset =
-                       GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-               break;
-       case 8:
-               indirect_ctx_offset =
-                       GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-               break;
-       }
-
-       return indirect_ctx_offset;
-}
-
-
 static void init_common_reg_state(u32 * const regs,
                                  const struct intel_engine_cs *engine,
                                  const struct intel_ring *ring,
@@ -4722,27 +4975,23 @@ static void init_common_reg_state(u32 * const regs,
 }
 
 static void init_wa_bb_reg_state(u32 * const regs,
-                                const struct intel_engine_cs *engine,
-                                u32 pos_bb_per_ctx)
+                                const struct intel_engine_cs *engine)
 {
        const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
 
        if (wa_ctx->per_ctx.size) {
                const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
 
-               regs[pos_bb_per_ctx] =
+               GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
+               regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
                        (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
        }
 
        if (wa_ctx->indirect_ctx.size) {
-               const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
-
-               regs[pos_bb_per_ctx + 2] =
-                       (ggtt_offset + wa_ctx->indirect_ctx.offset) |
-                       (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
-
-               regs[pos_bb_per_ctx + 4] =
-                       intel_lr_indirect_ctx_offset(engine) << 6;
+               lrc_ring_setup_indirect_ctx(regs, engine,
+                                           i915_ggtt_offset(wa_ctx->vma) +
+                                           wa_ctx->indirect_ctx.offset,
+                                           wa_ctx->indirect_ctx.size);
        }
 }
 
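For the per-context pointer above, bit 0 presumably acts as an enable flag rather than part of the address (the per-ctx batch is cacheline aligned within the vma, so the low bits are otherwise clear); a sketch using the names from init_wa_bb_reg_state():

        u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);                /* page aligned */
        u32 ptr = (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;        /* bit 0: enable */
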
@@ -4791,10 +5040,7 @@ static void execlists_init_reg_state(u32 *regs,
        init_common_reg_state(regs, engine, ring, inhibit);
        init_ppgtt_reg_state(regs, vm_alias(ce->vm));
 
-       init_wa_bb_reg_state(regs, engine,
-                            INTEL_GEN(engine->i915) >= 12 ?
-                            GEN12_CTX_BB_PER_CTX_PTR :
-                            CTX_BB_PER_CTX_PTR);
+       init_wa_bb_reg_state(regs, engine);
 
        __reset_stop_ring(regs, engine);
 }
@@ -4842,7 +5088,7 @@ populate_lr_context(struct intel_context *ce,
         * The second page of the context object contains some registers which
         * must be set up prior to the first execution.
         */
-       execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
+       execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
                                 ce, engine, ring, inhibit);
 
        ret = 0;
@@ -4867,6 +5113,11 @@ static int __execlists_context_alloc(struct intel_context *ce,
        if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
                context_size += I915_GTT_PAGE_SIZE; /* for redzone */
 
+       if (INTEL_GEN(engine->i915) == 12) {
+               ce->wa_bb_page = context_size / PAGE_SIZE;
+               context_size += PAGE_SIZE;
+       }
+
        ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
        if (IS_ERR(ctx_obj))
                return PTR_ERR(ctx_obj);
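
The resulting Gen12 context object layout, sketched from the code above (the debug redzone page, when enabled, lands between the state pages and the wa bb page because it is added to context_size first):

        /*
         * page 0:      per-process HWSP (PPHWSP)
         * page 1...:   logical ring context registers
         *              (ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET)
         * [+1 page]:   redzone, CONFIG_DRM_I915_DEBUG_GEM only
         * last page:   per-context indirect wa bb (ce->wa_bb_page)
         */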
@@ -5086,12 +5337,15 @@ static void virtual_submission_tasklet(unsigned long data)
                return;
 
        local_irq_disable();
-       for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
-               struct intel_engine_cs *sibling = ve->siblings[n];
+       for (n = 0; n < ve->num_siblings; n++) {
+               struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
                struct ve_node * const node = &ve->nodes[sibling->id];
                struct rb_node **parent, *rb;
                bool first;
 
+               if (!READ_ONCE(ve->request))
+                       break; /* already handled by a sibling's tasklet */
+
                if (unlikely(!(mask & sibling->mask))) {
                        if (!RB_EMPTY_NODE(&node->rb)) {
                                spin_lock(&sibling->active.lock);