drm/i915: Use indirect ctx bb to mend CMD_BUF_CCTL
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 31455ec..c8014c2 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
 
 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
-#define WA_TAIL_DWORDS 2
-#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 
 struct virtual_engine {
        struct intel_engine_cs base;
@@ -240,6 +238,123 @@ __execlists_update_reg_state(const struct intel_context *ce,
                             const struct intel_engine_cs *engine,
                             u32 head);
 
+static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
+{
+       if (INTEL_GEN(engine->i915) >= 12)
+               return 0x60;
+       else if (INTEL_GEN(engine->i915) >= 9)
+               return 0x54;
+       else if (engine->class == RENDER_CLASS)
+               return 0x58;
+       else
+               return -1;
+}
+
+static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
+{
+       if (INTEL_GEN(engine->i915) >= 12)
+               return 0x74;
+       else if (INTEL_GEN(engine->i915) >= 9)
+               return 0x68;
+       else if (engine->class == RENDER_CLASS)
+               return 0xd8;
+       else
+               return -1;
+}
+
+static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
+{
+       if (INTEL_GEN(engine->i915) >= 12)
+               return 0x12;
+       else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
+               return 0x18;
+       else
+               return -1;
+}
+
+static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
+{
+       int x;
+
+       x = lrc_ring_wa_bb_per_ctx(engine);
+       if (x < 0)
+               return x;
+
+       return x + 2;
+}
+
+static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
+{
+       int x;
+
+       x = lrc_ring_indirect_ptr(engine);
+       if (x < 0)
+               return x;
+
+       return x + 2;
+}
+
+static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
+{
+       if (engine->class != RENDER_CLASS)
+               return -1;
+
+       if (INTEL_GEN(engine->i915) >= 12)
+               return 0xb6;
+       else if (INTEL_GEN(engine->i915) >= 11)
+               return 0xaa;
+       else
+               return -1;
+}
+
+static u32
+lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
+{
+       switch (INTEL_GEN(engine->i915)) {
+       default:
+               MISSING_CASE(INTEL_GEN(engine->i915));
+               fallthrough;
+       case 12:
+               return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+       case 11:
+               return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+       case 10:
+               return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+       case 9:
+               return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+       case 8:
+               return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+       }
+}
+
+static void
+lrc_ring_setup_indirect_ctx(u32 *regs,
+                           const struct intel_engine_cs *engine,
+                           u32 ctx_bb_ggtt_addr,
+                           u32 size)
+{
+       GEM_BUG_ON(!size);
+       GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
+       GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
+       regs[lrc_ring_indirect_ptr(engine) + 1] =
+               ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
+
+       GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
+       regs[lrc_ring_indirect_offset(engine) + 1] =
+               lrc_ring_indirect_offset_default(engine) << 6;
+}
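
A worked example of the encoding above (illustrative numbers, not taken from this patch), assuming the usual CACHELINE_BYTES of 64: for a 256-byte context bb placed at GGTT address 0x0000f000,

    /*
     * regs[lrc_ring_indirect_ptr(engine) + 1]    = 0x0000f000 | (256 / 64)
     *                                            = 0x0000f004
     * regs[lrc_ring_indirect_offset(engine) + 1] = default_offset << 6
     *
     * i.e. bits [5:0] carry the batch length in cachelines and the upper
     * bits carry the cacheline-aligned GGTT address of the batch.
     */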
+
+static u32 intel_context_get_runtime(const struct intel_context *ce)
+{
+       /*
+        * We can use either ppHWSP[16] which is recorded before the context
+        * switch (and so excludes the cost of context switches) or use the
+        * value from the context image itself, which is saved/restored earlier
+        * and so includes the cost of the save.
+        */
+       return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
+}
+
 static void mark_eio(struct i915_request *rq)
 {
        if (i915_request_completed(rq))
@@ -247,7 +362,7 @@ static void mark_eio(struct i915_request *rq)
 
        GEM_BUG_ON(i915_request_signaled(rq));
 
-       dma_fence_set_error(&rq->fence, -EIO);
+       i915_request_set_error_once(rq, -EIO);
        i915_request_mark_complete(rq);
 }
 
@@ -295,7 +410,7 @@ static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 
 static inline int rq_prio(const struct i915_request *rq)
 {
-       return rq->sched.attr.priority;
+       return READ_ONCE(rq->sched.attr.priority);
 }
 
 static int effective_prio(const struct i915_request *rq)
@@ -505,7 +620,7 @@ static void set_offsets(u32 *regs,
 #define REG16(x) \
        (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
        (((x) >> 2) & 0x7f)
-#define END(x) 0, (x)
+#define END(total_state_size) 0, (total_state_size)
 {
        const u32 base = engine->mmio_base;
 
@@ -528,7 +643,7 @@ static void set_offsets(u32 *regs,
                if (flags & POSTED)
                        *regs |= MI_LRI_FORCE_POSTED;
                if (INTEL_GEN(engine->i915) >= 11)
-                       *regs |= MI_LRI_CS_MMIO;
+                       *regs |= MI_LRI_LRM_CS_MMIO;
                regs++;
 
                GEM_BUG_ON(!count);
@@ -913,8 +1028,63 @@ static const u8 gen12_rcs_offsets[] = {
        NOP(6),
        LRI(1, 0),
        REG(0x0c8),
+       NOP(3 + 9 + 1),
+
+       LRI(51, POSTED),
+       REG16(0x588),
+       REG16(0x588),
+       REG16(0x588),
+       REG16(0x588),
+       REG16(0x588),
+       REG16(0x588),
+       REG(0x028),
+       REG(0x09c),
+       REG(0x0c0),
+       REG(0x178),
+       REG(0x17c),
+       REG16(0x358),
+       REG(0x170),
+       REG(0x150),
+       REG(0x154),
+       REG(0x158),
+       REG16(0x41c),
+       REG16(0x600),
+       REG16(0x604),
+       REG16(0x608),
+       REG16(0x60c),
+       REG16(0x610),
+       REG16(0x614),
+       REG16(0x618),
+       REG16(0x61c),
+       REG16(0x620),
+       REG16(0x624),
+       REG16(0x628),
+       REG16(0x62c),
+       REG16(0x630),
+       REG16(0x634),
+       REG16(0x638),
+       REG16(0x63c),
+       REG16(0x640),
+       REG16(0x644),
+       REG16(0x648),
+       REG16(0x64c),
+       REG16(0x650),
+       REG16(0x654),
+       REG16(0x658),
+       REG16(0x65c),
+       REG16(0x660),
+       REG16(0x664),
+       REG16(0x668),
+       REG16(0x66c),
+       REG16(0x670),
+       REG16(0x674),
+       REG16(0x678),
+       REG16(0x67c),
+       REG(0x068),
+       REG(0x084),
+       NOP(1),
 
-       END(80)
+       END(192)
 };
 
 #undef END
@@ -1006,7 +1176,7 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
                                i915_request_cancel_breadcrumb(rq);
                                spin_unlock(&rq->lock);
                        }
-                       rq->engine = owner;
+                       WRITE_ONCE(rq->engine, owner);
                        owner->submit_request(rq);
                        active = NULL;
                }
@@ -1093,18 +1263,6 @@ static void intel_engine_context_out(struct intel_engine_cs *engine)
        write_sequnlock_irqrestore(&engine->stats.lock, flags);
 }
 
-static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
-{
-       if (INTEL_GEN(engine->i915) >= 12)
-               return 0x60;
-       else if (INTEL_GEN(engine->i915) >= 9)
-               return 0x54;
-       else if (engine->class == RENDER_CLASS)
-               return 0x58;
-       else
-               return -1;
-}
-
 static void
 execlists_check_context(const struct intel_context *ce,
                        const struct intel_engine_cs *engine)
@@ -1152,10 +1310,11 @@ static void restore_default_state(struct intel_context *ce,
 
        if (engine->pinned_default_state)
                memcpy(regs, /* skip restoring the vanilla PPHWSP */
-                      engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
+                      engine->pinned_default_state + LRC_STATE_OFFSET,
                       engine->context_size - PAGE_SIZE);
 
        execlists_init_reg_state(regs, ce, engine, ce->ring, false);
+       ce->runtime.last = intel_context_get_runtime(ce);
 }
 
 static void reset_active(struct i915_request *rq,
@@ -1197,6 +1356,37 @@ static void reset_active(struct i915_request *rq,
        ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
 }
 
+static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
+{
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+       ce->runtime.num_underflow += dt < 0;
+       ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
+#endif
+}
+
+static void intel_context_update_runtime(struct intel_context *ce)
+{
+       u32 old;
+       s32 dt;
+
+       if (intel_context_is_barrier(ce))
+               return;
+
+       old = ce->runtime.last;
+       ce->runtime.last = intel_context_get_runtime(ce);
+       dt = ce->runtime.last - old;
+
+       if (unlikely(dt <= 0)) {
+               CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
+                        old, ce->runtime.last, dt);
+               st_update_runtime_underflow(ce, dt);
+               return;
+       }
+
+       ewma_runtime_add(&ce->runtime.avg, dt);
+       ce->runtime.total += dt;
+}
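
A consumer of these counters would read them back along the lines of the sketch below. Note that both values are in CS timestamp ticks, not nanoseconds, so reporting code is expected to scale by the command streamer timestamp frequency (not shown in this patch):

    /* sketch only: ewma_runtime_read() is generated by DECLARE_EWMA(runtime, ...) */
    u64 total_ticks = READ_ONCE(ce->runtime.total);
    unsigned long avg_ticks = ewma_runtime_read(&ce->runtime.avg);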
+
 static inline struct intel_engine_cs *
 __execlists_schedule_in(struct i915_request *rq)
 {
@@ -1211,12 +1401,12 @@ __execlists_schedule_in(struct i915_request *rq)
        if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
                execlists_check_context(ce, engine);
 
+       ce->lrc_desc &= ~GENMASK_ULL(47, 37);
        if (ce->tag) {
                /* Use a fixed tag for OA and friends */
                ce->lrc_desc |= (u64)ce->tag << 32;
        } else {
                /* We don't need a strict matching tag, just different values */
-               ce->lrc_desc &= ~GENMASK_ULL(47, 37);
                ce->lrc_desc |=
                        (u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
                        GEN11_SW_CTX_ID_SHIFT;
@@ -1276,10 +1466,11 @@ __execlists_schedule_out(struct i915_request *rq,
         * If we have just completed this context, the engine may now be
         * idle and we want to re-enter powersaving.
         */
-       if (list_is_last(&rq->link, &ce->timeline->requests) &&
+       if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
            i915_request_completed(rq))
                intel_engine_add_retire(engine, ce->timeline);
 
+       intel_context_update_runtime(ce);
        intel_engine_context_out(engine);
        execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
        intel_gt_pm_put_async(engine->gt);
@@ -1374,6 +1565,23 @@ static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc
        }
 }
 
+static __maybe_unused char *
+dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
+{
+       if (!rq)
+               return "";
+
+       snprintf(buf, buflen, "%s%llx:%lld%s prio %d",
+                prefix,
+                rq->fence.context, rq->fence.seqno,
+                i915_request_completed(rq) ? "!" :
+                i915_request_started(rq) ? "*" :
+                "",
+                rq_prio(rq));
+
+       return buf;
+}
+
 static __maybe_unused void
 trace_ports(const struct intel_engine_execlists *execlists,
            const char *msg,
@@ -1381,18 +1589,20 @@ trace_ports(const struct intel_engine_execlists *execlists,
 {
        const struct intel_engine_cs *engine =
                container_of(execlists, typeof(*engine), execlists);
+       char __maybe_unused p0[40], p1[40];
 
        if (!ports[0])
                return;
 
-       ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
-                    ports[0]->fence.context,
-                    ports[0]->fence.seqno,
-                    i915_request_completed(ports[0]) ? "!" :
-                    i915_request_started(ports[0]) ? "*" :
-                    "",
-                    ports[1] ? ports[1]->fence.context : 0,
-                    ports[1] ? ports[1]->fence.seqno : 0);
+       ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
+                    dump_port(p0, sizeof(p0), "", ports[0]),
+                    dump_port(p1, sizeof(p1), ", ", ports[1]));
+}
+
+static inline bool
+reset_in_progress(const struct intel_engine_execlists *execlists)
+{
+       return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
 }
 
 static __maybe_unused bool
@@ -1401,9 +1611,14 @@ assert_pending_valid(const struct intel_engine_execlists *execlists,
 {
        struct i915_request * const *port, *rq;
        struct intel_context *ce = NULL;
+       bool sentinel = false;
 
        trace_ports(execlists, msg, execlists->pending);
 
+       /* We may be messing around with the lists during reset, lalala */
+       if (reset_in_progress(execlists))
+               return true;
+
        if (!execlists->pending[0]) {
                GEM_TRACE_ERR("Nothing pending for promotion!\n");
                return false;
@@ -1430,6 +1645,26 @@ assert_pending_valid(const struct intel_engine_execlists *execlists,
                }
                ce = rq->context;
 
+               /*
+                * Sentinels are supposed to be lonely so they flush the
+                * current execution off the HW. Check that they are the
+                * only request in the pending submission.
+                */
+               if (sentinel) {
+                       GEM_TRACE_ERR("context:%llx after sentinel in pending[%zd]\n",
+                                     ce->timeline->fence_context,
+                                     port - execlists->pending);
+                       return false;
+               }
+
+               sentinel = i915_request_has_sentinel(rq);
+               if (sentinel && port != execlists->pending) {
+                       GEM_TRACE_ERR("sentinel context:%llx not in prime position[%zd]\n",
+                                     ce->timeline->fence_context,
+                                     port - execlists->pending);
+                       return false;
+               }
+
                /* Hold tightly onto the lock to prevent concurrent retires! */
                if (!spin_trylock_irqsave(&rq->lock, flags))
                        continue;
@@ -1525,6 +1760,11 @@ static bool can_merge_ctx(const struct intel_context *prev,
        return true;
 }
 
+static unsigned long i915_request_flags(const struct i915_request *rq)
+{
+       return READ_ONCE(rq->fence.flags);
+}
+
 static bool can_merge_rq(const struct i915_request *prev,
                         const struct i915_request *next)
 {
@@ -1542,7 +1782,7 @@ static bool can_merge_rq(const struct i915_request *prev,
        if (i915_request_completed(next))
                return true;
 
-       if (unlikely((prev->fence.flags ^ next->fence.flags) &
+       if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
                     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
                      BIT(I915_FENCE_FLAG_SENTINEL))))
                return false;
@@ -1550,6 +1790,7 @@ static bool can_merge_rq(const struct i915_request *prev,
        if (!can_merge_ctx(prev->context, next->context))
                return false;
 
+       GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
        return true;
 }
 
@@ -1585,7 +1826,7 @@ static bool virtual_matches(const struct virtual_engine *ve,
 }
 
 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
-                                    struct intel_engine_cs *engine)
+                                    struct i915_request *rq)
 {
        struct intel_engine_cs *old = ve->siblings[0];
 
@@ -1593,9 +1834,19 @@ static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
 
        spin_lock(&old->breadcrumbs.irq_lock);
        if (!list_empty(&ve->context.signal_link)) {
-               list_move_tail(&ve->context.signal_link,
-                              &engine->breadcrumbs.signalers);
-               intel_engine_signal_breadcrumbs(engine);
+               list_del_init(&ve->context.signal_link);
+
+               /*
+                * We cannot acquire the new engine->breadcrumbs.irq_lock
+                * (as we are holding a breadcrumbs.irq_lock already),
+                * so attach this request to the signaler on submission.
+                * The queued irq_work will occur when we finally drop
+                * the engine->active.lock after dequeue.
+                */
+               set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags);
+
+               /* Also transfer the pending irq_work for the old breadcrumb. */
+               intel_engine_signal_breadcrumbs(rq->engine);
        }
        spin_unlock(&old->breadcrumbs.irq_lock);
 }
@@ -1605,6 +1856,11 @@ static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
                                     &(rq__)->sched.waiters_list, \
                                     wait_link)
 
+#define for_each_signaler(p__, rq__) \
+       list_for_each_entry_rcu(p__, \
+                               &(rq__)->sched.signalers_list, \
+                               signal_link)
+
 static void defer_request(struct i915_request *rq, struct list_head * const pl)
 {
        LIST_HEAD(list);
@@ -1661,7 +1917,8 @@ static void defer_active(struct intel_engine_cs *engine)
 }
 
 static bool
-need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
+need_timeslice(const struct intel_engine_cs *engine,
+              const struct i915_request *rq)
 {
        int hint;
 
@@ -1675,6 +1932,32 @@ need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
        return hint >= effective_prio(rq);
 }
 
+static bool
+timeslice_yield(const struct intel_engine_execlists *el,
+               const struct i915_request *rq)
+{
+       /*
+        * Once bitten, forever smitten!
+        *
+        * If the active context ever busy-waited on a semaphore,
+        * it will be treated as a hog until the end of its timeslice (i.e.
+        * until it is scheduled out and replaced by a new submission,
+        * possibly even its own lite-restore). The HW only sends an interrupt
+        * on the first miss, and we do not know if that semaphore has been
+        * signaled, or even if it is now stuck on another semaphore. Play
+        * safe, yield if it might be stuck -- it will be given a fresh
+        * timeslice in the near future.
+        */
+       return upper_32_bits(rq->context->lrc_desc) == READ_ONCE(el->yield);
+}
+
+static bool
+timeslice_expired(const struct intel_engine_execlists *el,
+                 const struct i915_request *rq)
+{
+       return timer_expired(&el->timer) || timeslice_yield(el, rq);
+}
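
For context, nothing in this file writes el->yield other than the reset to -1 on submit later in this patch; it is assumed to be latched by the CS interrupt handler when the wait-on-semaphore interrupt fires (GT_WAIT_SEMAPHORE_INTERRUPT is added to irq_keep_mask below), roughly:

    /* assumed producer side, in the CS irq handler -- sketch only */
    if (iir & GT_WAIT_SEMAPHORE_INTERRUPT) {
            WRITE_ONCE(engine->execlists.yield,
                       ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI));
            if (del_timer(&engine->execlists.timer))
                    tasklet_hi_schedule(&engine->execlists.tasklet);
    }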
+
 static int
 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
 {
@@ -1690,15 +1973,15 @@ timeslice(const struct intel_engine_cs *engine)
        return READ_ONCE(engine->props.timeslice_duration_ms);
 }
 
-static unsigned long
-active_timeslice(const struct intel_engine_cs *engine)
+static unsigned long active_timeslice(const struct intel_engine_cs *engine)
 {
-       const struct i915_request *rq = *engine->execlists.active;
+       const struct intel_engine_execlists *execlists = &engine->execlists;
+       const struct i915_request *rq = *execlists->active;
 
        if (!rq || i915_request_completed(rq))
                return 0;
 
-       if (engine->execlists.switch_priority_hint < effective_prio(rq))
+       if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
                return 0;
 
        return timeslice(engine);
@@ -1706,22 +1989,39 @@ active_timeslice(const struct intel_engine_cs *engine)
 
 static void set_timeslice(struct intel_engine_cs *engine)
 {
+       unsigned long duration;
+
        if (!intel_engine_has_timeslices(engine))
                return;
 
-       set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
+       duration = active_timeslice(engine);
+       ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
+
+       set_timer_ms(&engine->execlists.timer, duration);
 }
 
 static void start_timeslice(struct intel_engine_cs *engine)
 {
        struct intel_engine_execlists *execlists = &engine->execlists;
+       const int prio = queue_prio(execlists);
+       unsigned long duration;
 
-       execlists->switch_priority_hint = execlists->queue_priority_hint;
+       if (!intel_engine_has_timeslices(engine))
+               return;
+
+       WRITE_ONCE(execlists->switch_priority_hint, prio);
+       if (prio == INT_MIN)
+               return;
 
        if (timer_pending(&execlists->timer))
                return;
 
-       set_timer_ms(&execlists->timer, timeslice(engine));
+       duration = timeslice(engine);
+       ENGINE_TRACE(engine,
+                    "start timeslicing, prio:%d, interval:%lu",
+                    prio, duration);
+
+       set_timer_ms(&execlists->timer, duration);
 }
 
 static void record_preemption(struct intel_engine_execlists *execlists)
@@ -1818,11 +2118,26 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
         * of trouble.
         */
        active = READ_ONCE(execlists->active);
-       while ((last = *active) && i915_request_completed(last))
-               active++;
 
-       if (last) {
+       /*
+        * In theory we can skip over completed contexts that have not
+        * yet been processed by events (as those events are in flight):
+        *
+        * while ((last = *active) && i915_request_completed(last))
+        *      active++;
+        *
+        * However, the GPU cannot handle this as it will ultimately
+        * find itself trying to jump back into a context it has just
+        * completed and barf.
+        */
+
+       if ((last = *active)) {
                if (need_preempt(engine, last, rb)) {
+                       if (i915_request_completed(last)) {
+                               tasklet_hi_schedule(&execlists->tasklet);
+                               return;
+                       }
+
                        ENGINE_TRACE(engine,
                                     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
                                     last->fence.context,
@@ -1849,13 +2164,19 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 
                        last = NULL;
                } else if (need_timeslice(engine, last) &&
-                          timer_expired(&engine->execlists.timer)) {
+                          timeslice_expired(execlists, last)) {
+                       if (i915_request_completed(last)) {
+                               tasklet_hi_schedule(&execlists->tasklet);
+                               return;
+                       }
+
                        ENGINE_TRACE(engine,
-                                    "expired last=%llx:%lld, prio=%d, hint=%d\n",
+                                    "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
                                     last->fence.context,
                                     last->fence.seqno,
                                     last->sched.attr.priority,
-                                    execlists->queue_priority_hint);
+                                    execlists->queue_priority_hint,
+                                    yesno(timeslice_yield(execlists, last)));
 
                        ring_set_paused(engine, 1);
                        defer_active(engine);
@@ -1938,13 +2259,14 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
                                     "",
                                     yesno(engine != ve->siblings[0]));
 
-                       ve->request = NULL;
-                       ve->base.execlists.queue_priority_hint = INT_MIN;
+                       WRITE_ONCE(ve->request, NULL);
+                       WRITE_ONCE(ve->base.execlists.queue_priority_hint,
+                                  INT_MIN);
                        rb_erase_cached(rb, &execlists->virtual);
                        RB_CLEAR_NODE(rb);
 
                        GEM_BUG_ON(!(rq->execution_mask & engine->mask));
-                       rq->engine = engine;
+                       WRITE_ONCE(rq->engine, engine);
 
                        if (engine != ve->siblings[0]) {
                                u32 *regs = ve->context.lrc_reg_state;
@@ -1957,7 +2279,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
                                                                        engine);
 
                                if (!list_empty(&ve->context.signals))
-                                       virtual_xfer_breadcrumbs(ve, engine);
+                                       virtual_xfer_breadcrumbs(ve, rq);
 
                                /*
                                 * Move the bound engine to the top of the list
@@ -2064,6 +2386,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
                                GEM_BUG_ON(last &&
                                           !can_merge_ctx(last->context,
                                                          rq->context));
+                               GEM_BUG_ON(last &&
+                                          i915_seqno_passed(last->fence.seqno,
+                                                            rq->fence.seqno));
 
                                submit = true;
                                last = rq;
@@ -2112,6 +2437,7 @@ done:
                }
                clear_ports(port + 1, last_port - port);
 
+               WRITE_ONCE(execlists->yield, -1);
                execlists_submit_ports(engine);
                set_preempt_timeout(engine, *active);
        } else {
@@ -2134,6 +2460,7 @@ cancel_port_requests(struct intel_engine_execlists * const execlists)
                execlists_schedule_out(*port);
        clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
 
+       smp_wmb(); /* complete the seqlock for execlists_active() */
        WRITE_ONCE(execlists->active, execlists->inflight);
 }
 
@@ -2144,12 +2471,6 @@ invalidate_csb_entries(const u32 *first, const u32 *last)
        clflush((void *)last);
 }
 
-static inline bool
-reset_in_progress(const struct intel_engine_execlists *execlists)
-{
-       return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
-}
-
 /*
  * Starting with Gen12, the status has a new format:
  *
@@ -2240,7 +2561,6 @@ static void process_csb(struct intel_engine_cs *engine)
         */
        head = execlists->csb_head;
        tail = READ_ONCE(*execlists->csb_write);
-       ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
        if (unlikely(head == tail))
                return;
 
@@ -2254,6 +2574,7 @@ static void process_csb(struct intel_engine_cs *engine)
         */
        rmb();
 
+       ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
        do {
                bool promote;
 
@@ -2288,11 +2609,11 @@ static void process_csb(struct intel_engine_cs *engine)
                if (promote) {
                        struct i915_request * const *old = execlists->active;
 
+                       ring_set_paused(engine, 0);
+
                        /* Point active to the new ELSP; prevent overwriting */
                        WRITE_ONCE(execlists->active, execlists->pending);
-
-                       if (!inject_preempt_hang(execlists))
-                               ring_set_paused(engine, 0);
+                       smp_wmb(); /* notify execlists_active() */
 
                        /* cancel old inflight, prepare for switch */
                        trace_ports(execlists, "preempted", old);
@@ -2301,11 +2622,12 @@ static void process_csb(struct intel_engine_cs *engine)
 
                        /* switch pending to inflight */
                        GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
-                       WRITE_ONCE(execlists->active,
-                                  memcpy(execlists->inflight,
-                                         execlists->pending,
-                                         execlists_num_ports(execlists) *
-                                         sizeof(*execlists->pending)));
+                       memcpy(execlists->inflight,
+                              execlists->pending,
+                              execlists_num_ports(execlists) *
+                              sizeof(*execlists->pending));
+                       smp_wmb(); /* complete the seqlock */
+                       WRITE_ONCE(execlists->active, execlists->inflight);
 
                        WRITE_ONCE(execlists->pending[0], NULL);
                } else {
@@ -2318,10 +2640,41 @@ static void process_csb(struct intel_engine_cs *engine)
                         * We rely on the hardware being strongly
                         * ordered, that the breadcrumb write is
                         * coherent (visible from the CPU) before the
-                        * user interrupt and CSB is processed.
+                        * user interrupt is processed. One might assume
+                        * that the breadcrumb write, being ordered before
+                        * the user interrupt, would therefore also be
+                        * visible before the CS event for the context
+                        * switch itself...
                         */
-                       GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
-                                  !reset_in_progress(execlists));
+                       if (GEM_SHOW_DEBUG() &&
+                           !i915_request_completed(*execlists->active)) {
+                               struct i915_request *rq = *execlists->active;
+                               const u32 *regs __maybe_unused =
+                                       rq->context->lrc_reg_state;
+
+                               ENGINE_TRACE(engine,
+                                            "context completed before request!\n");
+                               ENGINE_TRACE(engine,
+                                            "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
+                                            ENGINE_READ(engine, RING_START),
+                                            ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
+                                            ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
+                                            ENGINE_READ(engine, RING_CTL),
+                                            ENGINE_READ(engine, RING_MI_MODE));
+                               ENGINE_TRACE(engine,
+                                            "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
+                                            i915_ggtt_offset(rq->ring->vma),
+                                            rq->head, rq->tail,
+                                            rq->fence.context,
+                                            lower_32_bits(rq->fence.seqno),
+                                            hwsp_seqno(rq));
+                               ENGINE_TRACE(engine,
+                                            "ctx:{start:%08x, head:%04x, tail:%04x}, ",
+                                            regs[CTX_RING_START],
+                                            regs[CTX_RING_HEAD],
+                                            regs[CTX_RING_TAIL]);
+                       }
+
                        execlists_schedule_out(*execlists->active++);
 
                        GEM_BUG_ON(execlists->active - execlists->inflight >
@@ -2349,7 +2702,7 @@ static void process_csb(struct intel_engine_cs *engine)
 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
 {
        lockdep_assert_held(&engine->active.lock);
-       if (!engine->execlists.pending[0]) {
+       if (!READ_ONCE(engine->execlists.pending[0])) {
                rcu_read_lock(); /* protect peeking at execlists->active */
                execlists_dequeue(engine);
                rcu_read_unlock();
@@ -2366,12 +2719,12 @@ static void __execlists_hold(struct i915_request *rq)
                if (i915_request_is_active(rq))
                        __i915_request_unsubmit(rq);
 
-               RQ_TRACE(rq, "on hold\n");
                clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
                list_move_tail(&rq->sched.link, &rq->engine->active.hold);
                i915_request_set_hold(rq);
+               RQ_TRACE(rq, "on hold\n");
 
-               list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
+               for_each_waiter(p, rq) {
                        struct i915_request *w =
                                container_of(p->waiter, typeof(*w), sched);
 
@@ -2385,7 +2738,7 @@ static void __execlists_hold(struct i915_request *rq)
                        if (i915_request_completed(w))
                                continue;
 
-                       if (i915_request_on_hold(rq))
+                       if (i915_request_on_hold(w))
                                continue;
 
                        list_move_tail(&w->sched.link, &list);
@@ -2443,6 +2796,7 @@ static bool execlists_hold(struct intel_engine_cs *engine,
        GEM_BUG_ON(i915_request_on_hold(rq));
        GEM_BUG_ON(rq->engine != engine);
        __execlists_hold(rq);
+       GEM_BUG_ON(list_empty(&engine->active.hold));
 
 unlock:
        spin_unlock_irq(&engine->active.lock);
@@ -2452,23 +2806,27 @@ unlock:
 static bool hold_request(const struct i915_request *rq)
 {
        struct i915_dependency *p;
+       bool result = false;
 
        /*
         * If one of our ancestors is on hold, we must also be on hold,
         * otherwise we will bypass it and execute before it.
         */
-       list_for_each_entry(p, &rq->sched.signalers_list, signal_link) {
+       rcu_read_lock();
+       for_each_signaler(p, rq) {
                const struct i915_request *s =
                        container_of(p->signaler, typeof(*s), sched);
 
                if (s->engine != rq->engine)
                        continue;
 
-               if (i915_request_on_hold(s))
-                       return true;
+               result = i915_request_on_hold(s);
+               if (result)
+                       break;
        }
+       rcu_read_unlock();
 
-       return false;
+       return result;
 }
 
 static void __execlists_unhold(struct i915_request *rq)
@@ -2478,6 +2836,8 @@ static void __execlists_unhold(struct i915_request *rq)
        do {
                struct i915_dependency *p;
 
+               RQ_TRACE(rq, "hold release\n");
+
                GEM_BUG_ON(!i915_request_on_hold(rq));
                GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
 
@@ -2486,21 +2846,24 @@ static void __execlists_unhold(struct i915_request *rq)
                               i915_sched_lookup_priolist(rq->engine,
                                                          rq_prio(rq)));
                set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
-               RQ_TRACE(rq, "hold release\n");
 
                /* Also release any children on this engine that are ready */
-               list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
+               for_each_waiter(p, rq) {
                        struct i915_request *w =
                                container_of(p->waiter, typeof(*w), sched);
 
+                       /* Propagate any change in error status */
+                       if (rq->fence.error)
+                               i915_request_set_error_once(w, rq->fence.error);
+
                        if (w->engine != rq->engine)
                                continue;
 
-                       if (!i915_request_on_hold(rq))
+                       if (!i915_request_on_hold(w))
                                continue;
 
                        /* Check that no other parents are also on hold */
-                       if (hold_request(rq))
+                       if (hold_request(w))
                                continue;
 
                        list_move_tail(&w->sched.link, &list);
@@ -2599,6 +2962,45 @@ err_cap:
        return NULL;
 }
 
+static struct i915_request *
+active_context(struct intel_engine_cs *engine, u32 ccid)
+{
+       const struct intel_engine_execlists * const el = &engine->execlists;
+       struct i915_request * const *port, *rq;
+
+       /*
+        * Use the most recent result from process_csb(), but just in case
+        * we trigger an error (via interrupt) before the first CS event has
+        * been written, peek at the next submission.
+        */
+
+       for (port = el->active; (rq = *port); port++) {
+               if (upper_32_bits(rq->context->lrc_desc) == ccid) {
+                       ENGINE_TRACE(engine,
+                                    "ccid found at active:%zd\n",
+                                    port - el->active);
+                       return rq;
+               }
+       }
+
+       for (port = el->pending; (rq = *port); port++) {
+               if (upper_32_bits(rq->context->lrc_desc) == ccid) {
+                       ENGINE_TRACE(engine,
+                                    "ccid found at pending:%zd\n",
+                                    port - el->pending);
+                       return rq;
+               }
+       }
+
+       ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
+       return NULL;
+}
+
+static u32 active_ccid(struct intel_engine_cs *engine)
+{
+       return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
+}
+
 static bool execlists_capture(struct intel_engine_cs *engine)
 {
        struct execlists_capture *cap;
@@ -2615,13 +3017,13 @@ static bool execlists_capture(struct intel_engine_cs *engine)
        if (!cap)
                return true;
 
-       cap->rq = execlists_active(&engine->execlists);
-       GEM_BUG_ON(!cap->rq);
-
-       rcu_read_lock();
-       cap->rq = active_request(cap->rq->context->timeline, cap->rq);
-       cap->rq = i915_request_get_rcu(cap->rq);
-       rcu_read_unlock();
+       spin_lock_irq(&engine->active.lock);
+       cap->rq = active_context(engine, active_ccid(engine));
+       if (cap->rq) {
+               cap->rq = active_request(cap->rq->context->timeline, cap->rq);
+               cap->rq = i915_request_get_rcu(cap->rq);
+       }
+       spin_unlock_irq(&engine->active.lock);
        if (!cap->rq)
                goto err_free;
 
@@ -2660,27 +3062,25 @@ err_free:
        return false;
 }
 
-static noinline void preempt_reset(struct intel_engine_cs *engine)
+static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
 {
        const unsigned int bit = I915_RESET_ENGINE + engine->id;
        unsigned long *lock = &engine->gt->reset.flags;
 
-       if (i915_modparams.reset < 3)
+       if (!intel_has_reset_engine(engine->gt))
                return;
 
        if (test_and_set_bit(bit, lock))
                return;
 
+       ENGINE_TRACE(engine, "reset for %s\n", msg);
+
        /* Mark this tasklet as disabled to avoid waiting for it to complete */
        tasklet_disable_nosync(&engine->execlists.tasklet);
 
-       ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
-                    READ_ONCE(engine->props.preempt_timeout_ms),
-                    jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
-
        ring_set_paused(engine, 1); /* Freeze the current request in place */
        if (execlists_capture(engine))
-               intel_engine_reset(engine, "preemption time out");
+               intel_engine_reset(engine, msg);
        else
                ring_set_paused(engine, 0);
 
@@ -2711,6 +3111,13 @@ static void execlists_submission_tasklet(unsigned long data)
        bool timeout = preempt_timeout(engine);
 
        process_csb(engine);
+
+       if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
+               engine->execlists.error_interrupt = 0;
+               if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
+                       execlists_reset(engine, "CS error");
+       }
+
        if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
                unsigned long flags;
 
@@ -2719,8 +3126,8 @@ static void execlists_submission_tasklet(unsigned long data)
                spin_unlock_irqrestore(&engine->active.lock, flags);
 
                /* Recheck after serialising with direct-submission */
-               if (timeout && preempt_timeout(engine))
-                       preempt_reset(engine);
+               if (unlikely(timeout && preempt_timeout(engine)))
+                       execlists_reset(engine, "preemption time out");
        }
 }
 
@@ -2759,10 +3166,14 @@ static void __submit_queue_imm(struct intel_engine_cs *engine)
        if (reset_in_progress(execlists))
                return; /* defer until we restart the engine following reset */
 
-       if (execlists->tasklet.func == execlists_submission_tasklet)
-               __execlists_submission_tasklet(engine);
-       else
-               tasklet_hi_schedule(&execlists->tasklet);
+       /* Hopefully we clear execlists->pending[] to let us through */
+       if (READ_ONCE(execlists->pending[0]) &&
+           tasklet_trylock(&execlists->tasklet)) {
+               process_csb(engine);
+               tasklet_unlock(&execlists->tasklet);
+       }
+
+       __execlists_submission_tasklet(engine);
 }
 
 static void submit_queue(struct intel_engine_cs *engine,
@@ -2793,6 +3204,7 @@ static void execlists_submit_request(struct i915_request *request)
        spin_lock_irqsave(&engine->active.lock, flags);
 
        if (unlikely(ancestor_on_hold(engine, request))) {
+               RQ_TRACE(request, "ancestor on hold\n");
                list_add_tail(&request->sched.link, &engine->active.hold);
                i915_request_set_hold(request);
        } else {
@@ -2847,19 +3259,139 @@ check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
        vaddr += engine->context_size;
 
        if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
-               dev_err_once(engine->i915->drm.dev,
+               drm_err_once(&engine->i915->drm,
                             "%s context redzone overwritten!\n",
                             engine->name);
 }
 
 static void execlists_context_unpin(struct intel_context *ce)
 {
-       check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
+       check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
                      ce->engine);
 
        i915_gem_object_unpin_map(ce->state->obj);
 }
 
+static u32 *
+gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
+{
+       *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
+               MI_SRM_LRM_GLOBAL_GTT |
+               MI_LRI_LRM_CS_MMIO;
+       *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
+       *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
+               CTX_TIMESTAMP * sizeof(u32);
+       *cs++ = 0;
+
+       *cs++ = MI_LOAD_REGISTER_REG |
+               MI_LRR_SOURCE_CS_MMIO |
+               MI_LRI_LRM_CS_MMIO;
+       *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
+       *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
+
+       *cs++ = MI_LOAD_REGISTER_REG |
+               MI_LRR_SOURCE_CS_MMIO |
+               MI_LRI_LRM_CS_MMIO;
+       *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
+       *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
+
+       return cs;
+}
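
Net effect of the three packets above, as a sketch: the LRM loads the CTX_TIMESTAMP value saved in the context image into CS GPR0, and the two LRRs then copy GPR0 into the live RING_CTX_TIMESTAMP register, so the context resumes with the timestamp it was saved with.

    /*
     * GPR0               <- mem[lrc_reg_state + CTX_TIMESTAMP]   (LRM)
     * RING_CTX_TIMESTAMP <- GPR0                                 (LRR, twice)
     */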
+
+static u32 *
+gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
+{
+       GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
+
+       *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
+               MI_SRM_LRM_GLOBAL_GTT |
+               MI_LRI_LRM_CS_MMIO;
+       *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
+       *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
+               (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
+       *cs++ = 0;
+
+       return cs;
+}
+
+static u32 *
+gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
+{
+       GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
+
+       *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
+               MI_SRM_LRM_GLOBAL_GTT |
+               MI_LRI_LRM_CS_MMIO;
+       *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
+       *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
+               (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
+       *cs++ = 0;
+
+       *cs++ = MI_LOAD_REGISTER_REG |
+               MI_LRR_SOURCE_CS_MMIO |
+               MI_LRI_LRM_CS_MMIO;
+       *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
+       *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
+
+       return cs;
+}
+
+static u32 *
+gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
+{
+       cs = gen12_emit_timestamp_wa(ce, cs);
+       cs = gen12_emit_cmd_buf_wa(ce, cs);
+       cs = gen12_emit_restore_scratch(ce, cs);
+
+       return cs;
+}
+
+static u32 *
+gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
+{
+       cs = gen12_emit_timestamp_wa(ce, cs);
+       cs = gen12_emit_restore_scratch(ce, cs);
+
+       return cs;
+}
+
+static inline u32 context_wa_bb_offset(const struct intel_context *ce)
+{
+       return PAGE_SIZE * ce->wa_bb_page;
+}
+
+static u32 *context_indirect_bb(const struct intel_context *ce)
+{
+       void *ptr;
+
+       GEM_BUG_ON(!ce->wa_bb_page);
+
+       ptr = ce->lrc_reg_state;
+       ptr -= LRC_STATE_OFFSET; /* back to start of context image */
+       ptr += context_wa_bb_offset(ce);
+
+       return ptr;
+}
+
+static void
+setup_indirect_ctx_bb(const struct intel_context *ce,
+                     const struct intel_engine_cs *engine,
+                     u32 *(*emit)(const struct intel_context *, u32 *))
+{
+       u32 * const start = context_indirect_bb(ce);
+       u32 *cs;
+
+       cs = emit(ce, start);
+       GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
+       while ((unsigned long)cs % CACHELINE_BYTES)
+               *cs++ = MI_NOOP;
+
+       lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
+                                   i915_ggtt_offset(ce->state) +
+                                   context_wa_bb_offset(ce),
+                                   (cs - start) * sizeof(*cs));
+}
+
 static void
 __execlists_update_reg_state(const struct intel_context *ce,
                             const struct intel_engine_cs *engine,
@@ -2874,6 +3406,7 @@ __execlists_update_reg_state(const struct intel_context *ce,
        regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
        regs[CTX_RING_HEAD] = head;
        regs[CTX_RING_TAIL] = ring->tail;
+       regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
 
        /* RPCS */
        if (engine->class == RENDER_CLASS) {
@@ -2882,6 +3415,18 @@ __execlists_update_reg_state(const struct intel_context *ce,
 
                i915_oa_init_reg_state(ce, engine);
        }
+
+       if (ce->wa_bb_page) {
+               u32 *(*fn)(const struct intel_context *ce, u32 *cs);
+
+               fn = gen12_emit_indirect_ctx_xcs;
+               if (ce->engine->class == RENDER_CLASS)
+                       fn = gen12_emit_indirect_ctx_rcs;
+
+               /* Mutually exclusive wrt the global indirect bb */
+               GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
+               setup_indirect_ctx_bb(ce, engine, fn);
+       }
 }
 
 static int
@@ -2900,7 +3445,7 @@ __execlists_context_pin(struct intel_context *ce,
                return PTR_ERR(vaddr);
 
        ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
-       ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
+       ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
        __execlists_update_reg_state(ce, engine, ce->ring->tail);
 
        return 0;
@@ -2921,22 +3466,6 @@ static void execlists_context_reset(struct intel_context *ce)
        CE_TRACE(ce, "reset\n");
        GEM_BUG_ON(!intel_context_is_pinned(ce));
 
-       /*
-        * Because we emit WA_TAIL_DWORDS there may be a disparity
-        * between our bookkeeping in ce->ring->head and ce->ring->tail and
-        * that stored in context. As we only write new commands from
-        * ce->ring->tail onwards, everything before that is junk. If the GPU
-        * starts reading from its RING_HEAD from the context, it may try to
-        * execute that junk and die.
-        *
-        * The contexts that are stilled pinned on resume belong to the
-        * kernel, and are local to each engine. All other contexts will
-        * have their head/tail sanitized upon pinning before use, so they
-        * will never see garbage,
-        *
-        * So to avoid that we reset the context images upon resume. For
-        * simplicity, we just zero everything out.
-        */
        intel_ring_reset(ce->ring, ce->ring->emit);
 
        /* Scrub away the garbage */
@@ -2964,7 +3493,8 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq)
 {
        u32 *cs;
 
-       GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
+       if (!i915_request_timeline(rq)->has_initial_breadcrumb)
+               return 0;
 
        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
@@ -3257,7 +3787,7 @@ static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
                goto err;
        }
 
-       err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
+       err = i915_ggtt_pin(vma, 0, PIN_HIGH);
        if (err)
                goto err;
 
@@ -3313,7 +3843,8 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
 
        ret = lrc_setup_wa_ctx(engine);
        if (ret) {
-               DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
+               drm_dbg(&engine->i915->drm,
+                       "Failed to setup context WA page: %d\n", ret);
                return ret;
        }
 
@@ -3346,6 +3877,105 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
        return ret;
 }
 
+static void reset_csb_pointers(struct intel_engine_cs *engine)
+{
+       struct intel_engine_execlists * const execlists = &engine->execlists;
+       const unsigned int reset_value = execlists->csb_size - 1;
+
+       ring_set_paused(engine, 0);
+
+       /*
+        * After a reset, the HW starts writing into CSB entry [0]. We
+        * therefore have to set our HEAD pointer back one entry so that
+        * the *first* entry we check is entry 0. To complicate this further,
+        * as we don't wait for the first interrupt after reset, we have to
+        * fake the HW write to point back to the last entry so that our
+        * inline comparison of our cached head position against the last HW
+        * write works even before the first interrupt.
+        */
+       execlists->csb_head = reset_value;
+       WRITE_ONCE(*execlists->csb_write, reset_value);
+       wmb(); /* Make sure this is visible to HW (paranoia?) */
+
+       /*
+        * Sometimes Icelake forgets to reset its pointers on a GPU reset.
+        * Bludgeon them with a mmio update to be sure.
+        */
+       ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
+                    reset_value << 8 | reset_value);
+       ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
+
+       invalidate_csb_entries(&execlists->csb_status[0],
+                              &execlists->csb_status[reset_value]);
+}
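
A worked example of the pointer parking above, assuming a Gen11+ part where csb_size is 12 (GEN11_CSB_ENTRIES):

    /*
     * reset_value = 12 - 1 = 11
     *   csb_head = 11 and *csb_write = 11  => head == tail, so
     *   process_csb() consumes nothing until the HW writes again.
     * The first CS event after reset lands in entry 0 and moves the
     * write pointer to 0, so head (11) != tail (0) and processing
     * resumes from entry 0 as intended.
     */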
+
+static void execlists_sanitize(struct intel_engine_cs *engine)
+{
+       /*
+        * Poison residual state on resume, in case the suspend didn't!
+        *
+        * We have to assume that across suspend/resume (or other loss
+        * of control) that the contents of our pinned buffers has been
+        * of control) that the contents of our pinned buffers have been
+        * let's poison such state so that we more quickly spot when
+        * we falsely assume it has been preserved.
+        */
+       if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
+               memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
+
+       reset_csb_pointers(engine);
+
+       /*
+        * The kernel_context HWSP is stored in the status_page. As above,
+        * that may be lost on resume/initialisation, and so we need to
+        * reset the value in the HWSP.
+        */
+       intel_timeline_reset_seqno(engine->kernel_context->timeline);
+}
+
+static void enable_error_interrupt(struct intel_engine_cs *engine)
+{
+       u32 status;
+
+       engine->execlists.error_interrupt = 0;
+       ENGINE_WRITE(engine, RING_EMR, ~0u);
+       ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
+
+       status = ENGINE_READ(engine, RING_ESR);
+       if (unlikely(status)) {
+               drm_err(&engine->i915->drm,
+                       "engine '%s' resumed still in error: %08x\n",
+                       engine->name, status);
+               __intel_gt_reset(engine->gt, engine->mask);
+       }
+
+       /*
+        * On current gen8+, we have 2 signals to play with
+        *
+        * - I915_ERROR_INSTRUCTION (bit 0)
+        *
+        *    Generate an error if the command parser encounters an invalid
+        *    instruction
+        *
+        *    This is a fatal error.
+        *
+        * - CP_PRIV (bit 2)
+        *
+        *    Generate an error on privilege violation (where the CP replaces
+        *    the instruction with a no-op). This also fires for writes into
+        *    read-only scratch pages.
+        *
+        *    This is a non-fatal error, parsing continues.
+        *
+        * * there are a few others defined for odd HW that we do not use
+        *
+        * Since CP_PRIV fires for cases where we have chosen to ignore the
+        * error (as the HW is validating and suppressing the mistakes), we
+        * only unmask the instruction error bit.
+        */
+       ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
+}
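
The consumer of error_interrupt is the execlists_submission_tasklet() change earlier in this patch; the producer is assumed to live in the GT interrupt code rather than here, along the lines of:

    /* assumed CS irq handler side -- sketch only */
    if (unlikely(iir & GT_CS_MASTER_ERROR_INTERRUPT)) {
            WRITE_ONCE(engine->execlists.error_interrupt, iir);
            tasklet_hi_schedule(&engine->execlists.tasklet);
    }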
+
 static void enable_execlists(struct intel_engine_cs *engine)
 {
        u32 mode;
@@ -3367,6 +3997,8 @@ static void enable_execlists(struct intel_engine_cs *engine)
                        i915_ggtt_offset(engine->status_page.vma));
        ENGINE_POSTING_READ(engine, RING_HWS_PGA);
 
+       enable_error_interrupt(engine);
+
        engine->context_tag = 0;
 }
 
@@ -3375,7 +4007,8 @@ static bool unexpected_starting_state(struct intel_engine_cs *engine)
        bool unexpected = false;
 
        if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
-               DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
+               drm_dbg(&engine->i915->drm,
+                       "STOP_RING still set in RING_MI_MODE\n");
                unexpected = true;
        }
 
@@ -3384,9 +4017,6 @@ static bool unexpected_starting_state(struct intel_engine_cs *engine)
 
 static int execlists_resume(struct intel_engine_cs *engine)
 {
-       intel_engine_apply_workarounds(engine);
-       intel_engine_apply_whitelist(engine);
-
        intel_mocs_init_engine(engine);
 
        intel_engine_reset_breadcrumbs(engine);
@@ -3438,41 +4068,10 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
         *
         * FIXME: Wa for more modern gens needs to be validated
         */
+       ring_set_paused(engine, 1);
        intel_engine_stop_cs(engine);
 }
 
-static void reset_csb_pointers(struct intel_engine_cs *engine)
-{
-       struct intel_engine_execlists * const execlists = &engine->execlists;
-       const unsigned int reset_value = execlists->csb_size - 1;
-
-       ring_set_paused(engine, 0);
-
-       /*
-        * After a reset, the HW starts writing into CSB entry [0]. We
-        * therefore have to set our HEAD pointer back one entry so that
-        * the *first* entry we check is entry 0. To complicate this further,
-        * as we don't wait for the first interrupt after reset, we have to
-        * fake the HW write to point back to the last entry so that our
-        * inline comparison of our cached head position against the last HW
-        * write works even before the first interrupt.
-        */
-       execlists->csb_head = reset_value;
-       WRITE_ONCE(*execlists->csb_write, reset_value);
-       wmb(); /* Make sure this is visible to HW (paranoia?) */
-
-       /*
-        * Sometimes Icelake forgets to reset its pointers on a GPU reset.
-        * Bludgeon them with a mmio update to be sure.
-        */
-       ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
-                    reset_value << 8 | reset_value);
-       ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
-
-       invalidate_csb_entries(&execlists->csb_status[0],
-                              &execlists->csb_status[reset_value]);
-}
-
 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
 {
        int x;
@@ -3517,9 +4116,6 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
        if (!rq)
                goto unwind;
 
-       /* We still have requests in-flight; the engine should be active */
-       GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
-
        ce = rq->context;
        GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
 
@@ -3529,8 +4125,12 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
                goto out_replay;
        }
 
+       /* We still have requests in-flight; the engine should be active */
+       GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
+
        /* Context has requests still in-flight; it should not be idle! */
        GEM_BUG_ON(i915_active_is_idle(&ce->active));
+
        rq = active_request(ce->timeline, rq);
        head = intel_ring_wrap(ce->ring, rq->head);
        GEM_BUG_ON(head == ce->ring->tail);
@@ -3604,7 +4204,10 @@ static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
 
 static void nop_submission_tasklet(unsigned long data)
 {
+       struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
+
        /* The driver is wedged; don't process any more events. */
+       WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
 }
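For context, a sketch (assumed from the surrounding wedging flow, which is not shown in this hunk) of how the stub above gets installed: once the engine is cancelled, further tasklet runs are routed to nop_submission_tasklet so that late interrupts are ignored.

        /* in execlists_reset_cancel(), roughly: */
        GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
        execlists->tasklet.func = nop_submission_tasklet;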
 
 static void execlists_reset_cancel(struct intel_engine_cs *engine)
@@ -4217,6 +4820,8 @@ static void execlists_shutdown(struct intel_engine_cs *engine)
 
 static void execlists_release(struct intel_engine_cs *engine)
 {
+       engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
+
        execlists_shutdown(engine);
 
        intel_engine_cleanup_common(engine);
@@ -4273,6 +4878,8 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
 
        engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
        engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
+       engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
+       engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
 }
 
 static void rcs_submission_override(struct intel_engine_cs *engine)
@@ -4317,7 +4924,7 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine)
                 * because we only expect rare glitches but nothing
                 * critical to prevent us from using GPU
                 */
-               DRM_ERROR("WA batch buffer initialization failed\n");
+               drm_err(&i915->drm, "WA batch buffer initialization failed\n");
 
        if (HAS_LOGICAL_RING_ELSQ(i915)) {
                execlists->submit_reg = uncore->regs +
@@ -4340,48 +4947,13 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine)
        else
                execlists->csb_size = GEN11_CSB_ENTRIES;
 
-       reset_csb_pointers(engine);
-
        /* Finally, take ownership and responsibility for cleanup! */
+       engine->sanitize = execlists_sanitize;
        engine->release = execlists_release;
 
        return 0;
 }
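A note on the removals above, inferred from this hunk rather than stated in it: with reset_csb_pointers() no longer called at setup and engine->sanitize now pointing at execlists_sanitize, the CSB pointer reset is presumably deferred to the sanitize step, so it runs each time the engine is handed back to the driver rather than only once at setup. A minimal sketch, assuming reset_csb_pointers() was moved rather than dropped:

static void execlists_sanitize(struct intel_engine_cs *engine) /* body assumed */
{
        /* restore our CSB bookkeeping before the engine is used again */
        reset_csb_pointers(engine);
}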
 
-static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
-{
-       u32 indirect_ctx_offset;
-
-       switch (INTEL_GEN(engine->i915)) {
-       default:
-               MISSING_CASE(INTEL_GEN(engine->i915));
-               /* fall through */
-       case 12:
-               indirect_ctx_offset =
-                       GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-               break;
-       case 11:
-               indirect_ctx_offset =
-                       GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-               break;
-       case 10:
-               indirect_ctx_offset =
-                       GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-               break;
-       case 9:
-               indirect_ctx_offset =
-                       GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-               break;
-       case 8:
-               indirect_ctx_offset =
-                       GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-               break;
-       }
-
-       return indirect_ctx_offset;
-}
-
-
 static void init_common_reg_state(u32 * const regs,
                                  const struct intel_engine_cs *engine,
                                  const struct intel_ring *ring,
@@ -4399,30 +4971,27 @@ static void init_common_reg_state(u32 * const regs,
        regs[CTX_CONTEXT_CONTROL] = ctl;
 
        regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
+       regs[CTX_TIMESTAMP] = 0;
 }
 
 static void init_wa_bb_reg_state(u32 * const regs,
-                                const struct intel_engine_cs *engine,
-                                u32 pos_bb_per_ctx)
+                                const struct intel_engine_cs *engine)
 {
        const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
 
        if (wa_ctx->per_ctx.size) {
                const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
 
-               regs[pos_bb_per_ctx] =
+               GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
+               regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
                        (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
        }
 
        if (wa_ctx->indirect_ctx.size) {
-               const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
-
-               regs[pos_bb_per_ctx + 2] =
-                       (ggtt_offset + wa_ctx->indirect_ctx.offset) |
-                       (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
-
-               regs[pos_bb_per_ctx + 4] =
-                       intel_lr_indirect_ctx_offset(engine) << 6;
+               lrc_ring_setup_indirect_ctx(regs, engine,
+                                           i915_ggtt_offset(wa_ctx->vma) +
+                                           wa_ctx->indirect_ctx.offset,
+                                           wa_ctx->indirect_ctx.size);
        }
 }
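A short illustration of the assumption behind the "+ 1" indexing used above: the register state in the context image is stored as MI_LOAD_REGISTER_IMM payload, i.e. alternating { register offset, value } dwords, so the lrc_ring_*() helpers return the index of the offset slot and the value is written one dword later.

/*
 * Assumed layout around x = lrc_ring_wa_bb_per_ctx(engine):
 *
 *   regs[x + 0] -> offset of the BB_PER_CTX_PTR register (populated elsewhere)
 *   regs[x + 1] -> per-context batch GGTT address | 0x01 (valid bit)
 *
 * which is why the helpers return x and the value lands at x + 1.
 */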
 
@@ -4471,10 +5040,7 @@ static void execlists_init_reg_state(u32 *regs,
        init_common_reg_state(regs, engine, ring, inhibit);
        init_ppgtt_reg_state(regs, vm_alias(ce->vm));
 
-       init_wa_bb_reg_state(regs, engine,
-                            INTEL_GEN(engine->i915) >= 12 ?
-                            GEN12_CTX_BB_PER_CTX_PTR :
-                            CTX_BB_PER_CTX_PTR);
+       init_wa_bb_reg_state(regs, engine);
 
        __reset_stop_ring(regs, engine);
 }
@@ -4492,7 +5058,8 @@ populate_lr_context(struct intel_context *ce,
        vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
        if (IS_ERR(vaddr)) {
                ret = PTR_ERR(vaddr);
-               DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
+               drm_dbg(&engine->i915->drm,
+                       "Could not map object pages! (%d)\n", ret);
                return ret;
        }
 
@@ -4514,9 +5081,14 @@ populate_lr_context(struct intel_context *ce,
                inhibit = false;
        }
 
-       /* The second page of the context object contains some fields which must
-        * be set up prior to the first execution. */
-       execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
+       /* Clear the ppHWSP (inc. per-context counters) */
+       memset(vaddr, 0, PAGE_SIZE);
+
+       /*
+        * The second page of the context object contains some registers which
+        * must be set up prior to the first execution.
+        */
+       execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
                                 ce, engine, ring, inhibit);
 
        ret = 0;
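To summarise the layout assumed by the memset() and the LRC_STATE_OFFSET indexing above (illustrative; only the first two pages are confirmed by this hunk):

/*
 * Assumed context object layout:
 *
 *   page 0                    - per-process HWSP (cleared above, including
 *                               the per-context counters)
 *   page 1 (LRC_STATE_OFFSET) - logical ring context register state
 *   ...
 *   page ce->wa_bb_page       - Gen12-only indirect context WA batch buffer
 *                               (reserved in __execlists_context_alloc() below)
 */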
@@ -4541,6 +5113,11 @@ static int __execlists_context_alloc(struct intel_context *ce,
        if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
                context_size += I915_GTT_PAGE_SIZE; /* for redzone */
 
+       if (INTEL_GEN(engine->i915) == 12) {
+               ce->wa_bb_page = context_size / PAGE_SIZE;
+               context_size += PAGE_SIZE;
+       }
+
        ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
        if (IS_ERR(ctx_obj))
                return PTR_ERR(ctx_obj);
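A minimal sketch of how the page reserved above is expected to be consumed (both helper names and the size constant are assumptions; the actual wiring to CMD_BUF_CCTL is outside this hunk): the page's offset within the context object, added to the object's GGTT address, is what lrc_ring_setup_indirect_ctx() would be fed.

static u32 context_wa_bb_offset(const struct intel_context *ce) /* assumed name */
{
        return PAGE_SIZE * ce->wa_bb_page;
}

static void setup_indirect_ctx_bb(const struct intel_context *ce, u32 *regs) /* assumed name */
{
        /* ce->state must already be pinned in the GGTT at this point */
        lrc_ring_setup_indirect_ctx(regs, ce->engine,
                                    i915_ggtt_offset(ce->state) +
                                    context_wa_bb_offset(ce),
                                    CACHELINE_BYTES /* size: assumption */);
}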
@@ -4553,8 +5130,17 @@ static int __execlists_context_alloc(struct intel_context *ce,
 
        if (!ce->timeline) {
                struct intel_timeline *tl;
+               struct i915_vma *hwsp;
+
+               /*
+                * Use the static global HWSP for the kernel context, and
+                * a dynamically allocated cacheline for everyone else.
+                */
+               hwsp = NULL;
+               if (unlikely(intel_context_is_barrier(ce)))
+                       hwsp = engine->status_page.vma;
 
-               tl = intel_timeline_create(engine->gt, NULL);
+               tl = intel_timeline_create(engine->gt, hwsp);
                if (IS_ERR(tl)) {
                        ret = PTR_ERR(tl);
                        goto error_deref_obj;
@@ -4571,7 +5157,8 @@ static int __execlists_context_alloc(struct intel_context *ce,
 
        ret = populate_lr_context(ce, ctx_obj, engine, ring);
        if (ret) {
-               DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
+               drm_dbg(&engine->i915->drm,
+                       "Failed to populate LRC: %d\n", ret);
                goto error_ring_free;
        }
 
@@ -4624,6 +5211,8 @@ static void virtual_context_destroy(struct kref *kref)
                __execlists_context_fini(&ve->context);
        intel_context_fini(&ve->context);
 
+       intel_engine_free_request_pool(&ve->base);
+
        kfree(ve->bonds);
        kfree(ve);
 }
@@ -4723,7 +5312,7 @@ static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
        mask = rq->execution_mask;
        if (unlikely(!mask)) {
                /* Invalid selection, submit to a random engine in error */
-               i915_request_skip(rq, -ENODEV);
+               i915_request_set_error_once(rq, -ENODEV);
                mask = ve->siblings[0]->mask;
        }
 
@@ -4737,7 +5326,7 @@ static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
 static void virtual_submission_tasklet(unsigned long data)
 {
        struct virtual_engine * const ve = (struct virtual_engine *)data;
-       const int prio = ve->base.execlists.queue_priority_hint;
+       const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
        intel_engine_mask_t mask;
        unsigned int n;
 
@@ -4748,12 +5337,15 @@ static void virtual_submission_tasklet(unsigned long data)
                return;
 
        local_irq_disable();
-       for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
-               struct intel_engine_cs *sibling = ve->siblings[n];
+       for (n = 0; n < ve->num_siblings; n++) {
+               struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
                struct ve_node * const node = &ve->nodes[sibling->id];
                struct rb_node **parent, *rb;
                bool first;
 
+               if (!READ_ONCE(ve->request))
+                       break; /* already handled by a sibling's tasklet */
+
                if (unlikely(!(mask & sibling->mask))) {
                        if (!RB_EMPTY_NODE(&node->rb)) {
                                spin_lock(&sibling->active.lock);
@@ -4804,10 +5396,8 @@ static void virtual_submission_tasklet(unsigned long data)
 submit_engine:
                GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
                node->prio = prio;
-               if (first && prio > sibling->execlists.queue_priority_hint) {
-                       sibling->execlists.queue_priority_hint = prio;
+               if (first && prio > sibling->execlists.queue_priority_hint)
                        tasklet_hi_schedule(&sibling->execlists.tasklet);
-               }
 
                spin_unlock(&sibling->active.lock);
        }
@@ -5133,11 +5723,15 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
                show_request(m, last, "\t\tE ");
        }
 
-       last = NULL;
-       count = 0;
+       if (execlists->switch_priority_hint != INT_MIN)
+               drm_printf(m, "\t\tSwitch priority hint: %d\n",
+                          READ_ONCE(execlists->switch_priority_hint));
        if (execlists->queue_priority_hint != INT_MIN)
                drm_printf(m, "\t\tQueue priority hint: %d\n",
-                          execlists->queue_priority_hint);
+                          READ_ONCE(execlists->queue_priority_hint));
+
+       last = NULL;
+       count = 0;
        for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
                struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
                int i;