Merge drm/drm-next into drm-intel-next-queued
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 7c4c8fb..4b28225 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include "i915_gem_render_state.h"
+#include "i915_vgpu.h"
 #include "intel_lrc_reg.h"
 #include "intel_mocs.h"
 #include "intel_workarounds.h"
 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 
 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
-                                           struct intel_engine_cs *engine);
+                                           struct intel_engine_cs *engine,
+                                           struct intel_context *ce);
 static void execlists_init_reg_state(u32 *reg_state,
                                     struct i915_gem_context *ctx,
                                     struct intel_engine_cs *engine,
@@ -189,12 +191,7 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
                !i915_request_completed(last));
 }
 
-/**
- * intel_lr_context_descriptor_update() - calculate & cache the descriptor
- *                                       descriptor for a pinned context
- * @ctx: Context to work on
- * @engine: Engine the descriptor will be used with
- *
+/*
  * The context descriptor encodes various attributes of a context,
  * including its GTT address and some flags. Because it's fairly
  * expensive to calculate, we'll just do it once and cache the result,
@@ -204,7 +201,7 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
  *
  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
- *      bits 32-52:    ctx ID, a globally unique tag
+ *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
  *      bits 53-54:    mbz, reserved for use by hardware
  *      bits 55-63:    group ID, currently unused and set to 0
  *
@@ -222,9 +219,9 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
  */
 static void
 intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
-                                  struct intel_engine_cs *engine)
+                                  struct intel_engine_cs *engine,
+                                  struct intel_context *ce)
 {
-       struct intel_context *ce = to_intel_context(ctx, engine);
        u64 desc;
 
        BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
@@ -237,6 +234,11 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
                                                                /* bits 12-31 */
        GEM_BUG_ON(desc & GENMASK_ULL(63, 32));
 
+       /*
+        * The following 32 bits are copied into the OA reports (dword 2).
+        * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
+        * anything below.
+        */
        if (INTEL_GEN(ctx->i915) >= 11) {
                GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
                desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
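
The packing documented above is easiest to see in isolation. As a rough illustration in plain C (the constant names below are made up stand-ins for the GEN8_CTX_* values, not the driver's actual helpers):

    #include <stdint.h>

    /* Illustrative shifts/masks mirroring the layout described in the comment. */
    #define CTX_FLAGS_MASK   ((1ull << 12) - 1)   /* bits  0-11: GEN8_CTX_* flags  */
    #define CTX_LRCA_MASK    0xfffff000u          /* bits 12-31: LRCA (4K aligned) */
    #define CTX_SW_ID_SHIFT  32                   /* bits 32-52: sw context id     */

    static uint64_t pack_lrc_descriptor(uint32_t flags, uint32_t lrca, uint32_t sw_id)
    {
            uint64_t desc;

            desc  = flags & CTX_FLAGS_MASK;             /* cached in ctx->desc_template */
            desc |= lrca & CTX_LRCA_MASK;               /* GGTT address of the context  */
            desc |= (uint64_t)sw_id << CTX_SW_ID_SHIFT; /* ctx->hw_id, seen by OA/GuC   */

            return desc;
    }
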
@@ -271,7 +273,7 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
 find_priolist:
        /* most positive priority is scheduled first, equal priorities fifo */
        rb = NULL;
-       parent = &execlists->queue.rb_node;
+       parent = &execlists->queue.rb_root.rb_node;
        while (*parent) {
                rb = *parent;
                p = to_priolist(rb);
@@ -309,10 +311,7 @@ find_priolist:
        p->priority = prio;
        INIT_LIST_HEAD(&p->requests);
        rb_link_node(&p->node, rb, parent);
-       rb_insert_color(&p->node, &execlists->queue);
-
-       if (first)
-               execlists->first = &p->node;
+       rb_insert_color_cached(&p->node, &execlists->queue, first);
 
        return p;
 }
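
The priolist tree has switched from an rb_root plus a separate execlists->first pointer to an rb_root_cached, which tracks the leftmost (highest-priority) node internally. A condensed sketch of that pattern with the kernel rbtree API (prio_node is an invented type, not the i915 priolist):

    #include <linux/rbtree.h>
    #include <linux/types.h>

    struct prio_node {
            struct rb_node node;
            int prio;
    };

    /* Insert, keeping the cached leftmost ("highest priority") pointer valid. */
    static void prio_insert(struct rb_root_cached *root, struct prio_node *pn)
    {
            struct rb_node **p = &root->rb_root.rb_node, *parent = NULL;
            bool leftmost = true;

            while (*p) {
                    parent = *p;
                    if (pn->prio > rb_entry(parent, struct prio_node, node)->prio) {
                            p = &parent->rb_left;
                    } else {
                            p = &parent->rb_right;
                            leftmost = false;
                    }
            }

            rb_link_node(&pn->node, parent, p);
            rb_insert_color_cached(&pn->node, root, leftmost);
    }

    /* Pop the highest-priority node without walking down from the root. */
    static struct prio_node *prio_pop(struct rb_root_cached *root)
    {
            struct rb_node *rb = rb_first_cached(root);

            if (!rb)
                    return NULL;

            rb_erase_cached(rb, root);
            return rb_entry(rb, struct prio_node, node);
    }

This is why execlists->first disappears throughout the patch: rb_first_cached() takes over its role.
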
@@ -345,6 +344,7 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
                        last_prio = rq_prio(rq);
                        p = lookup_priolist(engine, last_prio);
                }
+               GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
 
                GEM_BUG_ON(p->priority != rq_prio(rq));
                list_add(&rq->sched.link, &p->requests);
@@ -418,9 +418,9 @@ execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
 
 static u64 execlists_update_context(struct i915_request *rq)
 {
-       struct intel_context *ce = to_intel_context(rq->ctx, rq->engine);
+       struct intel_context *ce = rq->hw_context;
        struct i915_hw_ppgtt *ppgtt =
-               rq->ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
+               rq->gem_context->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
        u32 *reg_state = ce->lrc_reg_state;
 
        reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);
@@ -430,7 +430,7 @@ static u64 execlists_update_context(struct i915_request *rq)
         * PML4 is allocated during ppgtt init, so this is not needed
         * in 48-bit mode.
         */
-       if (ppgtt && !i915_vm_is_48bit(&ppgtt->base))
+       if (!i915_vm_is_48bit(&ppgtt->vm))
                execlists_update_context_pdps(ppgtt, reg_state);
 
        return ce->lrc_desc;
@@ -453,6 +453,16 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
        struct execlist_port *port = execlists->port;
        unsigned int n;
 
+       /*
+        * We can skip acquiring intel_runtime_pm_get() here as it was taken
+        * on our behalf by the request (see i915_gem_mark_busy()) and it will
+        * not be relinquished until the device is idle (see
+        * i915_gem_idle_work_handler()). As a precaution, we make sure
+        * that all ELSP are drained i.e. we have processed the CSB,
+        * before allowing ourselves to idle and calling intel_runtime_pm_put().
+        */
+       GEM_BUG_ON(!engine->i915->gt.awake);
+
        /*
         * ELSQ note: the submit queue is not cleared after being submitted
         * to the HW so we need to make sure we always clean it up. This is
@@ -495,14 +505,14 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
        execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
 }
 
-static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
+static bool ctx_single_port_submission(const struct intel_context *ce)
 {
        return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
-               i915_gem_context_force_single_submission(ctx));
+               i915_gem_context_force_single_submission(ce->gem_context));
 }
 
-static bool can_merge_ctx(const struct i915_gem_context *prev,
-                         const struct i915_gem_context *next)
+static bool can_merge_ctx(const struct intel_context *prev,
+                         const struct intel_context *next)
 {
        if (prev != next)
                return false;
@@ -532,11 +542,6 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
 
        GEM_BUG_ON(execlists->preempt_complete_status !=
                   upper_32_bits(ce->lrc_desc));
-       GEM_BUG_ON((ce->lrc_reg_state[CTX_CONTEXT_CONTROL + 1] &
-                   _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
-                                      CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT)) !=
-                  _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
-                                     CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
 
        /*
         * Switch to our empty preempt context so
@@ -552,11 +557,24 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
        if (execlists->ctrl_reg)
                writel(EL_CTRL_LOAD, execlists->ctrl_reg);
 
-       execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
-       execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
+       execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
+       execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
+}
+
+static void complete_preempt_context(struct intel_engine_execlists *execlists)
+{
+       GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));
+
+       if (inject_preempt_hang(execlists))
+               return;
+
+       execlists_cancel_port_requests(execlists);
+       __unwind_incomplete_requests(container_of(execlists,
+                                                 struct intel_engine_cs,
+                                                 execlists));
 }
 
-static bool __execlists_dequeue(struct intel_engine_cs *engine)
+static void execlists_dequeue(struct intel_engine_cs *engine)
 {
        struct intel_engine_execlists * const execlists = &engine->execlists;
        struct execlist_port *port = execlists->port;
@@ -566,9 +584,8 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
        struct rb_node *rb;
        bool submit = false;
 
-       lockdep_assert_held(&engine->timeline.lock);
-
-       /* Hardware submission is through 2 ports. Conceptually each port
+       /*
+        * Hardware submission is through 2 ports. Conceptually each port
         * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
         * static for a context, and unique to each, so we only execute
         * requests belonging to a single context from each ring. RING_HEAD
@@ -589,9 +606,6 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
         * and context switches) submission.
         */
 
-       rb = execlists->first;
-       GEM_BUG_ON(rb_first(&execlists->queue) != rb);
-
        if (last) {
                /*
                 * Don't resubmit or switch until all outstanding
@@ -602,8 +616,6 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
                GEM_BUG_ON(!execlists_is_active(execlists,
                                                EXECLISTS_ACTIVE_USER));
                GEM_BUG_ON(!port_count(&port[0]));
-               if (port_count(&port[0]) > 1)
-                       return false;
 
                /*
                 * If we write to ELSP a second time before the HW has had
@@ -613,11 +625,11 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
                 * the HW to indicate that it has had a chance to respond.
                 */
                if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
-                       return false;
+                       return;
 
                if (need_preempt(engine, last, execlists->queue_priority)) {
                        inject_preempt_context(engine);
-                       return false;
+                       return;
                }
 
                /*
@@ -642,7 +654,7 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
                 * priorities of the ports haven't been switch.
                 */
                if (port_count(&port[1]))
-                       return false;
+                       return;
 
                /*
                 * WaIdleLiteRestore:bdw,skl
@@ -655,7 +667,7 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
                last->tail = last->wa_tail;
        }
 
-       while (rb) {
+       while ((rb = rb_first_cached(&execlists->queue))) {
                struct i915_priolist *p = to_priolist(rb);
                struct i915_request *rq, *rn;
 
@@ -671,7 +683,8 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
                         * second request, and so we never need to tell the
                         * hardware about the first.
                         */
-                       if (last && !can_merge_ctx(rq->ctx, last->ctx)) {
+                       if (last &&
+                           !can_merge_ctx(rq->hw_context, last->hw_context)) {
                                /*
                                 * If we are on the second port and cannot
                                 * combine this request with the last, then we
@@ -690,14 +703,14 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
                                 * the same context (even though a different
                                 * request) to the second port.
                                 */
-                               if (ctx_single_port_submission(last->ctx) ||
-                                   ctx_single_port_submission(rq->ctx)) {
+                               if (ctx_single_port_submission(last->hw_context) ||
+                                   ctx_single_port_submission(rq->hw_context)) {
                                        __list_del_many(&p->requests,
                                                        &rq->sched.link);
                                        goto done;
                                }
 
-                               GEM_BUG_ON(last->ctx == rq->ctx);
+                               GEM_BUG_ON(last->hw_context == rq->hw_context);
 
                                if (submit)
                                        port_assign(port, last);
@@ -713,8 +726,7 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
                        submit = true;
                }
 
-               rb = rb_next(rb);
-               rb_erase(&p->node, &execlists->queue);
+               rb_erase_cached(&p->node, &execlists->queue);
                INIT_LIST_HEAD(&p->requests);
                if (p->priority != I915_PRIORITY_NORMAL)
                        kmem_cache_free(engine->i915->priorities, p);
@@ -740,35 +752,23 @@ done:
        execlists->queue_priority =
                port != execlists->port ? rq_prio(last) : INT_MIN;
 
-       execlists->first = rb;
-       if (submit)
+       if (submit) {
                port_assign(port, last);
+               execlists_submit_ports(engine);
+       }
 
        /* We must always keep the beast fed if we have work piled up */
-       GEM_BUG_ON(execlists->first && !port_isset(execlists->port));
+       GEM_BUG_ON(rb_first_cached(&execlists->queue) &&
+                  !port_isset(execlists->port));
 
        /* Re-evaluate the executing context setup after each preemptive kick */
        if (last)
                execlists_user_begin(execlists, execlists->port);
 
-       return submit;
-}
-
-static void execlists_dequeue(struct intel_engine_cs *engine)
-{
-       struct intel_engine_execlists * const execlists = &engine->execlists;
-       unsigned long flags;
-       bool submit;
-
-       spin_lock_irqsave(&engine->timeline.lock, flags);
-       submit = __execlists_dequeue(engine);
-       spin_unlock_irqrestore(&engine->timeline.lock, flags);
-
-       if (submit)
-               execlists_submit_ports(engine);
-
-       GEM_BUG_ON(port_isset(execlists->port) &&
-                  !execlists_is_active(execlists, EXECLISTS_ACTIVE_USER));
+       /* If the engine is now idle, so should be the flag; and vice versa. */
+       GEM_BUG_ON(execlists_is_active(&engine->execlists,
+                                      EXECLISTS_ACTIVE_USER) ==
+                  !port_isset(engine->execlists.port));
 }
 
 void
@@ -799,82 +799,27 @@ execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
                port++;
        }
 
-       execlists_user_end(execlists);
+       execlists_clear_all_active(execlists);
 }
 
-static void clear_gtiir(struct intel_engine_cs *engine)
+static void reset_csb_pointers(struct intel_engine_execlists *execlists)
 {
-       struct drm_i915_private *dev_priv = engine->i915;
-       int i;
-
        /*
-        * Clear any pending interrupt state.
-        *
-        * We do it twice out of paranoia that some of the IIR are
-        * double buffered, and so if we only reset it once there may
-        * still be an interrupt pending.
+        * After a reset, the HW starts writing into CSB entry [0]. We
+        * therefore have to set our HEAD pointer back one entry so that
+        * the *first* entry we check is entry 0. To complicate this further,
+        * as we don't wait for the first interrupt after reset, we have to
+        * fake the HW write to point back to the last entry so that our
+        * inline comparison of our cached head position against the last HW
+        * write works even before the first interrupt.
         */
-       if (INTEL_GEN(dev_priv) >= 11) {
-               static const struct {
-                       u8 bank;
-                       u8 bit;
-               } gen11_gtiir[] = {
-                       [RCS] = {0, GEN11_RCS0},
-                       [BCS] = {0, GEN11_BCS},
-                       [_VCS(0)] = {1, GEN11_VCS(0)},
-                       [_VCS(1)] = {1, GEN11_VCS(1)},
-                       [_VCS(2)] = {1, GEN11_VCS(2)},
-                       [_VCS(3)] = {1, GEN11_VCS(3)},
-                       [_VECS(0)] = {1, GEN11_VECS(0)},
-                       [_VECS(1)] = {1, GEN11_VECS(1)},
-               };
-               unsigned long irqflags;
-
-               GEM_BUG_ON(engine->id >= ARRAY_SIZE(gen11_gtiir));
-
-               spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
-               for (i = 0; i < 2; i++) {
-                       gen11_reset_one_iir(dev_priv,
-                                           gen11_gtiir[engine->id].bank,
-                                           gen11_gtiir[engine->id].bit);
-               }
-               spin_unlock_irqrestore(&dev_priv->irq_lock, irqflags);
-       } else {
-               static const u8 gtiir[] = {
-                       [RCS]  = 0,
-                       [BCS]  = 0,
-                       [VCS]  = 1,
-                       [VCS2] = 1,
-                       [VECS] = 3,
-               };
-
-               GEM_BUG_ON(engine->id >= ARRAY_SIZE(gtiir));
-
-               for (i = 0; i < 2; i++) {
-                       I915_WRITE(GEN8_GT_IIR(gtiir[engine->id]),
-                                  engine->irq_keep_mask);
-                       POSTING_READ(GEN8_GT_IIR(gtiir[engine->id]));
-               }
-               GEM_BUG_ON(I915_READ(GEN8_GT_IIR(gtiir[engine->id])) &
-                          engine->irq_keep_mask);
-       }
+       execlists->csb_head = execlists->csb_write_reset;
+       WRITE_ONCE(*execlists->csb_write, execlists->csb_write_reset);
 }
 
-static void reset_irq(struct intel_engine_cs *engine)
+static void nop_submission_tasklet(unsigned long data)
 {
-       /* Mark all CS interrupts as complete */
-       smp_store_mb(engine->execlists.active, 0);
-       synchronize_hardirq(engine->i915->drm.irq);
-
-       clear_gtiir(engine);
-
-       /*
-        * The port is checked prior to scheduling a tasklet, but
-        * just in case we have suspended the tasklet to do the
-        * wedging make sure that when it wakes, it decides there
-        * is no work to do by clearing the irq_posted bit.
-        */
-       clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
+       /* The driver is wedged; don't process any more events. */
 }
 
 static void execlists_cancel_requests(struct intel_engine_cs *engine)
@@ -901,13 +846,11 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
         * submission's irq state, we also wish to remind ourselves that
         * it is irq state.)
         */
-       local_irq_save(flags);
+       spin_lock_irqsave(&engine->timeline.lock, flags);
 
        /* Cancel the requests on the HW and clear the ELSP tracker. */
        execlists_cancel_port_requests(execlists);
-       reset_irq(engine);
-
-       spin_lock(&engine->timeline.lock);
+       execlists_user_end(execlists);
 
        /* Mark all executing requests as skipped. */
        list_for_each_entry(rq, &engine->timeline.requests, link) {
@@ -917,8 +860,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
        }
 
        /* Flush the queued requests to the timeline list (for retiring). */
-       rb = execlists->first;
-       while (rb) {
+       while ((rb = rb_first_cached(&execlists->queue))) {
                struct i915_priolist *p = to_priolist(rb);
 
                list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
@@ -928,8 +870,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
                        __i915_request_submit(rq);
                }
 
-               rb = rb_next(rb);
-               rb_erase(&p->node, &execlists->queue);
+               rb_erase_cached(&p->node, &execlists->queue);
                INIT_LIST_HEAD(&p->requests);
                if (p->priority != I915_PRIORITY_NORMAL)
                        kmem_cache_free(engine->i915->priorities, p);
@@ -938,221 +879,198 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
        /* Remaining _unready_ requests will be nop'ed when submitted */
 
        execlists->queue_priority = INT_MIN;
-       execlists->queue = RB_ROOT;
-       execlists->first = NULL;
+       execlists->queue = RB_ROOT_CACHED;
        GEM_BUG_ON(port_isset(execlists->port));
 
-       spin_unlock(&engine->timeline.lock);
+       GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
+       execlists->tasklet.func = nop_submission_tasklet;
 
-       local_irq_restore(flags);
+       spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
-/*
- * Check the unread Context Status Buffers and manage the submission of new
- * contexts to the ELSP accordingly.
- */
-static void execlists_submission_tasklet(unsigned long data)
+static inline bool
+reset_in_progress(const struct intel_engine_execlists *execlists)
+{
+       return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
+}
+
+static void process_csb(struct intel_engine_cs *engine)
 {
-       struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
        struct intel_engine_execlists * const execlists = &engine->execlists;
        struct execlist_port *port = execlists->port;
-       struct drm_i915_private *dev_priv = engine->i915;
-       bool fw = false;
+       const u32 * const buf = execlists->csb_status;
+       u8 head, tail;
 
        /*
-        * We can skip acquiring intel_runtime_pm_get() here as it was taken
-        * on our behalf by the request (see i915_gem_mark_busy()) and it will
-        * not be relinquished until the device is idle (see
-        * i915_gem_idle_work_handler()). As a precaution, we make sure
-        * that all ELSP are drained i.e. we have processed the CSB,
-        * before allowing ourselves to idle and calling intel_runtime_pm_put().
+        * Note that csb_write, csb_status may be either in HWSP or mmio.
+        * When reading from the csb_write mmio register, we have to be
+        * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
+        * the low 4 bits. As it happens we know the next 4 bits are always
+        * zero and so we can simply mask off the low u8 of the register
+        * and treat it identically to reading from the HWSP (without having
+        * to use explicit shifting and masking, and probably bifurcating
+        * the code to handle the legacy mmio read).
         */
-       GEM_BUG_ON(!dev_priv->gt.awake);
+       head = execlists->csb_head;
+       tail = READ_ONCE(*execlists->csb_write);
+       GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
+       if (unlikely(head == tail))
+               return;
 
        /*
-        * Prefer doing test_and_clear_bit() as a two stage operation to avoid
-        * imposing the cost of a locked atomic transaction when submitting a
-        * new request (outside of the context-switch interrupt).
+        * Hopefully paired with a wmb() in HW!
+        *
+        * We must complete the read of the write pointer before any reads
+        * from the CSB, so that we do not see stale values. Without an rmb
+        * (lfence) the HW may speculatively perform the CSB[] reads *before*
+        * we perform the READ_ONCE(*csb_write).
         */
-       while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
-               /* The HWSP contains a (cacheable) mirror of the CSB */
-               const u32 *buf =
-                       &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
-               unsigned int head, tail;
+       rmb();
 
-               if (unlikely(execlists->csb_use_mmio)) {
-                       buf = (u32 * __force)
-                               (dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
-                       execlists->csb_head = -1; /* force mmio read of CSB ptrs */
-               }
+       do {
+               struct i915_request *rq;
+               unsigned int status;
+               unsigned int count;
 
-               /* Clear before reading to catch new interrupts */
-               clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-               smp_mb__after_atomic();
+               if (++head == GEN8_CSB_ENTRIES)
+                       head = 0;
 
-               if (unlikely(execlists->csb_head == -1)) { /* following a reset */
-                       if (!fw) {
-                               intel_uncore_forcewake_get(dev_priv,
-                                                          execlists->fw_domains);
-                               fw = true;
-                       }
+               /*
+                * We are flying near dragons again.
+                *
+                * We hold a reference to the request in execlist_port[]
+                * but no more than that. We are operating in softirq
+                * context and so cannot hold any mutex or sleep. That
+                * prevents us stopping the requests we are processing
+                * in port[] from being retired simultaneously (the
+                * breadcrumb will be complete before we see the
+                * context-switch). As we only hold the reference to the
+                * request, any pointer chasing underneath the request
+                * is subject to a potential use-after-free. Thus we
+                * store all of the bookkeeping within port[] as
+                * required, and avoid using unguarded pointers beneath
+                * request itself. The same applies to the atomic
+                * status notifier.
+                */
 
-                       head = readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
-                       tail = GEN8_CSB_WRITE_PTR(head);
-                       head = GEN8_CSB_READ_PTR(head);
-                       execlists->csb_head = head;
-               } else {
-                       const int write_idx =
-                               intel_hws_csb_write_index(dev_priv) -
-                               I915_HWS_CSB_BUF0_INDEX;
+               GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
+                         engine->name, head,
+                         buf[2 * head + 0], buf[2 * head + 1],
+                         execlists->active);
+
+               status = buf[2 * head];
+               if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
+                             GEN8_CTX_STATUS_PREEMPTED))
+                       execlists_set_active(execlists,
+                                            EXECLISTS_ACTIVE_HWACK);
+               if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
+                       execlists_clear_active(execlists,
+                                              EXECLISTS_ACTIVE_HWACK);
+
+               if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
+                       continue;
+
+               /* We should never get a COMPLETED | IDLE_ACTIVE! */
+               GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
 
-                       head = execlists->csb_head;
-                       tail = READ_ONCE(buf[write_idx]);
-                       rmb(); /* Hopefully paired with a wmb() in HW */
+               if (status & GEN8_CTX_STATUS_COMPLETE &&
+                   buf[2*head + 1] == execlists->preempt_complete_status) {
+                       GEM_TRACE("%s preempt-idle\n", engine->name);
+                       complete_preempt_context(execlists);
+                       continue;
                }
-               GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
-                         engine->name,
-                         head, GEN8_CSB_READ_PTR(readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
-                         tail, GEN8_CSB_WRITE_PTR(readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
 
-               while (head != tail) {
-                       struct i915_request *rq;
-                       unsigned int status;
-                       unsigned int count;
+               if (status & GEN8_CTX_STATUS_PREEMPTED &&
+                   execlists_is_active(execlists,
+                                       EXECLISTS_ACTIVE_PREEMPT))
+                       continue;
 
-                       if (++head == GEN8_CSB_ENTRIES)
-                               head = 0;
+               GEM_BUG_ON(!execlists_is_active(execlists,
+                                               EXECLISTS_ACTIVE_USER));
 
-                       /* We are flying near dragons again.
-                        *
-                        * We hold a reference to the request in execlist_port[]
-                        * but no more than that. We are operating in softirq
-                        * context and so cannot hold any mutex or sleep. That
-                        * prevents us stopping the requests we are processing
-                        * in port[] from being retired simultaneously (the
-                        * breadcrumb will be complete before we see the
-                        * context-switch). As we only hold the reference to the
-                        * request, any pointer chasing underneath the request
-                        * is subject to a potential use-after-free. Thus we
-                        * store all of the bookkeeping within port[] as
-                        * required, and avoid using unguarded pointers beneath
-                        * request itself. The same applies to the atomic
-                        * status notifier.
+               rq = port_unpack(port, &count);
+               GEM_TRACE("%s out[0]: ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
+                         engine->name,
+                         port->context_id, count,
+                         rq ? rq->global_seqno : 0,
+                         rq ? rq->fence.context : 0,
+                         rq ? rq->fence.seqno : 0,
+                         intel_engine_get_seqno(engine),
+                         rq ? rq_prio(rq) : 0);
+
+               /* Check the context/desc id for this event matches */
+               GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
+
+               GEM_BUG_ON(count == 0);
+               if (--count == 0) {
+                       /*
+                        * On the final event corresponding to the
+                        * submission of this context, we expect either
+                        * an element-switch event or a completion
+                        * event (and on completion, the active-idle
+                        * marker). No more preemptions, lite-restore
+                        * or otherwise.
                         */
+                       GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
+                       GEM_BUG_ON(port_isset(&port[1]) &&
+                                  !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
+                       GEM_BUG_ON(!port_isset(&port[1]) &&
+                                  !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
 
-                       status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
-                       GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
-                                 engine->name, head,
-                                 status, buf[2*head + 1],
-                                 execlists->active);
-
-                       if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
-                                     GEN8_CTX_STATUS_PREEMPTED))
-                               execlists_set_active(execlists,
-                                                    EXECLISTS_ACTIVE_HWACK);
-                       if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
-                               execlists_clear_active(execlists,
-                                                      EXECLISTS_ACTIVE_HWACK);
-
-                       if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
-                               continue;
-
-                       /* We should never get a COMPLETED | IDLE_ACTIVE! */
-                       GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
-
-                       if (status & GEN8_CTX_STATUS_COMPLETE &&
-                           buf[2*head + 1] == execlists->preempt_complete_status) {
-                               GEM_TRACE("%s preempt-idle\n", engine->name);
-
-                               execlists_cancel_port_requests(execlists);
-                               execlists_unwind_incomplete_requests(execlists);
-
-                               GEM_BUG_ON(!execlists_is_active(execlists,
-                                                               EXECLISTS_ACTIVE_PREEMPT));
-                               execlists_clear_active(execlists,
-                                                      EXECLISTS_ACTIVE_PREEMPT);
-                               continue;
-                       }
-
-                       if (status & GEN8_CTX_STATUS_PREEMPTED &&
-                           execlists_is_active(execlists,
-                                               EXECLISTS_ACTIVE_PREEMPT))
-                               continue;
-
-                       GEM_BUG_ON(!execlists_is_active(execlists,
-                                                       EXECLISTS_ACTIVE_USER));
-
-                       rq = port_unpack(port, &count);
-                       GEM_TRACE("%s out[0]: ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
-                                 engine->name,
-                                 port->context_id, count,
-                                 rq ? rq->global_seqno : 0,
-                                 rq ? rq->fence.context : 0,
-                                 rq ? rq->fence.seqno : 0,
-                                 intel_engine_get_seqno(engine),
-                                 rq ? rq_prio(rq) : 0);
+                       /*
+                        * We rely on the hardware being strongly
+                        * ordered, that the breadcrumb write is
+                        * coherent (visible from the CPU) before the
+                        * user interrupt and CSB is processed.
+                        */
+                       GEM_BUG_ON(!i915_request_completed(rq));
 
-                       /* Check the context/desc id for this event matches */
-                       GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
+                       execlists_context_schedule_out(rq,
+                                                      INTEL_CONTEXT_SCHEDULE_OUT);
+                       i915_request_put(rq);
 
-                       GEM_BUG_ON(count == 0);
-                       if (--count == 0) {
-                               /*
-                                * On the final event corresponding to the
-                                * submission of this context, we expect either
-                                * an element-switch event or a completion
-                                * event (and on completion, the active-idle
-                                * marker). No more preemptions, lite-restore
-                                * or otherwise.
-                                */
-                               GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
-                               GEM_BUG_ON(port_isset(&port[1]) &&
-                                          !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
-                               GEM_BUG_ON(!port_isset(&port[1]) &&
-                                          !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
+                       GEM_TRACE("%s completed ctx=%d\n",
+                                 engine->name, port->context_id);
 
-                               /*
-                                * We rely on the hardware being strongly
-                                * ordered, that the breadcrumb write is
-                                * coherent (visible from the CPU) before the
-                                * user interrupt and CSB is processed.
-                                */
-                               GEM_BUG_ON(!i915_request_completed(rq));
-
-                               execlists_context_schedule_out(rq,
-                                                              INTEL_CONTEXT_SCHEDULE_OUT);
-                               i915_request_put(rq);
-
-                               GEM_TRACE("%s completed ctx=%d\n",
-                                         engine->name, port->context_id);
-
-                               port = execlists_port_complete(execlists, port);
-                               if (port_isset(port))
-                                       execlists_user_begin(execlists, port);
-                               else
-                                       execlists_user_end(execlists);
-                       } else {
-                               port_set(port, port_pack(rq, count));
-                       }
+                       port = execlists_port_complete(execlists, port);
+                       if (port_isset(port))
+                               execlists_user_begin(execlists, port);
+                       else
+                               execlists_user_end(execlists);
+               } else {
+                       port_set(port, port_pack(rq, count));
                }
+       } while (head != tail);
 
-               if (head != execlists->csb_head) {
-                       execlists->csb_head = head;
-                       writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
-                              dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
-               }
-       }
+       execlists->csb_head = head;
+}
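
process_csb() is in essence a single-producer/single-consumer ring: the hardware publishes a write pointer, the driver samples it once, orders that read against the entry reads with rmb(), and then chases it with a cached head. A stripped-down sketch of that consume loop (hw_ring and the handler are invented names; GEN8 CSBs hold six two-dword entries):

    #include <linux/types.h>
    #include <linux/compiler.h>
    #include <asm/barrier.h>

    #define NUM_ENTRIES 6                      /* GEN8_CSB_ENTRIES on these parts */

    struct hw_ring {
            u32 entries[NUM_ENTRIES * 2];      /* (status, context-id) pairs */
            const u32 *write;                  /* pointer published by the HW */
            u8 head;                           /* our cached read position    */
    };

    static void consume(struct hw_ring *ring,
                        void (*handle)(u32 status, u32 context_id))
    {
            u8 head = ring->head;
            u8 tail = READ_ONCE(*ring->write);

            if (head == tail)
                    return;

            rmb();  /* read the tail before any of the entries it guards */

            do {
                    if (++head == NUM_ENTRIES)
                            head = 0;

                    handle(ring->entries[2 * head], ring->entries[2 * head + 1]);
            } while (head != tail);

            ring->head = head;
    }

reset_csb_pointers() above plays into the same scheme: by faking the write pointer back to the reset value it makes head == tail (ring empty), so the hardware's first post-reset write to entry 0 shows up as exactly one new event.
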
+
+static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
+{
+       lockdep_assert_held(&engine->timeline.lock);
 
-       if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT))
+       process_csb(engine);
+       if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
                execlists_dequeue(engine);
+}
+
+/*
+ * Check the unread Context Status Buffers and manage the submission of new
+ * contexts to the ELSP accordingly.
+ */
+static void execlists_submission_tasklet(unsigned long data)
+{
+       struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
+       unsigned long flags;
 
-       if (fw)
-               intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
+       GEM_TRACE("%s awake?=%d, active=%x\n",
+                 engine->name,
+                 engine->i915->gt.awake,
+                 engine->execlists.active);
 
-       /* If the engine is now idle, so should be the flag; and vice versa. */
-       GEM_BUG_ON(execlists_is_active(&engine->execlists,
-                                      EXECLISTS_ACTIVE_USER) ==
-                  !port_isset(engine->execlists.port));
+       spin_lock_irqsave(&engine->timeline.lock, flags);
+       __execlists_submission_tasklet(engine);
+       spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
 static void queue_request(struct intel_engine_cs *engine,
@@ -1163,16 +1081,30 @@ static void queue_request(struct intel_engine_cs *engine,
                      &lookup_priolist(engine, prio)->requests);
 }
 
-static void __submit_queue(struct intel_engine_cs *engine, int prio)
+static void __update_queue(struct intel_engine_cs *engine, int prio)
 {
        engine->execlists.queue_priority = prio;
-       tasklet_hi_schedule(&engine->execlists.tasklet);
+}
+
+static void __submit_queue_imm(struct intel_engine_cs *engine)
+{
+       struct intel_engine_execlists * const execlists = &engine->execlists;
+
+       if (reset_in_progress(execlists))
+               return; /* defer until we restart the engine following reset */
+
+       if (execlists->tasklet.func == execlists_submission_tasklet)
+               __execlists_submission_tasklet(engine);
+       else
+               tasklet_hi_schedule(&execlists->tasklet);
 }
 
 static void submit_queue(struct intel_engine_cs *engine, int prio)
 {
-       if (prio > engine->execlists.queue_priority)
-               __submit_queue(engine, prio);
+       if (prio > engine->execlists.queue_priority) {
+               __update_queue(engine, prio);
+               __submit_queue_imm(engine);
+       }
 }
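
submit_queue() can now bypass the softirq entirely: when no reset is in flight and the live handler is still the normal submission tasklet, the work runs inline under the timeline lock; otherwise it is kicked to tasklet_hi_schedule() as before. A minimal sketch of wiring an engine to such a tasklet (my_engine/my_* are illustrative names; the tasklet API is the kernel's of this era):

    #include <linux/interrupt.h>
    #include <linux/types.h>

    struct my_engine {
            struct tasklet_struct tasklet;
            /* ... submission state ... */
    };

    static void my_submission_tasklet(unsigned long data)
    {
            struct my_engine *engine = (struct my_engine *)data;

            /* process CSB events and dequeue the next requests for @engine */
            (void)engine;
    }

    static void my_engine_init(struct my_engine *engine)
    {
            tasklet_init(&engine->tasklet,
                         my_submission_tasklet, (unsigned long)engine);
    }

    static void my_submit(struct my_engine *engine, bool direct_ok)
    {
            if (direct_ok)
                    engine->tasklet.func(engine->tasklet.data); /* run inline     */
            else
                    tasklet_hi_schedule(&engine->tasklet);      /* defer to softirq */
    }
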
 
 static void execlists_submit_request(struct i915_request *request)
@@ -1184,11 +1116,12 @@ static void execlists_submit_request(struct i915_request *request)
        spin_lock_irqsave(&engine->timeline.lock, flags);
 
        queue_request(engine, &request->sched, rq_prio(request));
-       submit_queue(engine, rq_prio(request));
 
-       GEM_BUG_ON(!engine->execlists.first);
+       GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
        GEM_BUG_ON(list_empty(&request->sched.link));
 
+       submit_queue(engine, rq_prio(request));
+
        spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
@@ -1315,13 +1248,42 @@ static void execlists_schedule(struct i915_request *request,
                }
 
                if (prio > engine->execlists.queue_priority &&
-                   i915_sw_fence_done(&sched_to_request(node)->submit))
-                       __submit_queue(engine, prio);
+                   i915_sw_fence_done(&sched_to_request(node)->submit)) {
+                       /* defer submission until after all of our updates */
+                       __update_queue(engine, prio);
+                       tasklet_hi_schedule(&engine->execlists.tasklet);
+               }
        }
 
        spin_unlock_irq(&engine->timeline.lock);
 }
 
+static void execlists_context_destroy(struct intel_context *ce)
+{
+       GEM_BUG_ON(ce->pin_count);
+
+       if (!ce->state)
+               return;
+
+       intel_ring_free(ce->ring);
+
+       GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj));
+       i915_gem_object_put(ce->state->obj);
+}
+
+static void execlists_context_unpin(struct intel_context *ce)
+{
+       i915_gem_context_unpin_hw_id(ce->gem_context);
+
+       intel_ring_unpin(ce->ring);
+
+       ce->state->obj->pin_global--;
+       i915_gem_object_unpin_map(ce->state->obj);
+       i915_vma_unpin(ce->state);
+
+       i915_gem_context_put(ce->gem_context);
+}
+
 static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
 {
        unsigned int flags;
@@ -1333,33 +1295,26 @@ static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
         * on an active context (which by nature is already on the GPU).
         */
        if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
-               err = i915_gem_object_set_to_gtt_domain(vma->obj, true);
+               err = i915_gem_object_set_to_wc_domain(vma->obj, true);
                if (err)
                        return err;
        }
 
        flags = PIN_GLOBAL | PIN_HIGH;
-       if (ctx->ggtt_offset_bias)
-               flags |= PIN_OFFSET_BIAS | ctx->ggtt_offset_bias;
+       flags |= PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma);
 
-       return i915_vma_pin(vma, 0, GEN8_LR_CONTEXT_ALIGN, flags);
+       return i915_vma_pin(vma, 0, 0, flags);
 }
 
-static struct intel_ring *
-execlists_context_pin(struct intel_engine_cs *engine,
-                     struct i915_gem_context *ctx)
+static struct intel_context *
+__execlists_context_pin(struct intel_engine_cs *engine,
+                       struct i915_gem_context *ctx,
+                       struct intel_context *ce)
 {
-       struct intel_context *ce = to_intel_context(ctx, engine);
        void *vaddr;
        int ret;
 
-       lockdep_assert_held(&ctx->i915->drm.struct_mutex);
-
-       if (likely(ce->pin_count++))
-               goto out;
-       GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
-
-       ret = execlists_context_deferred_alloc(ctx, engine);
+       ret = execlists_context_deferred_alloc(ctx, engine, ce);
        if (ret)
                goto err;
        GEM_BUG_ON(!ce->state);
@@ -1368,28 +1323,38 @@ execlists_context_pin(struct intel_engine_cs *engine,
        if (ret)
                goto err;
 
-       vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
+       vaddr = i915_gem_object_pin_map(ce->state->obj,
+                                       i915_coherent_map_type(ctx->i915) |
+                                       I915_MAP_OVERRIDE);
        if (IS_ERR(vaddr)) {
                ret = PTR_ERR(vaddr);
                goto unpin_vma;
        }
 
-       ret = intel_ring_pin(ce->ring, ctx->i915, ctx->ggtt_offset_bias);
+       ret = intel_ring_pin(ce->ring);
        if (ret)
                goto unpin_map;
 
-       intel_lr_context_descriptor_update(ctx, engine);
+       ret = i915_gem_context_pin_hw_id(ctx);
+       if (ret)
+               goto unpin_ring;
+
+       intel_lr_context_descriptor_update(ctx, engine, ce);
+
+       GEM_BUG_ON(!intel_ring_offset_valid(ce->ring, ce->ring->head));
 
        ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
        ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
                i915_ggtt_offset(ce->ring->vma);
-       ce->lrc_reg_state[CTX_RING_HEAD+1] = ce->ring->head;
+       ce->lrc_reg_state[CTX_RING_HEAD + 1] = ce->ring->head;
+       ce->lrc_reg_state[CTX_RING_TAIL + 1] = ce->ring->tail;
 
        ce->state->obj->pin_global++;
        i915_gem_context_get(ctx);
-out:
-       return ce->ring;
+       return ce;
 
+unpin_ring:
+       intel_ring_unpin(ce->ring);
 unpin_map:
        i915_gem_object_unpin_map(ce->state->obj);
 unpin_vma:
@@ -1399,33 +1364,34 @@ err:
        return ERR_PTR(ret);
 }
 
-static void execlists_context_unpin(struct intel_engine_cs *engine,
-                                   struct i915_gem_context *ctx)
+static const struct intel_context_ops execlists_context_ops = {
+       .unpin = execlists_context_unpin,
+       .destroy = execlists_context_destroy,
+};
+
+static struct intel_context *
+execlists_context_pin(struct intel_engine_cs *engine,
+                     struct i915_gem_context *ctx)
 {
        struct intel_context *ce = to_intel_context(ctx, engine);
 
        lockdep_assert_held(&ctx->i915->drm.struct_mutex);
-       GEM_BUG_ON(ce->pin_count == 0);
-
-       if (--ce->pin_count)
-               return;
+       GEM_BUG_ON(!(ctx->ppgtt ?: ctx->i915->mm.aliasing_ppgtt));
 
-       intel_ring_unpin(ce->ring);
+       if (likely(ce->pin_count++))
+               return ce;
+       GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
 
-       ce->state->obj->pin_global--;
-       i915_gem_object_unpin_map(ce->state->obj);
-       i915_vma_unpin(ce->state);
+       ce->ops = &execlists_context_ops;
 
-       i915_gem_context_put(ctx);
+       return __execlists_context_pin(engine, ctx, ce);
 }
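
Pinning is now split: execlists_context_pin() handles only the refcount fast path and installs ce->ops, while __execlists_context_pin() does the expensive 0->1 work (allocation, mapping, ring pin, hw_id). The shape of that lazy-pin pattern, reduced to its essentials (ctx_state and the *_hw helpers are invented stand-ins):

    struct ctx_state {
            unsigned int pin_count;
            void *mapping;
    };

    /* Stand-ins for the expensive first-pin setup / last-unpin teardown. */
    static int ctx_pin_hw(struct ctx_state *ce) { ce->mapping = ce; return 0; }
    static void ctx_unpin_hw(struct ctx_state *ce) { ce->mapping = NULL; }

    static int ctx_pin(struct ctx_state *ce)
    {
            int err;

            if (ce->pin_count++)
                    return 0;               /* already pinned: just take a ref */

            err = ctx_pin_hw(ce);           /* 0 -> 1: do the real work */
            if (err)
                    ce->pin_count = 0;      /* drop the speculative ref on failure */

            return err;
    }

    static void ctx_unpin(struct ctx_state *ce)
    {
            if (--ce->pin_count)
                    return;                 /* other users still hold a pin */

            ctx_unpin_hw(ce);               /* last unpin: release everything */
    }
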
 
 static int execlists_request_alloc(struct i915_request *request)
 {
-       struct intel_context *ce =
-               to_intel_context(request->ctx, request->engine);
        int ret;
 
-       GEM_BUG_ON(!ce->pin_count);
+       GEM_BUG_ON(!request->hw_context->pin_count);
 
        /* Flush enough space to reduce the likelihood of waiting after
         * we start building the request - in which case we will just
@@ -1538,29 +1504,56 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
        return batch;
 }
 
+struct lri {
+       i915_reg_t reg;
+       u32 value;
+};
+
+static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
+{
+       GEM_BUG_ON(!count || count > 63);
+
+       *batch++ = MI_LOAD_REGISTER_IMM(count);
+       do {
+               *batch++ = i915_mmio_reg_offset(lri->reg);
+               *batch++ = lri->value;
+       } while (lri++, --count);
+       *batch++ = MI_NOOP;
+
+       return batch;
+}
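
The chicken-register writes that follow are emitted from the lri[] table by emit_lri() above. Each value uses the masked-write convention of these registers: the upper 16 bits of the written dword select which of the lower 16 bits actually change. A small sketch of that encoding (helper names are illustrative; it mirrors what the driver's _MASKED_FIELD and _MASKED_BIT_* macros expand to):

    #include <stdint.h>

    /* Masked registers: bits 31:16 are a write-enable mask for bits 15:0. */
    static inline uint32_t masked_field(uint16_t mask, uint16_t value)
    {
            return ((uint32_t)mask << 16) | value;
    }

    /* Set @bit, leaving every other bit of the register untouched. */
    static inline uint32_t masked_bit_enable(uint16_t bit)
    {
            return masked_field(bit, bit);
    }

    /* Clear @bit, leaving every other bit of the register untouched. */
    static inline uint32_t masked_bit_disable(uint16_t bit)
    {
            return masked_field(bit, 0);
    }
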
+
 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 {
+       static const struct lri lri[] = {
+               /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
+               {
+                       COMMON_SLICE_CHICKEN2,
+                       __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
+                                      0),
+               },
+
+               /* BSpec: 11391 */
+               {
+                       FF_SLICE_CHICKEN,
+                       __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
+                                      FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
+               },
+
+               /* BSpec: 11299 */
+               {
+                       _3D_CHICKEN3,
+                       __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
+                                      _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
+               }
+       };
+
        *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
 
        /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
        batch = gen8_emit_flush_coherentl3_wa(engine, batch);
 
-       *batch++ = MI_LOAD_REGISTER_IMM(3);
-
-       /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
-       *batch++ = i915_mmio_reg_offset(COMMON_SLICE_CHICKEN2);
-       *batch++ = _MASKED_BIT_DISABLE(
-                       GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE);
-
-       /* BSpec: 11391 */
-       *batch++ = i915_mmio_reg_offset(FF_SLICE_CHICKEN);
-       *batch++ = _MASKED_BIT_ENABLE(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX);
-
-       /* BSpec: 11299 */
-       *batch++ = i915_mmio_reg_offset(_3D_CHICKEN3);
-       *batch++ = _MASKED_BIT_ENABLE(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX);
-
-       *batch++ = MI_NOOP;
+       batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
 
        /* WaClearSlmSpaceAtContextSwitch:kbl */
        /* Actual scratch location is at 128 bytes offset */
@@ -1652,13 +1645,13 @@ static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
        if (IS_ERR(obj))
                return PTR_ERR(obj);
 
-       vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
+       vma = i915_vma_instance(obj, &engine->i915->ggtt.vm, NULL);
        if (IS_ERR(vma)) {
                err = PTR_ERR(vma);
                goto err;
        }
 
-       err = i915_vma_pin(vma, 0, PAGE_SIZE, PIN_GLOBAL | PIN_HIGH);
+       err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
        if (err)
                goto err;
 
@@ -1672,7 +1665,7 @@ err:
 
 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
 {
-       i915_vma_unpin_and_release(&engine->wa_ctx.vma);
+       i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
 }
 
 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
@@ -1767,31 +1760,40 @@ static void enable_execlists(struct intel_engine_cs *engine)
                I915_WRITE(RING_MODE_GEN7(engine),
                           _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
 
+       I915_WRITE(RING_MI_MODE(engine->mmio_base),
+                  _MASKED_BIT_DISABLE(STOP_RING));
+
        I915_WRITE(RING_HWS_PGA(engine->mmio_base),
                   engine->status_page.ggtt_offset);
        POSTING_READ(RING_HWS_PGA(engine->mmio_base));
+}
 
-       /* Following the reset, we need to reload the CSB read/write pointers */
-       engine->execlists.csb_head = -1;
+static bool unexpected_starting_state(struct intel_engine_cs *engine)
+{
+       struct drm_i915_private *dev_priv = engine->i915;
+       bool unexpected = false;
+
+       if (I915_READ(RING_MI_MODE(engine->mmio_base)) & STOP_RING) {
+               DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
+               unexpected = true;
+       }
+
+       return unexpected;
 }
 
 static int gen8_init_common_ring(struct intel_engine_cs *engine)
 {
-       struct intel_engine_execlists * const execlists = &engine->execlists;
-       int ret;
-
-       ret = intel_mocs_init_engine(engine);
-       if (ret)
-               return ret;
+       intel_mocs_init_engine(engine);
 
        intel_engine_reset_breadcrumbs(engine);
-       intel_engine_init_hangcheck(engine);
 
-       enable_execlists(engine);
+       if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
+               struct drm_printer p = drm_debug_printer(__func__);
 
-       /* After a GPU reset, we may have requests to replay */
-       if (execlists->first)
-               tasklet_schedule(&execlists->tasklet);
+               intel_engine_dump(engine, &p, NULL);
+       }
+
+       enable_execlists(engine);
 
        return 0;
 }
@@ -1833,8 +1835,70 @@ static int gen9_init_render_ring(struct intel_engine_cs *engine)
        return 0;
 }
 
-static void reset_common_ring(struct intel_engine_cs *engine,
-                             struct i915_request *request)
+static struct i915_request *
+execlists_reset_prepare(struct intel_engine_cs *engine)
+{
+       struct intel_engine_execlists * const execlists = &engine->execlists;
+       struct i915_request *request, *active;
+       unsigned long flags;
+
+       GEM_TRACE("%s: depth<-%d\n", engine->name,
+                 atomic_read(&execlists->tasklet.count));
+
+       /*
+        * Prevent request submission to the hardware until we have
+        * completed the reset in i915_gem_reset_finish(). If a request
+        * is completed by one engine, it may then queue a request
+        * to a second via its execlists->tasklet *just* as we are
+        * calling engine->init_hw() and also writing the ELSP.
+        * Turning off the execlists->tasklet until the reset is over
+        * prevents the race.
+        */
+       __tasklet_disable_sync_once(&execlists->tasklet);
+
+       spin_lock_irqsave(&engine->timeline.lock, flags);
+
+       /*
+        * We want to flush the pending context switches; having disabled
+        * the tasklet above, we can assume exclusive access to the execlists.
+        * This allows us to catch up with an inflight preemption event,
+        * and avoid blaming an innocent request if the stall was due to the
+        * preemption itself.
+        */
+       process_csb(engine);
+
+       /*
+        * The last active request can then be no later than the last request
+        * now in ELSP[0]. So search backwards from there, so that if the GPU
+        * has advanced beyond the last CSB update, it will be pardoned.
+        */
+       active = NULL;
+       request = port_request(execlists->port);
+       if (request) {
+               /*
+                * Prevent the breadcrumb from advancing before we decide
+                * which request is currently active.
+                */
+               intel_engine_stop_cs(engine);
+
+               list_for_each_entry_from_reverse(request,
+                                                &engine->timeline.requests,
+                                                link) {
+                       if (__i915_request_completed(request,
+                                                    request->global_seqno))
+                               break;
+
+                       active = request;
+               }
+       }
+
+       spin_unlock_irqrestore(&engine->timeline.lock, flags);
+
+       return active;
+}
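
The search above walks the timeline backwards from the request found in ELSP[0]: anything the GPU has already completed is pardoned, and the oldest still-incomplete request in that run is reported as the active (potentially guilty) one. Reduced to the list walk alone (req and its fields are invented stand-ins):

    #include <linux/list.h>
    #include <linux/types.h>

    struct req {
            struct list_head link;
            bool completed;
    };

    /*
     * Walk backwards from @from (inclusive) over the timeline list and return
     * the oldest request that has not yet completed, or NULL if everything up
     * to and including @from has already retired on the HW.
     */
    static struct req *last_active(struct list_head *timeline, struct req *from)
    {
            struct req *rq = from, *active = NULL;

            list_for_each_entry_from_reverse(rq, timeline, link) {
                    if (rq->completed)
                            break;

                    active = rq;
            }

            return active;
    }
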
+
+static void execlists_reset(struct intel_engine_cs *engine,
+                           struct i915_request *request)
 {
        struct intel_engine_execlists * const execlists = &engine->execlists;
        unsigned long flags;
@@ -1844,8 +1908,7 @@ static void reset_common_ring(struct intel_engine_cs *engine,
                  engine->name, request ? request->global_seqno : 0,
                  intel_engine_get_seqno(engine));
 
-       /* See execlists_cancel_requests() for the irq/spinlock split. */
-       local_irq_save(flags);
+       spin_lock_irqsave(&engine->timeline.lock, flags);
 
        /*
         * Catch up with any missed context-switch interrupts.
@@ -1857,14 +1920,14 @@ static void reset_common_ring(struct intel_engine_cs *engine,
         * requests were completed.
         */
        execlists_cancel_port_requests(execlists);
-       reset_irq(engine);
 
        /* Push back any incomplete requests for replay after the reset. */
-       spin_lock(&engine->timeline.lock);
        __unwind_incomplete_requests(engine);
-       spin_unlock(&engine->timeline.lock);
 
-       local_irq_restore(flags);
+       /* Following the reset, we need to reload the CSB read/write pointers */
+       reset_csb_pointers(&engine->execlists);
+
+       spin_unlock_irqrestore(&engine->timeline.lock, flags);
 
        /*
         * If the request was innocent, we leave the request in the ELSP
@@ -1888,35 +1951,48 @@ static void reset_common_ring(struct intel_engine_cs *engine,
         * future request will be after userspace has had the opportunity
         * to recreate its own state.
         */
-       regs = to_intel_context(request->ctx, engine)->lrc_reg_state;
-       if (engine->default_state) {
-               void *defaults;
-
-               defaults = i915_gem_object_pin_map(engine->default_state,
-                                                  I915_MAP_WB);
-               if (!IS_ERR(defaults)) {
-                       memcpy(regs, /* skip restoring the vanilla PPHWSP */
-                              defaults + LRC_STATE_PN * PAGE_SIZE,
-                              engine->context_size - PAGE_SIZE);
-                       i915_gem_object_unpin_map(engine->default_state);
-               }
+       regs = request->hw_context->lrc_reg_state;
+       if (engine->pinned_default_state) {
+               memcpy(regs, /* skip restoring the vanilla PPHWSP */
+                      engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
+                      engine->context_size - PAGE_SIZE);
        }
-       execlists_init_reg_state(regs, request->ctx, engine, request->ring);
+       execlists_init_reg_state(regs,
+                                request->gem_context, engine, request->ring);
 
        /* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
        regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
-       regs[CTX_RING_HEAD + 1] = request->postfix;
 
-       request->ring->head = request->postfix;
+       request->ring->head = intel_ring_wrap(request->ring, request->postfix);
+       regs[CTX_RING_HEAD + 1] = request->ring->head;
+
        intel_ring_update_space(request->ring);
 
        /* Reset WaIdleLiteRestore:bdw,skl as well */
        unwind_wa_tail(request);
 }
 
+static void execlists_reset_finish(struct intel_engine_cs *engine)
+{
+       struct intel_engine_execlists * const execlists = &engine->execlists;
+
+       /*
+        * After a GPU reset, we may have requests to replay. Do so now while
+        * we still hold the forcewake reference, to be sure that the GPU is
+        * not allowed to sleep before we restart and reload a context.
+        */
+       if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
+               execlists->tasklet.func(execlists->tasklet.data);
+
+       tasklet_enable(&execlists->tasklet);
+       GEM_TRACE("%s: depth->%d\n", engine->name,
+                 atomic_read(&execlists->tasklet.count));
+}
+
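For orientation, a minimal sketch of how the three hooks above are intended to be driven by an engine-reset path; the caller name and the hardware-reset step are illustrative only and not part of this patch:

static void example_engine_reset_flow(struct intel_engine_cs *engine)
{
	struct i915_request *active;

	/* Quiesce submission and pick the request to blame (may be NULL). */
	active = engine->reset.prepare(engine);	/* execlists_reset_prepare */

	/* ... the actual per-engine hardware reset would happen here ... */

	/* Rewrite the context image and unwind state for replay. */
	engine->reset.reset(engine, active);	/* execlists_reset */

	/* Re-enable the tasklet and kick any queued requests. */
	engine->reset.finish(engine);		/* execlists_reset_finish */
}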
 static int intel_logical_ring_emit_pdps(struct i915_request *rq)
 {
-       struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
+       struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
        struct intel_engine_cs *engine = rq->engine;
        const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
        u32 *cs;
@@ -1955,15 +2031,15 @@ static int gen8_emit_bb_start(struct i915_request *rq,
         * it is unsafe in case of lite-restore (because the ctx is
         * not idle). PML4 is allocated during ppgtt init so this is
         * not needed in 48-bit.*/
-       if (rq->ctx->ppgtt &&
-           (intel_engine_flag(rq->engine) & rq->ctx->ppgtt->pd_dirty_rings) &&
-           !i915_vm_is_48bit(&rq->ctx->ppgtt->base) &&
+       if (rq->gem_context->ppgtt &&
+           (intel_engine_flag(rq->engine) & rq->gem_context->ppgtt->pd_dirty_rings) &&
+           !i915_vm_is_48bit(&rq->gem_context->ppgtt->vm) &&
            !intel_vgpu_active(rq->i915)) {
                ret = intel_logical_ring_emit_pdps(rq);
                if (ret)
                        return ret;
 
-               rq->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
+               rq->gem_context->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
        }
 
        cs = intel_ring_begin(rq, 6);
@@ -1991,8 +2067,7 @@ static int gen8_emit_bb_start(struct i915_request *rq,
 
        /* FIXME(BDW): Address space and security selectors. */
        *cs++ = MI_BATCH_BUFFER_START_GEN8 |
-               (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) |
-               (flags & I915_DISPATCH_RS ? MI_BATCH_RESOURCE_STREAMER : 0);
+               (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
        *cs++ = lower_32_bits(offset);
        *cs++ = upper_32_bits(offset);
 
@@ -2217,13 +2292,15 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
        kfree(engine);
 }
 
-static void execlists_set_default_submission(struct intel_engine_cs *engine)
+void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
 {
        engine->submit_request = execlists_submit_request;
        engine->cancel_requests = execlists_cancel_requests;
        engine->schedule = execlists_schedule;
        engine->execlists.tasklet.func = execlists_submission_tasklet;
 
+       engine->reset.prepare = execlists_reset_prepare;
+
        engine->park = NULL;
        engine->unpark = NULL;
 
@@ -2243,18 +2320,19 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 {
        /* Default vfuncs which can be overridden by each engine. */
        engine->init_hw = gen8_init_common_ring;
-       engine->reset_hw = reset_common_ring;
 
-       engine->context_pin = execlists_context_pin;
-       engine->context_unpin = execlists_context_unpin;
+       engine->reset.prepare = execlists_reset_prepare;
+       engine->reset.reset = execlists_reset;
+       engine->reset.finish = execlists_reset_finish;
 
+       engine->context_pin = execlists_context_pin;
        engine->request_alloc = execlists_request_alloc;
 
        engine->emit_flush = gen8_emit_flush;
        engine->emit_breadcrumb = gen8_emit_breadcrumb;
        engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
 
-       engine->set_default_submission = execlists_set_default_submission;
+       engine->set_default_submission = intel_execlists_set_default_submission;
 
        if (INTEL_GEN(engine->i915) < 11) {
                engine->irq_enable = gen8_logical_ring_enable_irq;
@@ -2294,28 +2372,11 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
 static void
 logical_ring_setup(struct intel_engine_cs *engine)
 {
-       struct drm_i915_private *dev_priv = engine->i915;
-       enum forcewake_domains fw_domains;
-
        intel_engine_setup_common(engine);
 
        /* Intentionally left blank. */
        engine->buffer = NULL;
 
-       fw_domains = intel_uncore_forcewake_for_reg(dev_priv,
-                                                   RING_ELSP(engine),
-                                                   FW_REG_WRITE);
-
-       fw_domains |= intel_uncore_forcewake_for_reg(dev_priv,
-                                                    RING_CONTEXT_STATUS_PTR(engine),
-                                                    FW_REG_READ | FW_REG_WRITE);
-
-       fw_domains |= intel_uncore_forcewake_for_reg(dev_priv,
-                                                    RING_CONTEXT_STATUS_BUF_BASE(engine),
-                                                    FW_REG_READ);
-
-       engine->execlists.fw_domains = fw_domains;
-
        tasklet_init(&engine->execlists.tasklet,
                     execlists_submission_tasklet, (unsigned long)engine);
 
@@ -2323,38 +2384,62 @@ logical_ring_setup(struct intel_engine_cs *engine)
        logical_ring_default_irqs(engine);
 }
 
+static bool csb_force_mmio(struct drm_i915_private *i915)
+{
+       /* Older GVT emulation depends upon intercepting CSB mmio */
+       return intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915);
+}
+
 static int logical_ring_init(struct intel_engine_cs *engine)
 {
+       struct drm_i915_private *i915 = engine->i915;
+       struct intel_engine_execlists * const execlists = &engine->execlists;
        int ret;
 
        ret = intel_engine_init_common(engine);
        if (ret)
-               goto error;
+               return ret;
 
-       if (HAS_LOGICAL_RING_ELSQ(engine->i915)) {
-               engine->execlists.submit_reg = engine->i915->regs +
+       if (HAS_LOGICAL_RING_ELSQ(i915)) {
+               execlists->submit_reg = i915->regs +
                        i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(engine));
-               engine->execlists.ctrl_reg = engine->i915->regs +
+               execlists->ctrl_reg = i915->regs +
                        i915_mmio_reg_offset(RING_EXECLIST_CONTROL(engine));
        } else {
-               engine->execlists.submit_reg = engine->i915->regs +
+               execlists->submit_reg = i915->regs +
                        i915_mmio_reg_offset(RING_ELSP(engine));
        }
 
-       engine->execlists.preempt_complete_status = ~0u;
-       if (engine->i915->preempt_context) {
+       execlists->preempt_complete_status = ~0u;
+       if (i915->preempt_context) {
                struct intel_context *ce =
-                       to_intel_context(engine->i915->preempt_context, engine);
+                       to_intel_context(i915->preempt_context, engine);
 
-               engine->execlists.preempt_complete_status =
+               execlists->preempt_complete_status =
                        upper_32_bits(ce->lrc_desc);
        }
 
-       return 0;
+       execlists->csb_read =
+               i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
+       if (csb_force_mmio(i915)) {
+               execlists->csb_status = (u32 __force *)
+                       (i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
 
-error:
-       intel_logical_ring_cleanup(engine);
-       return ret;
+               execlists->csb_write = (u32 __force *)execlists->csb_read;
+               execlists->csb_write_reset =
+                       _MASKED_FIELD(GEN8_CSB_WRITE_PTR_MASK,
+                                     GEN8_CSB_ENTRIES - 1);
+       } else {
+               execlists->csb_status =
+                       &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
+
+               execlists->csb_write =
+                       &engine->status_page.page_addr[intel_hws_csb_write_index(i915)];
+               execlists->csb_write_reset = GEN8_CSB_ENTRIES - 1;
+       }
+       reset_csb_pointers(execlists);
+
+       return 0;
 }
 
 int logical_render_ring_init(struct intel_engine_cs *engine)
@@ -2377,10 +2462,14 @@ int logical_render_ring_init(struct intel_engine_cs *engine)
        engine->emit_breadcrumb = gen8_emit_breadcrumb_rcs;
        engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_rcs_sz;
 
-       ret = intel_engine_create_scratch(engine, PAGE_SIZE);
+       ret = logical_ring_init(engine);
        if (ret)
                return ret;
 
+       ret = intel_engine_create_scratch(engine, PAGE_SIZE);
+       if (ret)
+               goto err_cleanup_common;
+
        ret = intel_init_workaround_bb(engine);
        if (ret) {
                /*
@@ -2392,7 +2481,11 @@ int logical_render_ring_init(struct intel_engine_cs *engine)
                          ret);
        }
 
-       return logical_ring_init(engine);
+       return 0;
+
+err_cleanup_common:
+       intel_engine_cleanup_common(engine);
+       return ret;
 }
 
 int logical_xcs_ring_init(struct intel_engine_cs *engine)
@@ -2405,6 +2498,9 @@ int logical_xcs_ring_init(struct intel_engine_cs *engine)
 static u32
 make_rpcs(struct drm_i915_private *dev_priv)
 {
+       bool subslice_pg = INTEL_INFO(dev_priv)->sseu.has_subslice_pg;
+       u8 slices = hweight8(INTEL_INFO(dev_priv)->sseu.slice_mask);
+       u8 subslices = hweight8(INTEL_INFO(dev_priv)->sseu.subslice_mask[0]);
        u32 rpcs = 0;
 
        /*
@@ -2414,6 +2510,38 @@ make_rpcs(struct drm_i915_private *dev_priv)
        if (INTEL_GEN(dev_priv) < 9)
                return 0;
 
+       /*
+        * Since the SScount bitfield in GEN8_R_PWR_CLK_STATE is only three bits
+        * wide and Icelake has up to eight subslices, special programming is
+        * needed in order to correctly enable all subslices.
+        *
+        * According to the documentation, software must consider the
+        * configuration as 2x4x8 and hardware will translate this to 1x8x8.
+        *
+        * Furthermore, even though SScount is three bits, the maximum documented
+        * value for it is four. From this, the following rules/restrictions apply:
+        *
+        * 1.
+        * If enabled subslice count is greater than four, two whole slices must
+        * be enabled instead.
+        *
+        * 2.
+        * When more than one slice is enabled, hardware ignores the subslice
+        * count altogether.
+        *
+        * From these restrictions it follows that it is not possible to enable
+        * a subslice count between the SScount maximum of four and the maximum
+        * number available on a particular SKU: either all subslices are
+        * enabled, or a count between one and four on the first slice.
+        */
+       if (IS_GEN11(dev_priv) && slices == 1 && subslices >= 4) {
+               GEM_BUG_ON(subslices & 1);
+
+               subslice_pg = false;
+               slices *= 2;
+       }
+
        /*
         * Starting in Gen9, render power gating can leave
         * slice/subslice/EU in a partially enabled state. We
@@ -2421,24 +2549,50 @@ make_rpcs(struct drm_i915_private *dev_priv)
         * enablement.
        */
        if (INTEL_INFO(dev_priv)->sseu.has_slice_pg) {
-               rpcs |= GEN8_RPCS_S_CNT_ENABLE;
-               rpcs |= hweight8(INTEL_INFO(dev_priv)->sseu.slice_mask) <<
-                       GEN8_RPCS_S_CNT_SHIFT;
-               rpcs |= GEN8_RPCS_ENABLE;
+               u32 mask, val = slices;
+
+               if (INTEL_GEN(dev_priv) >= 11) {
+                       mask = GEN11_RPCS_S_CNT_MASK;
+                       val <<= GEN11_RPCS_S_CNT_SHIFT;
+               } else {
+                       mask = GEN8_RPCS_S_CNT_MASK;
+                       val <<= GEN8_RPCS_S_CNT_SHIFT;
+               }
+
+               GEM_BUG_ON(val & ~mask);
+               val &= mask;
+
+               rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_S_CNT_ENABLE | val;
        }
 
-       if (INTEL_INFO(dev_priv)->sseu.has_subslice_pg) {
-               rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
-               rpcs |= hweight8(INTEL_INFO(dev_priv)->sseu.subslice_mask[0]) <<
-                       GEN8_RPCS_SS_CNT_SHIFT;
-               rpcs |= GEN8_RPCS_ENABLE;
+       if (subslice_pg) {
+               u32 val = subslices;
+
+               val <<= GEN8_RPCS_SS_CNT_SHIFT;
+
+               GEM_BUG_ON(val & ~GEN8_RPCS_SS_CNT_MASK);
+               val &= GEN8_RPCS_SS_CNT_MASK;
+
+               rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_SS_CNT_ENABLE | val;
        }
 
        if (INTEL_INFO(dev_priv)->sseu.has_eu_pg) {
-               rpcs |= INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
-                       GEN8_RPCS_EU_MIN_SHIFT;
-               rpcs |= INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
-                       GEN8_RPCS_EU_MAX_SHIFT;
+               u32 val;
+
+               val = INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
+                     GEN8_RPCS_EU_MIN_SHIFT;
+               GEM_BUG_ON(val & ~GEN8_RPCS_EU_MIN_MASK);
+               val &= GEN8_RPCS_EU_MIN_MASK;
+
+               rpcs |= val;
+
+               val = INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
+                     GEN8_RPCS_EU_MAX_SHIFT;
+               GEM_BUG_ON(val & ~GEN8_RPCS_EU_MAX_MASK);
+               val &= GEN8_RPCS_EU_MAX_MASK;
+
+               rpcs |= val;
+
                rpcs |= GEN8_RPCS_ENABLE;
        }
 
@@ -2482,7 +2636,7 @@ static void execlists_init_reg_state(u32 *regs,
        struct drm_i915_private *dev_priv = engine->i915;
        struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
        u32 base = engine->mmio_base;
-       bool rcs = engine->id == RCS;
+       bool rcs = engine->class == RENDER_CLASS;
 
        /* A context is actually a big batch buffer with several
         * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
@@ -2495,11 +2649,13 @@ static void execlists_init_reg_state(u32 *regs,
                                 MI_LRI_FORCE_POSTED;
 
        CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(engine),
-               _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
-                                   CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT) |
-               _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
-                                  (HAS_RESOURCE_STREAMER(dev_priv) ?
-                                  CTX_CTRL_RS_CTX_ENABLE : 0)));
+               _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
+               _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH));
+       if (INTEL_GEN(dev_priv) < 11) {
+               regs[CTX_CONTEXT_CONTROL + 1] |=
+                       _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
+                                           CTX_CTRL_RS_CTX_ENABLE);
+       }
        CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
        CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
        CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
@@ -2550,7 +2706,7 @@ static void execlists_init_reg_state(u32 *regs,
        CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), 0);
        CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), 0);
 
-       if (ppgtt && i915_vm_is_48bit(&ppgtt->base)) {
+       if (i915_vm_is_48bit(&ppgtt->vm)) {
                /* 64b PPGTT (48bit canonical)
                 * PDP0_DESCRIPTOR contains the base address to PML4 and
                 * other PDP Descriptors are ignored.
@@ -2565,6 +2721,10 @@ static void execlists_init_reg_state(u32 *regs,
 
                i915_oa_init_reg_state(engine, ctx, regs);
        }
+
+       regs[CTX_END] = MI_BATCH_BUFFER_END;
+       if (INTEL_GEN(dev_priv) >= 10)
+               regs[CTX_END] |= BIT(0);
 }
 
 static int
@@ -2629,10 +2789,10 @@ err_unpin_ctx:
 }
 
 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
-                                           struct intel_engine_cs *engine)
+                                           struct intel_engine_cs *engine,
+                                           struct intel_context *ce)
 {
        struct drm_i915_gem_object *ctx_obj;
-       struct intel_context *ce = to_intel_context(ctx, engine);
        struct i915_vma *vma;
        uint32_t context_size;
        struct intel_ring *ring;
@@ -2654,7 +2814,7 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
        if (IS_ERR(ctx_obj))
                return PTR_ERR(ctx_obj);
 
-       vma = i915_vma_instance(ctx_obj, &ctx->i915->ggtt.base, NULL);
+       vma = i915_vma_instance(ctx_obj, &ctx->i915->ggtt.vm, NULL);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto error_deref_obj;
@@ -2691,13 +2851,14 @@ error_deref_obj:
        return ret;
 }
 
-void intel_lr_context_resume(struct drm_i915_private *dev_priv)
+void intel_lr_context_resume(struct drm_i915_private *i915)
 {
        struct intel_engine_cs *engine;
        struct i915_gem_context *ctx;
        enum intel_engine_id id;
 
-       /* Because we emit WA_TAIL_DWORDS there may be a disparity
+       /*
+        * Because we emit WA_TAIL_DWORDS, there may be a disparity
         * between our bookkeeping in ce->ring->head and ce->ring->tail and
         * that stored in context. As we only write new commands from
         * ce->ring->tail onwards, everything before that is junk. If the GPU
@@ -2707,28 +2868,22 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv)
         * So to avoid that we reset the context images upon resume. For
         * simplicity, we just zero everything out.
         */
-       list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
-               for_each_engine(engine, dev_priv, id) {
+       list_for_each_entry(ctx, &i915->contexts.list, link) {
+               for_each_engine(engine, i915, id) {
                        struct intel_context *ce =
                                to_intel_context(ctx, engine);
-                       u32 *reg;
 
                        if (!ce->state)
                                continue;
 
-                       reg = i915_gem_object_pin_map(ce->state->obj,
-                                                     I915_MAP_WB);
-                       if (WARN_ON(IS_ERR(reg)))
-                               continue;
-
-                       reg += LRC_STATE_PN * PAGE_SIZE / sizeof(*reg);
-                       reg[CTX_RING_HEAD+1] = 0;
-                       reg[CTX_RING_TAIL+1] = 0;
+                       intel_ring_reset(ce->ring, 0);
 
-                       ce->state->obj->mm.dirty = true;
-                       i915_gem_object_unpin_map(ce->state->obj);
+                       if (ce->pin_count) { /* otherwise done in context_pin */
+                               u32 *regs = ce->lrc_reg_state;
 
-                       intel_ring_reset(ce->ring, 0);
+                               regs[CTX_RING_HEAD + 1] = ce->ring->head;
+                               regs[CTX_RING_TAIL + 1] = ce->ring->tail;
+                       }
                }
        }
 }