const struct intel_engine_cs *engine,
const struct intel_ring *ring,
bool close);
+static void
+__execlists_update_reg_state(const struct intel_context *ce,
+ const struct intel_engine_cs *engine);
static void mark_eio(struct i915_request *rq)
{
i915_request_mark_complete(rq);
}
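+/*
+ * Walk back along @rq's timeline to find the oldest request of the same
+ * context that has not yet completed, i.e. the point from which the
+ * context must resume after a reset. Returns @rq itself if it was
+ * unwound before completing, or NULL if everything up to and including
+ * @rq has already completed.
+ */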
+static struct i915_request *active_request(struct i915_request *rq)
+{
+ const struct intel_context * const ce = rq->hw_context;
+ struct i915_request *active = NULL;
+ struct list_head *list;
+
+ if (!i915_request_is_active(rq)) /* unwound, but incomplete! */
+ return rq;
+
+ rcu_read_lock();
+ list = &rcu_dereference(rq->timeline)->requests;
+ list_for_each_entry_from_reverse(rq, list, link) {
+ if (i915_request_completed(rq))
+ break;
+
+ if (rq->hw_context != ce)
+ break;
+
+ active = rq;
+ }
+ rcu_read_unlock();
+
+ return active;
+}
+
static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
{
return (i915_ggtt_offset(engine->status_page.vma) +
tasklet_schedule(&ve->base.execlists.tasklet);
}
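+/*
+ * Overwrite the context image with the engine's pristine default state
+ * (skipping the PPHWSP page) and then rewrite the per-context registers,
+ * leaving the context as if it had just been created.
+ */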
+static void restore_default_state(struct intel_context *ce,
+ struct intel_engine_cs *engine)
+{
+ u32 *regs = ce->lrc_reg_state;
+
+ if (engine->pinned_default_state)
+ memcpy(regs, /* skip restoring the vanilla PPHWSP */
+ engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
+ engine->context_size - PAGE_SIZE);
+
+ execlists_init_reg_state(regs, ce, engine, ce->ring, false);
+}
+
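+/* Rewind and scrub a banned context as it is scheduled out. */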
+static void reset_active(struct i915_request *rq,
+ struct intel_engine_cs *engine)
+{
+ struct intel_context * const ce = rq->hw_context;
+
+ /*
+ * The executing context has been cancelled. We want to prevent
+ * further execution along this context and propagate the error on
+ * to anything depending on its results.
+ *
+ * In __i915_request_submit(), we apply the -EIO and remove the
+ * requests' payloads for any banned requests. But first, we must
+ * rewind the context back to the start of the incomplete request so
+ * that we do not jump back into the middle of the batch.
+ *
+ * We preserve the breadcrumbs and semaphores of the incomplete
+ * requests so that inter-timeline dependencies (i.e. other timelines)
+ * remain correctly ordered. And we defer to __i915_request_submit()
+ * so that all asynchronous waits are correctly handled.
+ */
+ GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
+ __func__, engine->name, rq->fence.context, rq->fence.seqno);
+
+ /* On resubmission of the active request, its payload will be scrubbed */
+ rq = active_request(rq);
+ if (rq)
+ ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
+ else
+ ce->ring->head = ce->ring->tail;
+ intel_ring_update_space(ce->ring);
+
+ /* Scrub the context image to prevent replaying the previous batch */
+ restore_default_state(ce, engine);
+ __execlists_update_reg_state(ce, engine);
+
+ /* We've switched away, so this should be a no-op, but intent matters */
+ ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
+}
+
static inline void
__execlists_schedule_out(struct i915_request *rq,
struct intel_engine_cs * const engine)
execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
intel_gt_pm_put(engine->gt);
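+ /* If the context was banned while inflight, scrub it as we switch away */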
+ if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
+ reset_active(rq, engine);
+
/*
* If this is part of a virtual engine, its next request may
* have been blocked waiting for access to the active context.
if (!rq)
return 0;
+ /* Force a fast reset for terminated contexts (ignoring sysfs!) */
+ if (unlikely(i915_gem_context_is_banned(rq->gem_context)))
+ return 1;
+
return READ_ONCE(engine->props.preempt_timeout_ms);
}
&execlists->csb_status[reset_value]);
}
-static struct i915_request *active_request(struct i915_request *rq)
-{
- const struct intel_context * const ce = rq->hw_context;
- struct i915_request *active = NULL;
- struct list_head *list;
-
- if (!i915_request_is_active(rq)) /* unwound, but incomplete! */
- return rq;
-
- list = &i915_request_active_timeline(rq)->requests;
- list_for_each_entry_from_reverse(rq, list, link) {
- if (i915_request_completed(rq))
- break;
-
- if (rq->hw_context != ce)
- break;
-
- active = rq;
- }
-
- return active;
-}
-
static void __execlists_reset_reg_state(const struct intel_context *ce,
const struct intel_engine_cs *engine)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
struct intel_context *ce;
struct i915_request *rq;
- u32 *regs;
mb(); /* paranoia: read the CSB pointers from after the reset */
clflush(execlists->csb_write);
* to recreate its own state.
*/
GEM_BUG_ON(!intel_context_is_pinned(ce));
- regs = ce->lrc_reg_state;
- if (engine->pinned_default_state) {
- memcpy(regs, /* skip restoring the vanilla PPHWSP */
- engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
- engine->context_size - PAGE_SIZE);
- }
- execlists_init_reg_state(regs, ce, engine, ce->ring, false);
+ restore_default_state(ce, engine);
out_replay:
GEM_TRACE("%s replay {head:%04x, tail:%04x\n",
* future request will be after userspace has had the opportunity
* to recreate its own state.
*/
- if (scrub) {
- u32 *regs = ce->lrc_reg_state;
-
- if (engine->pinned_default_state) {
- memcpy(regs, /* skip restoring the vanilla PPHWSP */
- engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
- engine->context_size - PAGE_SIZE);
- }
- execlists_init_reg_state(regs, ce, engine, ce->ring, false);
- }
+ if (scrub)
+ restore_default_state(ce, engine);
/* Rerun the request; its payload has been neutered (if guilty). */
ce->ring->head = head;
#include <linux/prime_numbers.h>
#include "gem/i915_gem_pm.h"
+#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_reset.h"
#include "i915_selftest.h"
goto err_client_b;
}
+struct live_preempt_cancel {
+ struct intel_engine_cs *engine;
+ struct preempt_client a, b;
+};
+
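+/*
+ * Each __cancel_* subtest below follows the same pattern: submit one or
+ * more requests (spinners keep the engine busy), ban the target context,
+ * then send a heartbeat pulse with intel_engine_pulse() to force the
+ * banned context off the engine. Requests from the banned context must
+ * be reported as cancelled (-EIO) while innocent requests complete
+ * normally.
+ */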
+static int __cancel_active0(struct live_preempt_cancel *arg)
+{
+ struct i915_request *rq;
+ struct igt_live_test t;
+ int err;
+
+ /* Preempt cancel of ELSP0 */
+ GEM_TRACE("%s(%s)\n", __func__, arg->engine->name);
+ if (igt_live_test_begin(&t, arg->engine->i915,
+ __func__, arg->engine->name))
+ return -EIO;
+
+ clear_bit(CONTEXT_BANNED, &arg->a.ctx->flags);
+ rq = spinner_create_request(&arg->a.spin,
+ arg->a.ctx, arg->engine,
+ MI_ARB_CHECK);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ i915_request_get(rq);
+ i915_request_add(rq);
+ if (!igt_wait_for_spinner(&arg->a.spin, rq)) {
+ err = -EIO;
+ goto out;
+ }
+
+ i915_gem_context_set_banned(arg->a.ctx);
+ err = intel_engine_pulse(arg->engine);
+ if (err)
+ goto out;
+
+ if (i915_request_wait(rq, 0, HZ / 5) < 0) {
+ err = -EIO;
+ goto out;
+ }
+
+ if (rq->fence.error != -EIO) {
+ pr_err("Cancelled inflight0 request did not report -EIO\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+out:
+ i915_request_put(rq);
+ if (igt_live_test_end(&t))
+ err = -EIO;
+ return err;
+}
+
+static int __cancel_active1(struct live_preempt_cancel *arg)
+{
+ struct i915_request *rq[2] = {};
+ struct igt_live_test t;
+ int err;
+
+ /* Preempt cancel of ELSP1 */
+ GEM_TRACE("%s(%s)\n", __func__, arg->engine->name);
+ if (igt_live_test_begin(&t, arg->engine->i915,
+ __func__, arg->engine->name))
+ return -EIO;
+
+ clear_bit(CONTEXT_BANNED, &arg->a.ctx->flags);
+ rq[0] = spinner_create_request(&arg->a.spin,
+ arg->a.ctx, arg->engine,
+ MI_NOOP); /* no preemption */
+ if (IS_ERR(rq[0]))
+ return PTR_ERR(rq[0]);
+
+ i915_request_get(rq[0]);
+ i915_request_add(rq[0]);
+ if (!igt_wait_for_spinner(&arg->a.spin, rq[0])) {
+ err = -EIO;
+ goto out;
+ }
+
+ clear_bit(CONTEXT_BANNED, &arg->b.ctx->flags);
+ rq[1] = spinner_create_request(&arg->b.spin,
+ arg->b.ctx, arg->engine,
+ MI_ARB_CHECK);
+ if (IS_ERR(rq[1])) {
+ err = PTR_ERR(rq[1]);
+ goto out;
+ }
+
+ i915_request_get(rq[1]);
+ err = i915_request_await_dma_fence(rq[1], &rq[0]->fence);
+ i915_request_add(rq[1]);
+ if (err)
+ goto out;
+
+ i915_gem_context_set_banned(arg->b.ctx);
+ err = intel_engine_pulse(arg->engine);
+ if (err)
+ goto out;
+
+ igt_spinner_end(&arg->a.spin);
+ if (i915_request_wait(rq[1], 0, HZ / 5) < 0) {
+ err = -EIO;
+ goto out;
+ }
+
+ if (rq[0]->fence.error != 0) {
+ pr_err("Normal inflight0 request did not complete successfully\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (rq[1]->fence.error != -EIO) {
+ pr_err("Cancelled inflight1 request did not report -EIO\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+out:
+ i915_request_put(rq[1]);
+ i915_request_put(rq[0]);
+ if (igt_live_test_end(&t))
+ err = -EIO;
+ return err;
+}
+
+static int __cancel_queued(struct live_preempt_cancel *arg)
+{
+ struct i915_request *rq[3] = {};
+ struct igt_live_test t;
+ int err;
+
+ /* Full ELSP and one in the wings */
+ GEM_TRACE("%s(%s)\n", __func__, arg->engine->name);
+ if (igt_live_test_begin(&t, arg->engine->i915,
+ __func__, arg->engine->name))
+ return -EIO;
+
+ clear_bit(CONTEXT_BANNED, &arg->a.ctx->flags);
+ rq[0] = spinner_create_request(&arg->a.spin,
+ arg->a.ctx, arg->engine,
+ MI_ARB_CHECK);
+ if (IS_ERR(rq[0]))
+ return PTR_ERR(rq[0]);
+
+ i915_request_get(rq[0]);
+ i915_request_add(rq[0]);
+ if (!igt_wait_for_spinner(&arg->a.spin, rq[0])) {
+ err = -EIO;
+ goto out;
+ }
+
+ clear_bit(CONTEXT_BANNED, &arg->b.ctx->flags);
+ rq[1] = igt_request_alloc(arg->b.ctx, arg->engine);
+ if (IS_ERR(rq[1])) {
+ err = PTR_ERR(rq[1]);
+ goto out;
+ }
+
+ i915_request_get(rq[1]);
+ err = i915_request_await_dma_fence(rq[1], &rq[0]->fence);
+ i915_request_add(rq[1]);
+ if (err)
+ goto out;
+
+ rq[2] = spinner_create_request(&arg->b.spin,
+ arg->a.ctx, arg->engine,
+ MI_ARB_CHECK);
+ if (IS_ERR(rq[2])) {
+ err = PTR_ERR(rq[2]);
+ goto out;
+ }
+
+ i915_request_get(rq[2]);
+ err = i915_request_await_dma_fence(rq[2], &rq[1]->fence);
+ i915_request_add(rq[2]);
+ if (err)
+ goto out;
+
+ i915_gem_context_set_banned(arg->a.ctx);
+ err = intel_engine_pulse(arg->engine);
+ if (err)
+ goto out;
+
+ if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
+ err = -EIO;
+ goto out;
+ }
+
+ if (rq[0]->fence.error != -EIO) {
+ pr_err("Cancelled inflight0 request did not report -EIO\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (rq[1]->fence.error != 0) {
+ pr_err("Normal inflight1 request did not complete successfully\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (rq[2]->fence.error != -EIO) {
+ pr_err("Cancelled queued request did not report -EIO\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+out:
+ i915_request_put(rq[2]);
+ i915_request_put(rq[1]);
+ i915_request_put(rq[0]);
+ if (igt_live_test_end(&t))
+ err = -EIO;
+ return err;
+}
+
+static int __cancel_hostile(struct live_preempt_cancel *arg)
+{
+ struct i915_request *rq;
+ int err;
+
+ /* Preempt cancel non-preemptible spinner in ELSP0 */
+ if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
+ return 0;
+
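+ /*
+ * The MI_NOOP spinner carries no arbitration point, so the pulse cannot
+ * preempt it; instead we rely on the banned context forcing a fast
+ * preempt-timeout reset, hence the guard above when
+ * CONFIG_DRM_I915_PREEMPT_TIMEOUT is compiled to 0 (disabled).
+ */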
+ GEM_TRACE("%s(%s)\n", __func__, arg->engine->name);
+ clear_bit(CONTEXT_BANNED, &arg->a.ctx->flags);
+ rq = spinner_create_request(&arg->a.spin,
+ arg->a.ctx, arg->engine,
+ MI_NOOP); /* preemption disabled */
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ i915_request_get(rq);
+ i915_request_add(rq);
+ if (!igt_wait_for_spinner(&arg->a.spin, rq)) {
+ err = -EIO;
+ goto out;
+ }
+
+ i915_gem_context_set_banned(arg->a.ctx);
+ err = intel_engine_pulse(arg->engine); /* force reset */
+ if (err)
+ goto out;
+
+ if (i915_request_wait(rq, 0, HZ / 5) < 0) {
+ err = -EIO;
+ goto out;
+ }
+
+ if (rq->fence.error != -EIO) {
+ pr_err("Cancelled inflight0 request did not report -EIO\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+out:
+ i915_request_put(rq);
+ if (igt_flush_test(arg->engine->i915))
+ err = -EIO;
+ return err;
+}
+
+static int live_preempt_cancel(void *arg)
+{
+ struct intel_gt *gt = arg;
+ struct live_preempt_cancel data;
+ enum intel_engine_id id;
+ int err = -ENOMEM;
+
+ /*
+ * To cancel an inflight context, we need to first remove it from the
+ * GPU. That sounds like preemption! Plus a little bit of bookkeeping.
+ */
+
+ if (!HAS_LOGICAL_RING_PREEMPTION(gt->i915))
+ return 0;
+
+ if (preempt_client_init(gt, &data.a))
+ return -ENOMEM;
+ if (preempt_client_init(gt, &data.b))
+ goto err_client_a;
+
+ for_each_engine(data.engine, gt, id) {
+ if (!intel_engine_has_preemption(data.engine))
+ continue;
+
+ err = __cancel_active0(&data);
+ if (err)
+ goto err_wedged;
+
+ err = __cancel_active1(&data);
+ if (err)
+ goto err_wedged;
+
+ err = __cancel_queued(&data);
+ if (err)
+ goto err_wedged;
+
+ err = __cancel_hostile(&data);
+ if (err)
+ goto err_wedged;
+ }
+
+ err = 0;
+err_client_b:
+ preempt_client_fini(&data.b);
+err_client_a:
+ preempt_client_fini(&data.a);
+ return err;
+
+err_wedged:
+ GEM_TRACE_DUMP();
+ igt_spinner_end(&data.b.spin);
+ igt_spinner_end(&data.a.spin);
+ intel_gt_set_wedged(gt);
+ goto err_client_b;
+}
+
static int live_suppress_self_preempt(void *arg)
{
struct intel_gt *gt = arg;
SUBTEST(live_preempt),
SUBTEST(live_late_preempt),
SUBTEST(live_nopreempt),
+ SUBTEST(live_preempt_cancel),
SUBTEST(live_suppress_self_preempt),
SUBTEST(live_suppress_wait_preempt),
SUBTEST(live_chain_preempt),