drm/i915/execlists: Offline error capture
[linux-2.6-microblaze.git] drivers/gpu/drm/i915/gt/intel_lrc.c
index 43375c4..2d6b41e 100644
@@ -2393,7 +2393,6 @@ static void __execlists_hold(struct i915_request *rq)
        } while (rq);
 }
 
-__maybe_unused
 static void execlists_hold(struct intel_engine_cs *engine,
                           struct i915_request *rq)
 {
@@ -2473,7 +2472,6 @@ static void __execlists_unhold(struct i915_request *rq)
        } while (rq);
 }
 
-__maybe_unused
 static void execlists_unhold(struct intel_engine_cs *engine,
                             struct i915_request *rq)
 {
@@ -2493,6 +2491,123 @@ static void execlists_unhold(struct intel_engine_cs *engine,
        spin_unlock_irq(&engine->active.lock);
 }
 
+struct execlists_capture {
+       struct work_struct work;
+       struct i915_request *rq;
+       struct i915_gpu_coredump *error;
+};
+
+static void execlists_capture_work(struct work_struct *work)
+{
+       struct execlists_capture *cap = container_of(work, typeof(*cap), work);
+       const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
+       struct intel_engine_cs *engine = cap->rq->engine;
+       struct intel_gt_coredump *gt = cap->error->gt;
+       struct intel_engine_capture_vma *vma;
+
+       /* Compress all the objects attached to the request, slow! */
+       vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
+       if (vma) {
+               struct i915_vma_compress *compress =
+                       i915_vma_capture_prepare(gt);
+
+               intel_engine_coredump_add_vma(gt->engine, vma, compress);
+               i915_vma_capture_finish(gt, compress);
+       }
+
+       gt->simulated = gt->engine->simulated;
+       cap->error->simulated = gt->simulated;
+
+       /* Publish the error state, and announce it to the world */
+       i915_error_state_store(cap->error);
+       i915_gpu_coredump_put(cap->error);
+
+       /* Return this request and all that depend upon it for signaling */
+       execlists_unhold(engine, cap->rq);
+
+       kfree(cap);
+}
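
The shape of execlists_capture_work() is the standard kernel pattern for
deferring slow work out of atomic context: the capture struct embeds a
work_struct, the softirq side snapshots only what it must, and the worker
recovers the container with container_of() before doing the sleeping,
GFP_KERNEL-heavy processing and freeing the capture. A minimal,
self-contained sketch of that idiom, with all my_* names hypothetical:

#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_capture {
        struct work_struct work;
        void *payload;                  /* state snapshotted in atomic context */
};

static void my_capture_work(struct work_struct *work)
{
        /* Recover the container from the embedded work_struct */
        struct my_capture *cap = container_of(work, typeof(*cap), work);

        /* ... slow processing; GFP_KERNEL and sleeping are allowed here ... */

        kfree(cap);                     /* the worker owns and frees the capture */
}

static void my_capture_defer(void *payload)
{
        /* Called from atomic context, so only atomic allocations are allowed */
        struct my_capture *cap = kmalloc(sizeof(*cap), GFP_ATOMIC);

        if (!cap)
                return;

        cap->payload = payload;
        INIT_WORK(&cap->work, my_capture_work);
        schedule_work(&cap->work);      /* runs later in process context */
}
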
+
+static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
+{
+       const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+       struct execlists_capture *cap;
+
+       cap = kmalloc(sizeof(*cap), gfp);
+       if (!cap)
+               return NULL;
+
+       cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
+       if (!cap->error)
+               goto err_cap;
+
+       cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
+       if (!cap->error->gt)
+               goto err_gpu;
+
+       cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
+       if (!cap->error->gt->engine)
+               goto err_gt;
+
+       return cap;
+
+err_gt:
+       kfree(cap->error->gt);
+err_gpu:
+       kfree(cap->error);
+err_cap:
+       kfree(cap);
+       return NULL;
+}
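
capture_regs() must not sleep: it is called from the softirq while the
forced preemption event is being held off, hence GFP_ATOMIC, with
__GFP_NOWARN suppressing allocation-failure warnings for what is only an
opportunistic capture. Its error path is the usual kernel unwind ladder,
releasing in reverse order of acquisition so each label undoes exactly the
steps that succeeded before the failure. A minimal sketch of the idiom,
with hypothetical widget names:

#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/string.h>

struct widget {
        void *buf;
        char *name;
};

static struct widget *widget_create(gfp_t gfp)
{
        struct widget *w = kmalloc(sizeof(*w), gfp);

        if (!w)
                return NULL;

        w->buf = kmalloc(SZ_4K, gfp);
        if (!w->buf)
                goto err_widget;

        w->name = kstrdup("widget", gfp);
        if (!w->name)
                goto err_buf;

        return w;               /* fully constructed */

err_buf:
        kfree(w->buf);          /* unwind in reverse order of allocation */
err_widget:
        kfree(w);
        return NULL;
}
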
+
+static void execlists_capture(struct intel_engine_cs *engine)
+{
+       struct execlists_capture *cap;
+
+       if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
+               return;
+
+       /*
+        * We need to _quickly_ capture the engine state before we reset.
+        * We are inside an atomic section (softirq) here and we are delaying
+        * the forced preemption event.
+        */
+       cap = capture_regs(engine);
+       if (!cap)
+               return;
+
+       cap->rq = execlists_active(&engine->execlists);
+       GEM_BUG_ON(!cap->rq);
+
+       cap->rq = active_request(cap->rq->context->timeline, cap->rq);
+       GEM_BUG_ON(!cap->rq);
+
+       /*
+        * Remove the request from the execlists queue and take ownership of
+        * it. We pass it to our worker, who will _slowly_ compress all the
+        * pages the _user_ requested for debugging their batch, after which
+        * we return it to the queue for signaling.
+        *
+        * By removing them from the execlists queue, we also remove the
+        * requests from being processed by __unwind_incomplete_requests()
+        * during the intel_engine_reset(), and so they will *not* be replayed
+        * afterwards.
+        *
+        * Note that because we have not yet reset the engine at this point,
+        * it is possible that the request we have identified as guilty did
+        * in fact complete, and we will then hit an arbitration point
+        * allowing the outstanding preemption to succeed. The likelihood
+        * of that is very low (as capturing the engine registers should be
+        * fast enough to run inside an irq-off atomic section!), so we will
+        * simply hold that request accountable for being non-preemptible
+        * long enough to force the reset.
+        */
+       execlists_hold(engine, cap->rq);
+
+       INIT_WORK(&cap->work, execlists_capture_work);
+       schedule_work(&cap->work);
+}
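
Two details worth noting here. First, IS_ENABLED() is a compile-time
constant, so with CONFIG_DRM_I915_CAPTURE_ERROR=n the compiler discards
the whole capture path rather than testing it at runtime. Second, the
execlists_hold() taken here is paired with the execlists_unhold() in
execlists_capture_work(), so the request only resumes signaling once the
error state has been published. A sketch of the compile-out pattern, with
a hypothetical config option:

#include <linux/kconfig.h>

static void maybe_capture_debug_state(void)
{
        /*
         * IS_ENABLED() folds to 0 or 1 at compile time, so when the
         * (hypothetical) option is off, everything past this return is
         * dead code and is eliminated by the compiler.
         */
        if (!IS_ENABLED(CONFIG_MY_DEBUG_CAPTURE))
                return;

        /* ... debug-only capture work ... */
}
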
+
 static noinline void preempt_reset(struct intel_engine_cs *engine)
 {
        const unsigned int bit = I915_RESET_ENGINE + engine->id;
@@ -2510,6 +2625,9 @@ static noinline void preempt_reset(struct intel_engine_cs *engine)
        ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
                     READ_ONCE(engine->props.preempt_timeout_ms),
                     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
+
+       ring_set_paused(engine, 1); /* Freeze the current request in place */
+       execlists_capture(engine);
        intel_engine_reset(engine, "preemption time out");
 
        tasklet_enable(&engine->execlists.tasklet);
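
The ordering above is what makes the offline capture safe: ring_set_paused()
freezes the hanging request in place, execlists_capture() snapshots the
engine registers while still inside the softirq, and only then does
intel_engine_reset() wipe the state. The tasklet_enable() at the end pairs
with a disable taken earlier in preempt_reset() (outside this hunk), so the
submission tasklet cannot re-run mid-reset. A generic sketch of that
disable/enable bracket, with hypothetical names:

#include <linux/interrupt.h>

static void do_exclusive_maintenance(struct tasklet_struct *t)
{
        tasklet_disable(t);     /* also waits for a running instance to finish */

        /* ... the tasklet is guaranteed not to execute in this window ... */

        tasklet_enable(t);      /* re-arm; pending schedules may now run */
}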