drm/xe: Resume TDR after GT reset
authorMatthew Brost <matthew.brost@intel.com>
Wed, 24 Jul 2024 23:59:19 +0000 (16:59 -0700)
committerLucas De Marchi <lucas.demarchi@intel.com>
Thu, 3 Oct 2024 06:19:44 +0000 (01:19 -0500)
Not starting the TDR after GT reset on exec queue which have been
restarted can lead to jobs being able to be run forever. Fix this by
restarting the TDR.

Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Nirmoy Das <nirmoy.das@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240724235919.1917216-1-matthew.brost@intel.com
(cherry picked from commit 8ec5a4e5ce97d6ee9f5eb5b4ce4cfc831976fdec)
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
drivers/gpu/drm/xe/xe_gpu_scheduler.c
drivers/gpu/drm/xe/xe_gpu_scheduler.h
drivers/gpu/drm/xe/xe_guc_submit.c

index c518d1d..50361b4 100644 (file)
@@ -90,6 +90,11 @@ void xe_sched_submission_stop(struct xe_gpu_scheduler *sched)
        cancel_work_sync(&sched->work_process_msg);
 }
 
+void xe_sched_submission_resume_tdr(struct xe_gpu_scheduler *sched)
+{
+       drm_sched_resume_timeout(&sched->base, sched->base.timeout);
+}
+
 void xe_sched_add_msg(struct xe_gpu_scheduler *sched,
                      struct xe_sched_msg *msg)
 {
index cee9c68..5ad5629 100644 (file)
@@ -22,6 +22,8 @@ void xe_sched_fini(struct xe_gpu_scheduler *sched);
 void xe_sched_submission_start(struct xe_gpu_scheduler *sched);
 void xe_sched_submission_stop(struct xe_gpu_scheduler *sched);
 
+void xe_sched_submission_resume_tdr(struct xe_gpu_scheduler *sched);
+
 void xe_sched_add_msg(struct xe_gpu_scheduler *sched,
                      struct xe_sched_msg *msg);
 void xe_sched_add_msg_locked(struct xe_gpu_scheduler *sched,
index 98a6a38..80062e1 100644 (file)
@@ -1826,6 +1826,7 @@ static void guc_exec_queue_start(struct xe_exec_queue *q)
        }
 
        xe_sched_submission_start(sched);
+       xe_sched_submission_resume_tdr(sched);
 }
 
 int xe_guc_submit_start(struct xe_guc *guc)