From 6952255ed97914abbf86fc3f92c9918945b885b8 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 30 Jan 2026 11:19:49 -0500 Subject: [PATCH] drm/amdgpu: re-add the bad job to the pending list for ring resets Returning DRM_GPU_SCHED_STAT_NO_HANG causes the scheduler to add the bad job back the pending list. We've already set the errors on the fence and killed the bad job at this point so it's the correct behavior. Acked-by: Pierre-Eric Pelloux-Prayer Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 ++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 4 ---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index aaf5477fcd7a..2c82d9e8c0be 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -92,6 +92,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) struct drm_wedge_task_info *info = NULL; struct amdgpu_task_info *ti = NULL; struct amdgpu_device *adev = ring->adev; + enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_RESET; int idx, r; if (!drm_dev_enter(adev_to_drm(adev), &idx)) { @@ -135,13 +136,19 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) ring->funcs->reset) { dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name); + /* Stop the scheduler to prevent anybody else from touching the ring buffer. */ + drm_sched_wqueue_stop(&ring->sched); r = amdgpu_ring_reset(ring, job->vmid, job->hw_fence); if (!r) { + /* Start the scheduler again */ + drm_sched_wqueue_start(&ring->sched); atomic_inc(&ring->adev->gpu_reset_counter); dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name); drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, info); + /* This is needed to add the job back to the pending list */ + status = DRM_GPU_SCHED_STAT_NO_HANG; goto exit; } dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name); @@ -177,7 +184,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) exit: amdgpu_vm_put_task_info(ti); drm_dev_exit(idx); - return DRM_GPU_SCHED_STAT_RESET; + return status; } int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index b82357c65723..129ad5138653 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -868,8 +868,6 @@ bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring) void amdgpu_ring_reset_helper_begin(struct amdgpu_ring *ring, struct amdgpu_fence *guilty_fence) { - /* Stop the scheduler to prevent anybody else from touching the ring buffer. */ - drm_sched_wqueue_stop(&ring->sched); /* back up the non-guilty commands */ amdgpu_ring_backup_unprocessed_commands(ring, guilty_fence); } @@ -895,8 +893,6 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring, amdgpu_ring_write(ring, ring->ring_backup[i]); amdgpu_ring_commit(ring); } - /* Start the scheduler again */ - drm_sched_wqueue_start(&ring->sched); return 0; } -- 2.30.2