drm/amdgpu: report bad status in GPU recovery
author Tao Zhou <tao.zhou1@amd.com>
Wed, 31 Jul 2024 07:54:27 +0000 (15:54 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 6 Aug 2024 15:11:02 +0000 (11:11 -0400)
For a RAS-triggered reset, report the GPU bad status instead of printing "GPU reset failed".

v2: add check for reset_context->src.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index f595ba6..29a4ade 100644 (file)
@@ -5876,8 +5876,14 @@ skip_hw_reset:
                tmp_adev->asic_reset_res = 0;
 
                if (r) {
-                       /* bad news, how to tell it to userspace ? */
-                       dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
+                       /* bad news, how to tell it to userspace ?
+                        * for ras error, we should report GPU bad status instead of
+                        * reset failure
+                        */
+                       if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
+                           !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
+                               dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
+                                       atomic_read(&tmp_adev->gpu_reset_counter));
                        amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
                } else {
                        dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));