drm/amdgpu: gpu recovers from fatal error in poison mode
author YiPeng Chai <YiPeng.Chai@amd.com>
Sun, 25 Jun 2023 02:18:32 +0000 (10:18 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 30 Jun 2023 17:12:15 +0000 (13:12 -0400)
When a fatal error occurs in RAS poison mode, a mode1 reset
is used to recover the GPU.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

index 4769a18..8aaa427 100644 (file)
@@ -2065,6 +2065,14 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
                                ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
                                reset_context.method = AMD_RESET_METHOD_MODE2;
                        }
+
+                       /* Fatal error occurs in poison mode, mode1 reset is used to
+                        * recover gpu.
+                        */
+                       if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
+                               ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+                               set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+                       }
                }
 
                amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
@@ -2955,9 +2963,12 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
                return;
 
        if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
+               struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
                dev_info(adev->dev, "uncorrectable hardware error"
                        "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
 
+               ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
                amdgpu_ras_reset_gpu(adev);
        }
 }
index 46bf188..ffb49b2 100644 (file)
@@ -340,6 +340,7 @@ enum amdgpu_ras_ret {
 #define AMDGPU_RAS_ERR_ADDRESS_VALID   (1 << 2)
 
 #define AMDGPU_RAS_GPU_RESET_MODE2_RESET  (0x1 << 0)
+#define AMDGPU_RAS_GPU_RESET_MODE1_RESET  (0x1 << 1)
 
 struct amdgpu_ras_err_status_reg_entry {
        uint32_t hwip;