drm/amdgpu: Set fatal error detected flag earlier

author Lijo Lazar <lijo.lazar@amd.com>

Mon, 25 Mar 2024 07:03:02 +0000 (12:33 +0530)

committer Alex Deucher <alexander.deucher@amd.com>

Wed, 10 Apr 2024 02:13:36 +0000 (22:13 -0400)
author Lijo Lazar <lijo.lazar@amd.com>
Mon, 25 Mar 2024 07:03:02 +0000 (12:33 +0530)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 10 Apr 2024 02:13:36 +0000 (22:13 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index b8c7d0b..352ce16 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2399,6 +2399,19 @@ out:
         return ret;
  }
  
+static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
+                                  struct amdgpu_hive_info *hive, bool status)
+{
+       struct amdgpu_device *tmp_adev;
+
+       if (hive) { /* hive given: apply status to every device on its list */
+               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+                       amdgpu_ras_set_fed(tmp_adev, status);
+       } else { /* no hive: apply status to adev only */
+               amdgpu_ras_set_fed(adev, status);
+       }
+}
+
  static void amdgpu_ras_do_recovery(struct work_struct *work)
  {
         struct amdgpu_ras *ras =
@@ -2408,8 +2421,21 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
         struct list_head device_list, *device_list_handle =  NULL;
         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
  
-       if (hive)
+       if (hive) {
                 atomic_set(&hive->ras_recovery, 1);
+
+               /* If any device which is part of the hive received RAS fatal
+                * error interrupt, set fatal error status on all. This
+                * condition will need a recovery, and flag will be cleared
+                * as part of recovery.
+                */
+               list_for_each_entry(remote_adev, &hive->device_list,
+                                   gmc.xgmi.head)
+                       if (amdgpu_ras_get_fed_status(remote_adev)) {
+                               amdgpu_ras_set_fed_all(adev, hive, true);
+                               break;
+                       }
+       }
         if (!ras->disable_ras_err_cnt_harvest) {
  
                 /* Build list of devices to query RAS related errors */
@@ -2454,18 +2480,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
                                 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
                                 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
  
-                               /* For any RAS error that needs a full reset to
-                                * recover, set the fatal error status
-                                */
-                               if (hive) {
-                                       list_for_each_entry(remote_adev,
-                                                           &hive->device_list,
-                                                           gmc.xgmi.head)
-                                               amdgpu_ras_set_fed(remote_adev,
-                                                                  true);
-                               } else {
-                                       amdgpu_ras_set_fed(adev, true);
-                               }
                                 psp_fatal_error_recovery_quirk(&adev->psp);
                         }
                 }
@@ -3550,6 +3564,7 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
                 RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
                               "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
  
+               amdgpu_ras_set_fed(adev, true);
                 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
                 amdgpu_ras_reset_gpu(adev);
         }
author	Lijo Lazar <lijo.lazar@amd.com>
	Mon, 25 Mar 2024 07:03:02 +0000 (12:33 +0530)
committer	Alex Deucher <alexander.deucher@amd.com>
	Wed, 10 Apr 2024 02:13:36 +0000 (22:13 -0400)