drm/amdgpu: Send applicable RMA CPERs at end of RAS init
Author: Kent Russell <kent.russell@amd.com>
Tue, 3 Feb 2026 14:48:23 +0000 (09:48 -0500)
Committer: Alex Deucher <alexander.deucher@amd.com>
Thu, 5 Feb 2026 22:28:34 +0000 (17:28 -0500)
Firmware and monitoring tools may not be ready to receive a CPER when we
read the bad pages, so send the CPERs at the end of RAS initialization
to ensure that the FW is ready to receive and process the CPER. This
removes the previous CPER submission that was added during bad page
load, and sends both in-band and out-of-band at the same time.

Signed-off-by: Kent Russell <kent.russell@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

index b28fcf9..856b1bf 100644 (file)
@@ -4650,6 +4650,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
                        amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
        }
 
+       amdgpu_ras_check_bad_page_status(adev);
+
        return 0;
 }
 
index 469d04a..2c5d7f8 100644 (file)
@@ -1712,10 +1712,6 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
                        dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
                                        control->ras_num_bad_pages,
                                        ras->bad_page_cnt_threshold);
-               if (amdgpu_bad_page_threshold != 0 &&
-                       control->ras_num_bad_pages >= ras->bad_page_cnt_threshold)
-                       amdgpu_dpm_send_rma_reason(adev);
-
        } else if (hdr->header == RAS_TABLE_HDR_BAD &&
                   amdgpu_bad_page_threshold != 0) {
                if (hdr->version >= RAS_TABLE_VER_V2_1) {
@@ -1932,3 +1928,26 @@ int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
                                                                           result);
        return -EOPNOTSUPP;
 }
+
+/**
+ * amdgpu_ras_check_bad_page_status - send RMA CPERs once the bad-page
+ * count reaches the configured threshold
+ * @adev: amdgpu device pointer
+ *
+ * Invoked at the end of amdgpu_ras_late_init() so that firmware and
+ * monitoring tools are ready to receive the CPERs (submission during
+ * bad page load, as done previously, could arrive too early).  Sends
+ * the out-of-band RMA reason through DPM and, when CPER is enabled and
+ * unified RAS is not in use, also generates the in-band bad-page
+ * threshold CPER record.  Failures are logged but not propagated.
+ */
+void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev)
+{
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+       struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
+
+       /* Nothing to do without a RAS context or when the threshold is disabled */
+       if (!control || amdgpu_bad_page_threshold == 0)
+               return;
+
+       if (control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) {
+               /* Out-of-band: report the RMA reason via the DPM interface */
+               if (amdgpu_dpm_send_rma_reason(adev))
+                       dev_warn(adev->dev, "Unable to send out-of-band RMA CPER");
+               else
+                       dev_dbg(adev->dev, "Sent out-of-band RMA CPER");
+
+               /* In-band: skip when unified RAS handles CPER generation instead */
+               if (adev->cper.enabled && !amdgpu_uniras_enabled(adev)) {
+                       if (amdgpu_cper_generate_bp_threshold_record(adev))
+                               dev_warn(adev->dev, "Unable to send in-band RMA CPER");
+                       else
+                               dev_dbg(adev->dev, "Sent in-band RMA CPER");
+               }
+       }
+}
index 2e5d639..a621148 100644 (file)
@@ -193,6 +193,8 @@ int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
 
 int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control);
 
+void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev);
+
 extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
 extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;