drm/amdgpu: only harvest gcea/mmea error status in arcturus
author Hawking Zhang <Hawking.Zhang@amd.com>
Fri, 16 Apr 2021 09:34:13 +0000 (17:34 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 21 Apr 2021 01:35:45 +0000 (21:35 -0400)
An SDP RdRspStatus/WrRspStatus error or a first parity error on
RdRsp data can cause a system fatal error in arcturus.
The GPU will freeze in such a case.

The driver needs to harvest this error information before
resetting the GPU. Check the error type to avoid harvesting
normal gcea/mmea information.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Stanley Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_sh_mask.h

index 830080f..b4789df 100644 (file)
@@ -994,7 +994,7 @@ static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
        return ret;
 }
 
-static const struct soc15_reg_entry gfx_v9_4_rdrsp_status_regs =
+static const struct soc15_reg_entry gfx_v9_4_ea_err_status_regs =
        { SOC15_REG_ENTRY(GC, 0, mmGCEA_ERR_STATUS), 0, 1, 32 };
 
 static void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev)
@@ -1007,15 +1007,21 @@ static void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev)
 
        mutex_lock(&adev->grbm_idx_mutex);
 
-       for (i = 0; i < gfx_v9_4_rdrsp_status_regs.se_num; i++) {
-               for (j = 0; j < gfx_v9_4_rdrsp_status_regs.instance;
+       for (i = 0; i < gfx_v9_4_ea_err_status_regs.se_num; i++) {
+               for (j = 0; j < gfx_v9_4_ea_err_status_regs.instance;
                     j++) {
                        gfx_v9_4_select_se_sh(adev, i, 0, j);
                        reg_value = RREG32(SOC15_REG_ENTRY_OFFSET(
-                               gfx_v9_4_rdrsp_status_regs));
-                       if (reg_value)
+                               gfx_v9_4_ea_err_status_regs));
+                       if (REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_STATUS) ||
+                           REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_WRRSP_STATUS) ||
+                           REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) {
+                               /* SDP read/write error/parity error in FUE_IS_FATAL mode
+                                * can cause system fatal error in arcturus. Harvest the error
+                                * status before GPU reset */
                                dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n",
                                                j, reg_value);
+                       }
                }
        }
 
index 1a92177..47c8dd9 100644 (file)
@@ -1645,9 +1645,15 @@ static void mmhub_v9_4_query_ras_error_status(struct amdgpu_device *adev)
        for (i = 0; i < ARRAY_SIZE(mmhub_v9_4_err_status_regs); i++) {
                reg_value =
                        RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v9_4_err_status_regs[i]));
-               if (reg_value)
+               if (REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_STATUS) ||
+                   REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_WRRSP_STATUS) ||
+                   REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) {
+                       /* SDP read/write error/parity error in FUE_IS_FATAL mode
+                        * can cause system fatal error in arcturus. Harvest the error
+                        * status before GPU reset */
                        dev_warn(adev->dev, "MMHUB EA err detected at instance: %d, status: 0x%x!\n",
                                        i, reg_value);
+               }
        }
 }
 
index 4089cfa..849450c 100644 (file)
 #define GCEA_EDC_CNT3__MAM_A3MEM_SEC_COUNT_MASK                                                               0x30000000L
 #define GCEA_EDC_CNT3__MAM_A3MEM_DED_COUNT_MASK                                                               0xC0000000L
 
+//GCEA_ERR_STATUS
+#define GCEA_ERR_STATUS__SDP_RDRSP_STATUS__SHIFT                                                              0x0
+#define GCEA_ERR_STATUS__SDP_WRRSP_STATUS__SHIFT                                                              0x4
+#define GCEA_ERR_STATUS__SDP_RDRSP_DATASTATUS__SHIFT                                                          0x8
+#define GCEA_ERR_STATUS__SDP_RDRSP_DATAPARITY_ERROR__SHIFT                                                    0xa
+#define GCEA_ERR_STATUS__CLEAR_ERROR_STATUS__SHIFT                                                            0xb
+#define GCEA_ERR_STATUS__BUSY_ON_ERROR__SHIFT                                                                 0xc
+#define GCEA_ERR_STATUS__FUE_FLAG__SHIFT                                                                      0xd
+#define GCEA_ERR_STATUS__SDP_RDRSP_STATUS_MASK                                                                0x0000000FL
+#define GCEA_ERR_STATUS__SDP_WRRSP_STATUS_MASK                                                                0x000000F0L
+#define GCEA_ERR_STATUS__SDP_RDRSP_DATASTATUS_MASK                                                            0x00000300L
+#define GCEA_ERR_STATUS__SDP_RDRSP_DATAPARITY_ERROR_MASK                                                      0x00000400L
+#define GCEA_ERR_STATUS__CLEAR_ERROR_STATUS_MASK                                                              0x00000800L
+#define GCEA_ERR_STATUS__BUSY_ON_ERROR_MASK                                                                   0x00001000L
+#define GCEA_ERR_STATUS__FUE_FLAG_MASK                                                                        0x00002000L
+
 // addressBlock: gc_gfxudec
 //GRBM_GFX_INDEX
 #define GRBM_GFX_INDEX__INSTANCE_INDEX__SHIFT                                                                 0x0