drm/amdgpu: add helper functions to query umc ras error
author: Hawking Zhang <Hawking.Zhang@amd.com>
Mon, 8 Mar 2021 12:43:15 +0000 (20:43 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Apr 2021 20:50:56 +0000 (16:50 -0400)
Add helper functions to query correctable and
uncorrectable UMC RAS errors.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: John Clements <John.Clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
drivers/gpu/drm/amd/amdgpu/umc_v6_7.h

index 37aa1cf..05fec10 100644 (file)
 #include "amdgpu_ras.h"
 #include "amdgpu.h"
 
+#include "umc/umc_6_7_0_offset.h"
+#include "umc/umc_6_7_0_sh_mask.h"
+/*
+ * umc_v6_7_query_correctable_error_count - accumulate correctable error count
+ *
+ * @adev:           amdgpu device; not referenced by name in this body
+ *                  (presumably consumed by the register access macros --
+ *                  TODO confirm)
+ * @umc_reg_offset: offset added to each base register offset before the
+ *                  access
+ * @error_count:    running total; this function only adds to it and never
+ *                  resets it
+ *
+ * Adds to @error_count:
+ *   - the EccErrCnt field read with EccErrCntCsSel = 0, minus
+ *     UMC_V6_7_CE_CNT_INIT
+ *   - the EccErrCnt field read with EccErrCntCsSel = 1, minus
+ *     UMC_V6_7_CE_CNT_INIT
+ *   - one more if MCUMC_STATUS has both Val and CECC set
+ *
+ * Side effects visible here: EccErrCntSel is rewritten twice and is left
+ * with EccErrCntCsSel = 1 on return. Neither the counters nor the status
+ * register are cleared by this function.
+ */
+static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
+                                                  uint32_t umc_reg_offset,
+                                                  unsigned long *error_count)
+{
+       uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
+       uint32_t ecc_err_cnt, ecc_err_cnt_addr;
+       uint64_t mc_umc_status;
+       uint32_t mc_umc_status_addr;
+
+       /*
+        * UMC 6_7 register base offsets (the earlier "6_1_1" wording looked
+        * like a leftover from another UMC version). umc_reg_offset is added
+        * at each access below and the sum is multiplied by 4 -- presumably
+        * converting a dword offset to a byte address, TODO confirm.
+        */
+       ecc_err_cnt_sel_addr =
+               SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel);
+       ecc_err_cnt_addr =
+               SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt);
+       mc_umc_status_addr =
+               SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
+
+       /*
+        * select the lower chip (EccErrCntCsSel = 0) with a read-modify-write
+        * of EccErrCntSel, then check the error count
+        */
+       ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
+       ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
+                                       EccErrCntCsSel, 0);
+       WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
+
+       ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
+       *error_count +=
+               (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
+                UMC_V6_7_CE_CNT_INIT);
+
+       /*
+        * select the higher chip (EccErrCntCsSel = 1) and check the err
+        * counter; the selector value held from the read above is reused, so
+        * EccErrCntSel is not read back a second time
+        */
+       ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
+                                       EccErrCntCsSel, 1);
+       WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
+
+       ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
+       *error_count +=
+               (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
+                UMC_V6_7_CE_CNT_INIT);
+
+       /* check for SRAM correctable error: counted once when both Val and
+         CECC are set. MCUMC_STATUS is a 64 bit register, hence the 64-bit
+         read into a uint64_t */
+       mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
+       if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
+           REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
+               *error_count += 1;
+}
+/*
+ * umc_v6_7_querry_uncorrectable_error_count - count an uncorrectable error
+ *
+ * @adev:           amdgpu device; not referenced by name in this body
+ *                  (presumably consumed by the register access macros --
+ *                  TODO confirm)
+ * @umc_reg_offset: offset added to the MCUMC_STATUS base register offset
+ * @error_count:    running total; incremented by at most one per call and
+ *                  never reset here
+ *
+ * Reads the 64-bit MCUMC_STATUS register once and adds one to @error_count
+ * when Val is set together with at least one of Deferred, UECC, PCC, UC or
+ * TCC. The status register is only read, not written, by this function.
+ *
+ * NOTE(review): "querry" in the name is a misspelling of "query"; it is left
+ * untouched because renaming would affect callers outside this view.
+ */
+static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
+                                                     uint32_t umc_reg_offset,
+                                                     unsigned long *error_count)
+{
+       uint64_t mc_umc_status;
+       uint32_t mc_umc_status_addr;
+
+       mc_umc_status_addr =
+               SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
+
+       /*
+        * check the MCUMC_STATUS: Val must be set, and any one of the
+        * Deferred/UECC/PCC/UC/TCC bits is enough to count the error
+        */
+       mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
+       if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
+           (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
+           REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
+           REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
+           REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
+           REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
+               *error_count += 1;
+}
+
 /* UMC v6.7 callback table; .ras_late_init is the only member set in the lines shown here. */
 const struct amdgpu_umc_funcs umc_v6_7_funcs = {
        .ras_late_init = amdgpu_umc_ras_late_init,
 };
index 8c2ce69..6b88122 100644 (file)
 #ifndef __UMC_V6_7_H__
 #define __UMC_V6_7_H__
 
+/* EccErrCnt max value (largest value used for the correctable error counter) */
+#define UMC_V6_7_CE_CNT_MAX            0xffff
+/* umc ce (correctable error) interrupt threshold */
+#define UMC_V6_7_CE_INT_THRESHOLD      0xffff
+/* umc ce count initial value: max minus threshold, which is 0 with the
+   values above. The correctable-error query in umc_v6_7.c subtracts this
+   from each raw EccErrCnt reading. NOTE(review): presumably the hardware
+   counter is preloaded with this value -- confirm against the init path. */
+#define UMC_V6_7_CE_CNT_INIT   (UMC_V6_7_CE_CNT_MAX - UMC_V6_7_CE_INT_THRESHOLD)
+
 extern const struct amdgpu_umc_funcs umc_v6_7_funcs;
 
 #endif