drm/amdgpu: correct smu v13.0.6 umc ras error check
author Yang Wang <kevinyang.wang@amd.com>
Tue, 24 Oct 2023 06:00:39 +0000 (14:00 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 9 Nov 2023 22:01:20 +0000 (17:01 -0500)
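Correct the smu v13.0.6 umc ras error check: the UMC error-count callback
now classifies the MCA status value with umc_v12_0_is_uncorrectable_error()
and umc_v12_0_is_correctable_error() according to the requested error type,
instead of counting every bank whose Val bit is set.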

Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c

index 770b4b4..e9c2ff7 100644 (file)
@@ -88,7 +88,7 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device *adev)
                umc_v12_0_reset_error_count_per_channel, NULL);
 }
 
-static bool umc_v12_0_is_uncorrectable_error(uint64_t mc_umc_status)
+bool umc_v12_0_is_uncorrectable_error(uint64_t mc_umc_status)
 {
        return ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
                (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
@@ -96,7 +96,7 @@ static bool umc_v12_0_is_uncorrectable_error(uint64_t mc_umc_status)
                REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1));
 }
 
-static bool umc_v12_0_is_correctable_error(uint64_t mc_umc_status)
+bool umc_v12_0_is_correctable_error(uint64_t mc_umc_status)
 {
        return (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
                (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1 ||
index 4885b9f..b34b1e3 100644 (file)
                (pa) |= (UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) << UMC_V12_0_PA_CH6_BIT); \
        } while (0)
 
+bool umc_v12_0_is_uncorrectable_error(uint64_t mc_umc_status);
+bool umc_v12_0_is_correctable_error(uint64_t mc_umc_status);
+
 extern const uint32_t
        umc_v12_0_channel_idx_tbl[]
                        [UMC_V12_0_UMC_INSTANCE_NUM]
index 83e1228..a6b57a5 100644 (file)
@@ -48,6 +48,7 @@
 #include "smu_cmn.h"
 #include "mp/mp_13_0_6_offset.h"
 #include "mp/mp_13_0_6_sh_mask.h"
+#include "umc_v12_0.h"
 
 #undef MP1_Public
 #undef smnMP1_FIRMWARE_FLAGS
@@ -2481,7 +2482,7 @@ static int mca_decode_mca_ipid(struct amdgpu_device *adev, enum amdgpu_mca_error
        return 0;
 }
 
-static int mca_normal_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
+static int mca_umc_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
                                        enum amdgpu_mca_error_type type, int idx, uint32_t *count)
 {
        uint64_t status0;
@@ -2491,10 +2492,15 @@ static int mca_normal_mca_get_err_count(const struct mca_ras_info *mca_ras, stru
        if (ret)
                return ret;
 
-       if (REG_GET_FIELD(status0, MCMP1_STATUST0, Val))
-               *count = 1;
-       else
+       if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) {
                *count = 0;
+               return 0;
+       }
+
+       if (type == AMDGPU_MCA_ERROR_TYPE_UE && umc_v12_0_is_uncorrectable_error(status0))
+               *count = 1;
+       else if (type == AMDGPU_MCA_ERROR_TYPE_CE && umc_v12_0_is_correctable_error(status0))
+               *count = 1;
 
        return 0;
 }
@@ -2608,7 +2614,7 @@ static const struct mca_ras_info mca_ras_table[] = {
        {
                .blkid = AMDGPU_RAS_BLOCK__UMC,
                .ip = AMDGPU_MCA_IP_UMC,
-               .get_err_count = mca_normal_mca_get_err_count,
+               .get_err_count = mca_umc_mca_get_err_count,
        }, {
                .blkid = AMDGPU_RAS_BLOCK__GFX,
                .ip = AMDGPU_MCA_IP_MP5,