drm/amdgpu: MCA supports recording umc address information
author YiPeng Chai <YiPeng.Chai@amd.com>
Tue, 12 Dec 2023 09:26:58 +0000 (17:26 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 19 Dec 2023 19:59:03 +0000 (14:59 -0500)
MCA supports recording umc address information.

V2:
  Move err_addr variable from struct ras_err_node to
struct ras_err_info.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c

index 210aea5..8911310 100644 (file)
@@ -218,6 +218,7 @@ static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, st
 int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data)
 {
        struct amdgpu_smuio_mcm_config_info mcm_info;
+       struct ras_err_addr err_addr = {0};
        struct mca_bank_set mca_set;
        struct mca_bank_node *node;
        struct mca_bank_entry *entry;
@@ -246,10 +247,18 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
                mcm_info.socket_id = entry->info.socket_id;
                mcm_info.die_id = entry->info.aid;
 
+               if (blk == AMDGPU_RAS_BLOCK__UMC) {
+                       err_addr.err_status = entry->regs[MCA_REG_IDX_STATUS];
+                       err_addr.err_ipid = entry->regs[MCA_REG_IDX_IPID];
+                       err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR];
+               }
+
                if (type == AMDGPU_MCA_ERROR_TYPE_UE)
-                       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, (uint64_t)count);
+                       amdgpu_ras_error_statistic_ue_count(err_data,
+                               &mcm_info, &err_addr, (uint64_t)count);
                else
-                       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, (uint64_t)count);
+                       amdgpu_ras_error_statistic_ce_count(err_data,
+                               &mcm_info, &err_addr, (uint64_t)count);
        }
 
 out_mca_release:
index bacb59d..bad6214 100644 (file)
@@ -1156,8 +1156,10 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
                for_each_ras_error(err_node, err_data) {
                        err_info = &err_node->err_info;
 
-                       amdgpu_ras_error_statistic_ce_count(&obj->err_data, &err_info->mcm_info, err_info->ce_count);
-                       amdgpu_ras_error_statistic_ue_count(&obj->err_data, &err_info->mcm_info, err_info->ue_count);
+                       amdgpu_ras_error_statistic_ce_count(&obj->err_data,
+                                       &err_info->mcm_info, NULL, err_info->ce_count);
+                       amdgpu_ras_error_statistic_ue_count(&obj->err_data,
+                                       &err_info->mcm_info, NULL, err_info->ue_count);
                }
        } else {
                /* for legacy asic path which doesn't has error source info */
@@ -3691,7 +3693,8 @@ static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct
 }
 
 static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
-                                                     struct amdgpu_smuio_mcm_config_info *mcm_info)
+                               struct amdgpu_smuio_mcm_config_info *mcm_info,
+                               struct ras_err_addr *err_addr)
 {
        struct ras_err_node *err_node;
 
@@ -3705,6 +3708,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
 
        memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
 
+       if (err_addr)
+               memcpy(&err_node->err_info.err_addr, err_addr, sizeof(*err_addr));
+
        err_data->err_list_count++;
        list_add_tail(&err_node->node, &err_data->err_node_list);
        list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp);
@@ -3713,7 +3719,8 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
 }
 
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
-                                       struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
+               struct amdgpu_smuio_mcm_config_info *mcm_info,
+               struct ras_err_addr *err_addr, u64 count)
 {
        struct ras_err_info *err_info;
 
@@ -3723,7 +3730,7 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
        if (!count)
                return 0;
 
-       err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+       err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
        if (!err_info)
                return -EINVAL;
 
@@ -3734,7 +3741,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
 }
 
 int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
-                                       struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
+               struct amdgpu_smuio_mcm_config_info *mcm_info,
+               struct ras_err_addr *err_addr, u64 count)
 {
        struct ras_err_info *err_info;
 
@@ -3744,7 +3752,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
        if (!count)
                return 0;
 
-       err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+       err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
        if (!err_info)
                return -EINVAL;
 
index 6a941eb..76fb856 100644 (file)
@@ -452,10 +452,17 @@ struct ras_fs_data {
        char debugfs_name[32];
 };
 
+struct ras_err_addr {
+       uint64_t err_status;
+       uint64_t err_ipid;
+       uint64_t err_addr;
+};
+
 struct ras_err_info {
        struct amdgpu_smuio_mcm_config_info mcm_info;
        u64 ce_count;
        u64 ue_count;
+       struct ras_err_addr err_addr;
 };
 
 struct ras_err_node {
@@ -806,8 +813,10 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
 int amdgpu_ras_error_data_init(struct ras_err_data *err_data);
 void amdgpu_ras_error_data_fini(struct ras_err_data *err_data);
 int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
-                                       struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count);
+               struct amdgpu_smuio_mcm_config_info *mcm_info,
+               struct ras_err_addr *err_addr, u64 count);
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
-                                       struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count);
+               struct amdgpu_smuio_mcm_config_info *mcm_info,
+               struct ras_err_addr *err_addr, u64 count);
 
 #endif
index 9a95b9f..a6c88f2 100644 (file)
@@ -1313,10 +1313,10 @@ static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct a
 
        switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
        case AMDGPU_MCA_ERROR_TYPE_UE:
-               amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL);
+               amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, NULL, 1ULL);
                break;
        case AMDGPU_MCA_ERROR_TYPE_CE:
-               amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL);
+               amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, NULL, 1ULL);
                break;
        default:
                break;
index 00b21ec..131cddb 100644 (file)
@@ -3828,8 +3828,8 @@ static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev,
        /* the caller should make sure initialize value of
         * err_data->ue_count and err_data->ce_count
         */
-       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
-       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
+       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
+       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
 }
 
 static void gfx_v9_4_3_inst_reset_ras_err_count(struct amdgpu_device *adev,
index 9b01467..fb53aac 100644 (file)
@@ -652,8 +652,8 @@ static void mmhub_v1_8_inst_query_ras_error_count(struct amdgpu_device *adev,
                                        AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
                                        &ue_count);
 
-       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
-       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
+       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
+       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
 }
 
 static void mmhub_v1_8_query_ras_error_count(struct amdgpu_device *adev,
index 0f24af6..2d688dc 100644 (file)
@@ -2156,7 +2156,7 @@ static void sdma_v4_4_2_inst_query_ras_error_count(struct amdgpu_device *adev,
                                        AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
                                        &ue_count);
 
-       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
+       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
 }
 
 static void sdma_v4_4_2_query_ras_error_count(struct amdgpu_device *adev,
index e9c2ff7..8d60c39 100644 (file)
@@ -166,8 +166,8 @@ static int umc_v12_0_query_error_count(struct amdgpu_device *adev,
        umc_v12_0_query_correctable_error_count(adev, umc_reg_offset, &ce_count);
        umc_v12_0_query_uncorrectable_error_count(adev, umc_reg_offset, &ue_count);
 
-       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
-       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
+       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
+       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
 
        return 0;
 }