if (type == AMDGPU_MCA_ERROR_TYPE_UE)
amdgpu_ras_error_statistic_ue_count(err_data,
&mcm_info, &err_addr, (uint64_t)count);
- else
- amdgpu_ras_error_statistic_ce_count(err_data,
- &mcm_info, &err_addr, (uint64_t)count);
+ else {
+ if (!!(MCA_REG__STATUS__DEFERRED(entry->regs[MCA_REG_IDX_STATUS])))
+ amdgpu_ras_error_statistic_de_count(err_data,
+ &mcm_info, &err_addr, (uint64_t)count);
+ else
+ amdgpu_ras_error_statistic_ce_count(err_data,
+ &mcm_info, &err_addr, (uint64_t)count);
+ }
}
out_mca_release:
struct ras_manager *ras_mgr,
struct ras_err_data *err_data,
const char *blk_name,
- bool is_ue)
+ bool is_ue,
+ bool is_de)
{
struct amdgpu_smuio_mcm_config_info *mcm_info;
struct ras_err_node *err_node;
}
} else {
- for_each_ras_error(err_node, err_data) {
- err_info = &err_node->err_info;
- mcm_info = &err_info->mcm_info;
- if (err_info->ce_count) {
+ if (is_de) {
+ for_each_ras_error(err_node, err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
+ if (err_info->de_count) {
+ dev_info(adev->dev, "socket: %d, die: %d, "
+ "%lld new deferred hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->de_count,
+ blk_name);
+ }
+ }
+
+ for_each_ras_error(err_node, &ras_mgr->err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld new correctable hardware errors detected in %s block\n",
- mcm_info->socket_id,
- mcm_info->die_id,
- err_info->ce_count,
- blk_name);
+ "%lld deferred hardware errors detected in total in %s block\n",
+ mcm_info->socket_id, mcm_info->die_id,
+ err_info->de_count, blk_name);
+ }
+ } else {
+ for_each_ras_error(err_node, err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
+ if (err_info->ce_count) {
+ dev_info(adev->dev, "socket: %d, die: %d, "
+ "%lld new correctable hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->ce_count,
+ blk_name);
+ }
}
- }
- for_each_ras_error(err_node, &ras_mgr->err_data) {
- err_info = &err_node->err_info;
- mcm_info = &err_info->mcm_info;
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld correctable hardware errors detected in total in %s block\n",
- mcm_info->socket_id, mcm_info->die_id, err_info->ce_count, blk_name);
+ for_each_ras_error(err_node, &ras_mgr->err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
+ dev_info(adev->dev, "socket: %d, die: %d, "
+ "%lld correctable hardware errors detected in total in %s block\n",
+ mcm_info->socket_id, mcm_info->die_id,
+ err_info->ce_count, blk_name);
+ }
}
}
}
if (err_data->ce_count) {
if (err_data_has_source_info(err_data)) {
- amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, false);
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+ blk_name, false, false);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
if (err_data->ue_count) {
if (err_data_has_source_info(err_data)) {
- amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, true);
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+ blk_name, true, false);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
}
}
+ if (err_data->de_count) {
+ if (err_data_has_source_info(err_data)) {
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+ blk_name, false, true);
+ } else if (!adev->aid_mask &&
+ adev->smuio.funcs &&
+ adev->smuio.funcs->get_socket_id &&
+ adev->smuio.funcs->get_die_id) {
+ dev_info(adev->dev, "socket: %d, die: %d "
+ "%ld deferred hardware errors "
+ "detected in %s block\n",
+ adev->smuio.funcs->get_socket_id(adev),
+ adev->smuio.funcs->get_die_id(adev),
+ ras_mgr->err_data.de_count,
+ blk_name);
+ } else {
+ dev_info(adev->dev, "%ld deferred hardware errors "
+ "detected in %s block\n",
+ ras_mgr->err_data.de_count,
+ blk_name);
+ }
+ }
}
static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
if (err_data_has_source_info(err_data)) {
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
-
+ amdgpu_ras_error_statistic_de_count(&obj->err_data,
+ &err_info->mcm_info, NULL, err_info->de_count);
amdgpu_ras_error_statistic_ce_count(&obj->err_data,
&err_info->mcm_info, NULL, err_info->ce_count);
amdgpu_ras_error_statistic_ue_count(&obj->err_data,
/* for legacy asic path which doesn't has error source info */
obj->err_data.ue_count += err_data->ue_count;
obj->err_data.ce_count += err_data->ce_count;
+ obj->err_data.de_count += err_data->de_count;
}
}
info->ue_count = obj->err_data.ue_count;
info->ce_count = obj->err_data.ce_count;
+ info->de_count = obj->err_data.de_count;
amdgpu_ras_error_generate_report(adev, info, &err_data);
*/
obj->err_data.ue_count += err_data.ue_count;
obj->err_data.ce_count += err_data.ce_count;
+ obj->err_data.de_count += err_data.de_count;
}
amdgpu_ras_error_data_fini(&err_data);
return 0;
}
+int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ struct ras_err_addr *err_addr, u64 count)
+{
+ struct ras_err_info *err_info;
+
+ if (!err_data || !mcm_info)
+ return -EINVAL;
+
+ if (!count)
+ return 0;
+
+ err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+ if (!err_info)
+ return -EINVAL;
+
+ err_info->de_count += count;
+ err_data->de_count += count;
+
+ return 0;
+}
+
#define mmMP0_SMN_C2PMSG_92 0x1609C
#define mmMP0_SMN_C2PMSG_126 0x160BE
static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
struct amdgpu_smuio_mcm_config_info mcm_info;
u64 ce_count;
u64 ue_count;
+ u64 de_count;
struct ras_err_addr err_addr;
};
struct ras_err_data {
unsigned long ue_count;
unsigned long ce_count;
+ unsigned long de_count;
unsigned long err_addr_cnt;
struct eeprom_table_record *err_addr;
u32 err_list_count;
struct ras_common_if head;
unsigned long ue_count;
unsigned long ce_count;
+ unsigned long de_count;
};
struct ras_inject_if {
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info,
struct ras_err_addr *err_addr, u64 count);
+int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ struct ras_err_addr *err_addr, u64 count);
void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances);
int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
const struct aca_info *aca_info, void *data);
if (ret == AMDGPU_RAS_SUCCESS && obj) {
obj->err_data.ue_count += err_data.ue_count;
obj->err_data.ce_count += err_data.ce_count;
+ obj->err_data.de_count += err_data.de_count;
}
amdgpu_ras_error_data_fini(&err_data);
umc_v12_0_reset_error_count_per_channel, NULL);
}
+bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
+{
+ return (amdgpu_ras_is_poison_mode_supported(adev) &&
+ (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
+ (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1));
+}
+
bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
{
- if (amdgpu_ras_is_poison_mode_supported(adev) &&
- (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
- (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1))
- return true;
+ if (umc_v12_0_is_deferred_error(adev, mc_umc_status))
+ return false;
return ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
{
- if (amdgpu_ras_is_poison_mode_supported(adev) &&
- (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
- (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1))
+ if (umc_v12_0_is_deferred_error(adev, mc_umc_status))
return false;
return (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
!(umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)))));
}
-static void umc_v12_0_query_correctable_error_count(struct amdgpu_device *adev,
+static void umc_v12_0_query_error_count_per_type(struct amdgpu_device *adev,
uint64_t umc_reg_offset,
- unsigned long *error_count)
-{
- uint64_t mc_umc_status;
- uint64_t mc_umc_status_addr;
-
- mc_umc_status_addr =
- SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
-
- /* Rely on MCUMC_STATUS for correctable error counter
- * MCUMC_STATUS is a 64 bit register
- */
- mc_umc_status =
- RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
-
- if (umc_v12_0_is_correctable_error(adev, mc_umc_status))
- *error_count += 1;
-}
-
-static void umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device *adev,
- uint64_t umc_reg_offset,
- unsigned long *error_count)
+ unsigned long *error_count,
+ check_error_type_func error_type_func)
{
uint64_t mc_umc_status;
uint64_t mc_umc_status_addr;
mc_umc_status_addr =
SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
- /* Check the MCUMC_STATUS. */
+ /* Check MCUMC_STATUS */
mc_umc_status =
RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
- if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status))
+ if (error_type_func(adev, mc_umc_status))
*error_count += 1;
}
uint32_t ch_inst, void *data)
{
struct ras_err_data *err_data = (struct ras_err_data *)data;
- unsigned long ue_count = 0, ce_count = 0;
+ unsigned long ue_count = 0, ce_count = 0, de_count = 0;
/* NOTE: node_inst is converted by adev->umc.active_mask and the range is [0-3],
* which can be used as die ID directly */
uint64_t umc_reg_offset =
get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
- umc_v12_0_query_correctable_error_count(adev, umc_reg_offset, &ce_count);
- umc_v12_0_query_uncorrectable_error_count(adev, umc_reg_offset, &ue_count);
+ umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
+ &ce_count, umc_v12_0_is_correctable_error);
+ umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
+ &ue_count, umc_v12_0_is_uncorrectable_error);
+ umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
+ &de_count, umc_v12_0_is_deferred_error);
amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
+ amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, de_count);
return 0;
}
if (!mc_umc_status)
continue;
- if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) {
+ if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
+ umc_v12_0_is_deferred_error(adev, mc_umc_status)) {
uint64_t mca_addr, err_addr, mca_ipid;
uint32_t InstanceIdLo;
struct amdgpu_smuio_mcm_config_info *mcm_info;
(((_ipid_lo) >> 12) & 0xF))
#define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7)
+bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
+typedef bool (*check_error_type_func)(struct amdgpu_device *adev, uint64_t mc_umc_status);
+
extern const uint32_t
umc_v12_0_channel_idx_tbl[]
[UMC_V12_0_UMC_INSTANCE_NUM]
return 0;
}
- if (type == AMDGPU_MCA_ERROR_TYPE_UE && umc_v12_0_is_uncorrectable_error(adev, status0))
- *count = 1;
- else if (type == AMDGPU_MCA_ERROR_TYPE_CE && umc_v12_0_is_correctable_error(adev, status0))
+ if ((type == AMDGPU_MCA_ERROR_TYPE_UE && umc_v12_0_is_uncorrectable_error(adev, status0)) ||
+ (type == AMDGPU_MCA_ERROR_TYPE_CE && (umc_v12_0_is_correctable_error(adev, status0) ||
+ umc_v12_0_is_deferred_error(adev, status0))))
*count = 1;
return 0;