drm/amdgpu: Fix SDMA RAS error reporting on Aldebaran
author Mukul Joshi <mukul.joshi@amd.com>
Wed, 24 Mar 2021 15:51:35 +0000 (11:51 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 21 Apr 2021 01:45:17 +0000 (21:45 -0400)
Fix the following issues with SDMA RAS error reporting:
1. Read the EDC_COUNTER2 register also to fetch error counts
   for all sub-blocks in SDMA.
2. SDMA RAS on Aldebaran supports single-bit uncorrectable errors
   only. So, report error count in UE count instead of CE count.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-By: John Clements <John.Clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c

index 6fcb95c..bf95007 100644 (file)
@@ -160,6 +160,7 @@ static const struct soc15_ras_field_entry sdma_v4_4_ras_fields[] = {
 };
 
 static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev,
+                                         uint32_t reg_offset,
                                          uint32_t value,
                                          uint32_t instance,
                                          uint32_t *sec_count)
@@ -169,6 +170,9 @@ static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev,
 
        /* double bits error (multiple bits) error detection is not supported */
        for (i = 0; i < ARRAY_SIZE(sdma_v4_4_ras_fields); i++) {
+               if (sdma_v4_4_ras_fields[i].reg_offset != reg_offset)
+                       continue;
+
                /* the SDMA_EDC_COUNTER register in each sdma instance
                 * shares the same sed shift_mask
                 * */
@@ -197,13 +201,30 @@ static int sdma_v4_4_query_ras_error_count(struct amdgpu_device *adev,
        reg_value = RREG32(reg_offset);
        /* double bit error is not supported */
        if (reg_value)
-               sdma_v4_4_get_ras_error_count(adev, reg_value, instance, &sec_count);
-       /* err_data->ce_count should be initialized to 0
-        * before calling into this function */
-       err_data->ce_count += sec_count;
-       /* double bit error is not supported
-        * set ue count to 0 */
-       err_data->ue_count = 0;
+               sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER, reg_value,
+                                             instance, &sec_count);
+
+       reg_offset = sdma_v4_4_get_reg_offset(adev, instance, regSDMA0_EDC_COUNTER2);
+       reg_value = RREG32(reg_offset);
+       /* double bit error is not supported */
+       if (reg_value)
+               sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER2, reg_value,
+                                             instance, &sec_count);
+
+       /*
+        * err_data->ue_count should be initialized to 0
+        * before calling into this function
+        *
+        * SDMA RAS supports single bit uncorrectable error detection.
+        * So, increment uncorrectable error count.
+        */
+       err_data->ue_count += sec_count;
+
+       /*
+        * SDMA RAS does not support correctable errors.
+        * Set ce count to 0.
+        */
+       err_data->ce_count = 0;
 
        return 0;
 };