drm/amdgpu: Use explicit cardinality for clarity

author Luben Tuikov <luben.tuikov@amd.com>

Sat, 27 Mar 2021 07:57:49 +0000 (03:57 -0400)

committer Alex Deucher <alexander.deucher@amd.com>

Thu, 1 Jul 2021 04:24:41 +0000 (00:24 -0400)
author Luben Tuikov <luben.tuikov@amd.com>
Sat, 27 Mar 2021 07:57:49 +0000 (03:57 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 1 Jul 2021 04:24:41 +0000 (00:24 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index 71beb0d..b514b7b 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -870,11 +870,10 @@ MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legac
  module_param_named(reset_method, amdgpu_reset_method, int, 0444);
  
  /**
- * DOC: bad_page_threshold (int)
- * Bad page threshold is to specify the threshold value of faulty pages
- * detected by RAS ECC, that may result in GPU entering bad status if total
- * faulty pages by ECC exceed threshold value and leave it for user's further
- * check.
+ * DOC: bad_page_threshold (int) Bad page threshold specifies the
+ * threshold value of faulty pages detected by RAS ECC, which may
+ * result in the GPU entering bad status when the total number of
+ * faulty pages detected by ECC exceeds the threshold value.
   */
  MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
  module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 6ef3cfe..79afff1 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -71,8 +71,8 @@ const char *ras_block_string[] = {
  /* inject address is 52 bits */
  #define        RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
  
-/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
-#define RAS_BAD_PAGE_RATE              (100 * 1024 * 1024ULL)
+/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
+#define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)
  
  enum amdgpu_ras_retire_page_reservation {
         AMDGPU_RAS_RETIRE_PAGE_RESERVED,
@@ -1841,27 +1841,24 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
  static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
  {
         struct amdgpu_ras_eeprom_control *control =
-                                       &adev->psp.ras.ras->eeprom_control;
-       struct eeprom_table_record *bps = NULL;
-       int ret = 0;
+               &adev->psp.ras.ras->eeprom_control;
+       struct eeprom_table_record *bps;
+       int ret;
  
         /* no bad page record, skip eeprom access */
-       if (!control->num_recs || (amdgpu_bad_page_threshold == 0))
-               return ret;
+       if (control->num_recs == 0 || amdgpu_bad_page_threshold == 0)
+               return 0;
  
         bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
         if (!bps)
                 return -ENOMEM;
  
-       if (amdgpu_ras_eeprom_read(control, bps, control->num_recs)) {
+       ret = amdgpu_ras_eeprom_read(control, bps, control->num_recs);
+       if (ret)
                 dev_err(adev->dev, "Failed to load EEPROM table records!");
-               ret = -EIO;
-               goto out;
-       }
-
-       ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
+       else
+               ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
  
-out:
         kfree(bps);
         return ret;
  }
@@ -1901,11 +1898,9 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
  }
  
  static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
-                                       uint32_t max_length)
+                                         uint32_t max_count)
  {
         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       int tmp_threshold = amdgpu_bad_page_threshold;
-       u64 val;
  
         /*
          * Justification of value bad_page_cnt_threshold in ras structure
@@ -1926,18 +1921,15 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
          *      take no effect.
          */
  
-       if (tmp_threshold < -1)
-               tmp_threshold = -1;
-       else if (tmp_threshold > max_length)
-               tmp_threshold = max_length;
+       if (amdgpu_bad_page_threshold < 0) {
+               u64 val = adev->gmc.mc_vram_size;
  
-       if (tmp_threshold == -1) {
-               val = adev->gmc.mc_vram_size;
-               do_div(val, RAS_BAD_PAGE_RATE);
+               do_div(val, RAS_BAD_PAGE_COVER);
                 con->bad_page_cnt_threshold = min(lower_32_bits(val),
-                                               max_length);
+                                                 max_count);
         } else {
-               con->bad_page_cnt_threshold = tmp_threshold;
+               con->bad_page_cnt_threshold = min_t(int, max_count,
+                                                   amdgpu_bad_page_threshold);
         }
  }
  
@@ -1945,7 +1937,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
  {
         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
         struct ras_err_handler_data **data;
-       uint32_t max_eeprom_records_len = 0;
+       u32  max_eeprom_records_count = 0;
         bool exc_err_limit = false;
         int ret;
  
@@ -1965,8 +1957,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         atomic_set(&con->in_recovery, 0);
         con->adev = adev;
  
-       max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
-       amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+       max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
+       amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
  
         /* Todo: During test the SMU might fail to read the eeprom through I2C
          * when the GPU is pending on XGMI reset during probe time
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

index 54ef315..21e1e59 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -54,7 +54,7 @@
  #define RAS_TBL_SIZE_BYTES      (256 * 1024)
  #define RAS_HDR_START           0
  #define RAS_RECORD_START        (RAS_HDR_START + RAS_TABLE_HEADER_SIZE)
-#define RAS_MAX_RECORD_NUM      ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
+#define RAS_MAX_RECORD_COUNT    ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
                                  / RAS_TABLE_RECORD_SIZE)
  
  #define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev
@@ -532,7 +532,7 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control,
                  * TODO - Check the assumption is correct
                  */
                 control->num_recs += num;
-               control->num_recs %= RAS_MAX_RECORD_NUM;
+               control->num_recs %= RAS_MAX_RECORD_COUNT;
                 control->tbl_hdr.tbl_size += RAS_TABLE_RECORD_SIZE * num;
                 if (control->tbl_hdr.tbl_size > RAS_TBL_SIZE_BYTES)
                         control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
@@ -568,9 +568,9 @@ int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
         return amdgpu_ras_eeprom_xfer(control, records, num, true);
  }
  
-inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
+inline uint32_t amdgpu_ras_eeprom_max_record_count(void)
  {
-       return RAS_MAX_RECORD_NUM;
+       return RAS_MAX_RECORD_COUNT;
  }
  
  /* Used for testing if bugs encountered */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

index 4906ed9..504729b 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -88,7 +88,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
  int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
                             struct eeprom_table_record *records, const u32 num);
  
-inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void);
+inline uint32_t amdgpu_ras_eeprom_max_record_count(void);
  
  void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control);
author	Luben Tuikov <luben.tuikov@amd.com>
	Sat, 27 Mar 2021 07:57:49 +0000 (03:57 -0400)
committer	Alex Deucher <alexander.deucher@amd.com>
	Thu, 1 Jul 2021 04:24:41 +0000 (00:24 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h		patch \| blob \| history