Merge tag 'dt-5.15' of git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc

[linux-2.6-microblaze.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_ras.c
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index fc66aca..96a8fd0 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -64,15 +64,14 @@ const char *ras_block_string[] = {
  };
  
  #define ras_err_str(i) (ras_error_string[ffs(i)])
-#define ras_block_str(i) (ras_block_string[i])
  
  #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
  
  /* inject address is 52 bits */
  #define        RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
  
-/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
-#define RAS_BAD_PAGE_RATE              (100 * 1024 * 1024ULL)
+/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
+#define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)
  
  enum amdgpu_ras_retire_page_reservation {
         AMDGPU_RAS_RETIRE_PAGE_RESERVED,
@@ -355,8 +354,9 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
   *     to see which blocks support RAS on a particular asic.
   *
   */
-static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
-               size_t size, loff_t *pos)
+static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
+                                            const char __user *buf,
+                                            size_t size, loff_t *pos)
  {
         struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
         struct ras_debug_if data;
@@ -370,7 +370,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
  
         ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
         if (ret)
-               return -EINVAL;
+               return ret;
  
         if (data.op == 3) {
                 ret = amdgpu_reserve_page_direct(adev, data.inject.address);
@@ -403,9 +403,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
                 /* umc ce/ue error injection for a bad page is not allowed */
                 if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
                     amdgpu_ras_check_bad_page(adev, data.inject.address)) {
-                       dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked "
-                                       "as bad before error injection!\n",
-                                       data.inject.address);
+                       dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
+                                "already been marked as bad!\n",
+                                data.inject.address);
                         break;
                 }
  
@@ -439,21 +439,24 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
   * will reset EEPROM table to 0 entries.
   *
   */
-static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf,
-               size_t size, loff_t *pos)
+static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
+                                              const char __user *buf,
+                                              size_t size, loff_t *pos)
  {
         struct amdgpu_device *adev =
                 (struct amdgpu_device *)file_inode(f)->i_private;
         int ret;
  
         ret = amdgpu_ras_eeprom_reset_table(
-                       &(amdgpu_ras_get_context(adev)->eeprom_control));
+               &(amdgpu_ras_get_context(adev)->eeprom_control));
  
-       if (ret == 1) {
+       if (!ret) {
+               /* Something was written to EEPROM.
+                */
                 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
                 return size;
         } else {
-               return -EIO;
+               return ret;
         }
  }
  
@@ -526,7 +529,7 @@ static inline void put_obj(struct ras_manager *obj)
         if (obj && (--obj->use == 0))
                 list_del(&obj->node);
         if (obj && (obj->use < 0))
-               DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
+               DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", ras_block_str(obj->head.block));
  }
  
  /* make one obj and return it. */
@@ -789,7 +792,6 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
                         .type = default_ras_type,
                         .sub_block_index = 0,
                 };
-               strcpy(head.name, ras_block_str(i));
                 if (bypass) {
                         /*
                          * bypass psp. vbios enable ras for us.
@@ -1316,6 +1318,12 @@ static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *
                            &con->bad_page_cnt_threshold);
         debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
         debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
+       debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
+                           &amdgpu_ras_debugfs_eeprom_size_ops);
+       con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
+                                                      S_IRUGO, dir, adev,
+                                                      &amdgpu_ras_debugfs_eeprom_table_ops);
+       amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);
  
         /*
          * After one uncorrectable error happens, usually GPU recovery will
@@ -1833,13 +1841,12 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
  
         control = &con->eeprom_control;
         data = con->eh_data;
-       save_count = data->count - control->num_recs;
+       save_count = data->count - control->ras_num_recs;
         /* only new entries are saved */
         if (save_count > 0) {
-               if (amdgpu_ras_eeprom_process_recods(control,
-                                                       &data->bps[control->num_recs],
-                                                       true,
-                                                       save_count)) {
+               if (amdgpu_ras_eeprom_append(control,
+                                            &data->bps[control->ras_num_recs],
+                                            save_count)) {
                         dev_err(adev->dev, "Failed to save EEPROM table data!");
                         return -EIO;
                 }
@@ -1857,28 +1864,24 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
  static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
  {
         struct amdgpu_ras_eeprom_control *control =
-                                       &adev->psp.ras.ras->eeprom_control;
-       struct eeprom_table_record *bps = NULL;
-       int ret = 0;
+               &adev->psp.ras_context.ras->eeprom_control;
+       struct eeprom_table_record *bps;
+       int ret;
  
         /* no bad page record, skip eeprom access */
-       if (!control->num_recs || (amdgpu_bad_page_threshold == 0))
-               return ret;
+       if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
+               return 0;
  
-       bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
+       bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
         if (!bps)
                 return -ENOMEM;
  
-       if (amdgpu_ras_eeprom_process_recods(control, bps, false,
-               control->num_recs)) {
+       ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
+       if (ret)
                 dev_err(adev->dev, "Failed to load EEPROM table records!");
-               ret = -EIO;
-               goto out;
-       }
-
-       ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
+       else
+               ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);
  
-out:
         kfree(bps);
         return ret;
  }
@@ -1918,11 +1921,9 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
  }
  
  static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
-                                       uint32_t max_length)
+                                         uint32_t max_count)
  {
         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       int tmp_threshold = amdgpu_bad_page_threshold;
-       u64 val;
  
         /*
          * Justification of value bad_page_cnt_threshold in ras structure
@@ -1943,18 +1944,15 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
          *      take no effect.
          */
  
-       if (tmp_threshold < -1)
-               tmp_threshold = -1;
-       else if (tmp_threshold > max_length)
-               tmp_threshold = max_length;
+       if (amdgpu_bad_page_threshold < 0) {
+               u64 val = adev->gmc.mc_vram_size;
  
-       if (tmp_threshold == -1) {
-               val = adev->gmc.mc_vram_size;
-               do_div(val, RAS_BAD_PAGE_RATE);
+               do_div(val, RAS_BAD_PAGE_COVER);
                 con->bad_page_cnt_threshold = min(lower_32_bits(val),
-                                               max_length);
+                                                 max_count);
         } else {
-               con->bad_page_cnt_threshold = tmp_threshold;
+               con->bad_page_cnt_threshold = min_t(int, max_count,
+                                                   amdgpu_bad_page_threshold);
         }
  }
  
@@ -1962,15 +1960,24 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
  {
         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
         struct ras_err_handler_data **data;
-       uint32_t max_eeprom_records_len = 0;
+       u32  max_eeprom_records_count = 0;
         bool exc_err_limit = false;
         int ret;
  
-       if (adev->ras_enabled && con)
-               data = &con->eh_data;
-       else
+       if (!con)
                 return 0;
  
+       /* Allow access to the RAS EEPROM via debugfs when the ASIC
+        * supports RAS and debugfs is enabled, even when
+        * adev->ras_enabled is unset, i.e. when the "ras_enable"
+        * module parameter is set to 0.
+        */
+       con->adev = adev;
+
+       if (!adev->ras_enabled)
+               return 0;
+
+       data = &con->eh_data;
         *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
         if (!*data) {
                 ret = -ENOMEM;
@@ -1980,10 +1987,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         mutex_init(&con->recovery_lock);
         INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
         atomic_set(&con->in_recovery, 0);
-       con->adev = adev;
  
-       max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
-       amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+       max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
+       amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
  
         /* Todo: During test the SMU might fail to read the eeprom through I2C
          * when the GPU is pending on XGMI reset during probe time
@@ -1999,13 +2005,13 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         if (exc_err_limit || ret)
                 goto free;
  
-       if (con->eeprom_control.num_recs) {
+       if (con->eeprom_control.ras_num_recs) {
                 ret = amdgpu_ras_load_bad_pages(adev);
                 if (ret)
                         goto free;
  
                 if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num)
-                       adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.num_recs);
+                       adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
         }
  
         return 0;
@@ -2015,7 +2021,7 @@ free:
         kfree(*data);
         con->eh_data = NULL;
  out:
-       dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
+       dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret);
  
         /*
          * Except error threshold exceeding case, other failure cases in this