drm/amdgpu: Add helper to initialize badpage info
author Lijo Lazar <lijo.lazar@amd.com>
Fri, 30 Aug 2024 05:51:43 +0000 (11:21 +0530)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 26 Sep 2024 21:06:38 +0000 (17:06 -0400)
Add a separate function to read bad page data during initialization.
Reading bad pages requires hardware access and cannot be done during
reset. Hence, in cases where the device needs a full reset during
init itself, attempting to read them will cause a deadlock.
amdgpu_ras_recovery_init() takes a new init_bp_info argument so that
the read can be skipped in such cases.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Feifei Xu <Feifei.Xu@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

index f01e22b..1ee6449 100644 (file)
@@ -2953,7 +2953,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
         * Note: theoretically, this should be called before all vram allocations
         * to protect retired page from abusing
         */
-       r = amdgpu_ras_recovery_init(adev);
+       r = amdgpu_ras_recovery_init(adev, true);
        if (r)
                goto init_failed;
 
index 6665d9a..7e63fc0 100644 (file)
@@ -3146,7 +3146,42 @@ static int amdgpu_ras_page_retirement_thread(void *param)
        return 0;
 }
 
-int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       int ret;
+
+       if (!con || amdgpu_sriov_vf(adev))
+               return 0; /* no RAS context or SR-IOV VF: skipped, reported as success */
+
+       ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
+
+       if (ret)
+               return ret;
+
+       /* HW not usable: RAS reports RMA, so fail before loading any bad pages */
+       if (amdgpu_ras_is_rma(adev))
+               return -EHWPOISON;
+
+       if (con->eeprom_control.ras_num_recs) { /* only when records exist */
+               ret = amdgpu_ras_load_bad_pages(adev);
+               if (ret)
+                       return ret;
+
+               amdgpu_dpm_send_hbm_bad_pages_num(
+                       adev, con->eeprom_control.ras_num_recs);
+
+               if (con->update_channel_flag == true) {
+                       amdgpu_dpm_send_hbm_bad_channel_flag(
+                               adev, con->eeprom_control.bad_channel_bitmap);
+                       con->update_channel_flag = false; /* cleared once the bitmap is sent */
+               }
+       }
+
+       return ret; /* always 0 here: every failure above returned early */
+}
+
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data **data;
@@ -3187,25 +3222,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         */
        if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
                return 0;
-       ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
-       /*
-        * This calling fails when is_rma is true or
-        * ret != 0.
-        */
-       if (amdgpu_ras_is_rma(adev) || ret)
-               goto free;
-
-       if (con->eeprom_control.ras_num_recs) {
-               ret = amdgpu_ras_load_bad_pages(adev);
+       if (init_bp_info) {
+               ret = amdgpu_ras_init_badpage_info(adev);
                if (ret)
                        goto free;
-
-               amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
-
-               if (con->update_channel_flag == true) {
-                       amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
-                       con->update_channel_flag = false;
-               }
        }
 
        mutex_init(&con->page_rsv_lock);
index 669720a..871b2d6 100644 (file)
@@ -736,8 +736,8 @@ struct amdgpu_ras_block_hw_ops {
  * 8: feature disable
  */
 
-
-int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
+int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev);
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info);
 
 void amdgpu_ras_resume(struct amdgpu_device *adev);
 void amdgpu_ras_suspend(struct amdgpu_device *adev);