drm/amdgpu: Add psp v13 function to query boot status
authorHawking Zhang <Hawking.Zhang@amd.com>
Fri, 3 Nov 2023 05:28:04 +0000 (13:28 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Fri, 3 Nov 2023 16:18:33 +0000 (12:18 -0400)
Add psp v13 function to query boot status.

v2: limit the use case to dGPU only (Lijo)

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Le Ma <le.ma@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
drivers/gpu/drm/amd/amdgpu/psp_v13_0.c

index 648bd5e..32b701c 100644 (file)
@@ -2120,6 +2120,21 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev)
        return ret;
 }
 
+int amdgpu_psp_query_boot_status(struct amdgpu_device *adev)
+{
+       struct psp_context *psp = &adev->psp;
+       int ret = 0;
+
+       if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU))
+               return 0;
+
+       if (psp->funcs &&
+           psp->funcs->query_boot_status)
+               ret = psp->funcs->query_boot_status(psp);
+
+       return ret;
+}
+
 static int psp_hw_start(struct psp_context *psp)
 {
        struct amdgpu_device *adev = psp->adev;
index 7111dd3..5d36ad3 100644 (file)
@@ -134,6 +134,7 @@ struct psp_funcs {
        int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
        int (*vbflash_stat)(struct psp_context *psp);
        int (*fatal_error_recovery_quirk)(struct psp_context *psp);
+       int (*query_boot_status)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -537,4 +538,6 @@ int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
 
+int amdgpu_psp_query_boot_status(struct amdgpu_device *adev);
+
 #endif
index 4142e2f..3cf4684 100644 (file)
@@ -759,6 +759,83 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct psp_context *psp)
        return 0;
 }
 
+
+static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev,
+                                          uint32_t inst,
+                                          uint32_t boot_error)
+{
+       uint32_t socket_id;
+       uint32_t aid_id;
+       uint32_t hbm_id;
+       uint32_t reg_data;
+
+       socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID);
+       aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID);
+       hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID);
+
+       reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109);
+       dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n",
+                socket_id, aid_id, reg_data);
+
+       if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING))
+               dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n",
+                        socket_id, aid_id, hbm_id);
+
+       if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD))
+               dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n",
+                        socket_id, aid_id);
+
+       if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_WAFL_LINK_TRAINING))
+               dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n",
+                        socket_id, aid_id);
+
+       if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_XGMI_LINK_TRAINING))
+               dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n",
+                        socket_id, aid_id);
+
+       if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_USR_CP_LINK_TRAINING))
+               dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n",
+                        socket_id, aid_id);
+
+       if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_USR_DP_LINK_TRAINING))
+               dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training failed\n",
+                        socket_id, aid_id);
+
+       if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_HBM_MEM_TEST))
+               dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n",
+                        socket_id, aid_id, hbm_id);
+
+       if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_HBM_BIST_TEST))
+               dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n",
+                        socket_id, aid_id, hbm_id);
+}
+
+static int psp_v13_0_query_boot_status(struct psp_context *psp)
+{
+       struct amdgpu_device *adev = psp->adev;
+       int inst_mask = adev->aid_mask;
+       uint32_t reg_data;
+       uint32_t i;
+       int ret = 0;
+
+       if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(13, 0, 6))
+               return 0;
+
+       if (RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_59) < 0x00a10007)
+               return 0;
+
+       for_each_inst(i, inst_mask) {
+               reg_data = RREG32_SOC15(MP0, i, regMP0_SMN_C2PMSG_126);
+               if (!REG_GET_FIELD(reg_data, MP0_SMN_C2PMSG_126, BOOT_STATUS)) {
+                       psp_v13_0_boot_error_reporting(adev, i, reg_data);
+                       ret = -EINVAL;
+                       break;
+               }
+       }
+
+       return ret;
+}
+
 static const struct psp_funcs psp_v13_0_funcs = {
        .init_microcode = psp_v13_0_init_microcode,
        .wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state,
@@ -781,6 +858,7 @@ static const struct psp_funcs psp_v13_0_funcs = {
        .update_spirom = psp_v13_0_update_spirom,
        .vbflash_stat = psp_v13_0_vbflash_status,
        .fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk,
+       .query_boot_status = psp_v13_0_query_boot_status,
 };
 
 void psp_v13_0_set_psp_funcs(struct psp_context *psp)