X-Git-Url: http://git.monstr.eu/?a=blobdiff_plain;f=drivers%2Fgpu%2Fdrm%2Famd%2Famdgpu%2Famdgpu_device.c;h=c3552079bb8621e3628c4a219b28494aec562d75;hb=751f43e75d63103cec3b6be4d451b6a3e569e87b;hp=0b3db212b5a4d7e3cf9bd035c699f1411c2676bc;hpb=655ce9cb13b5967558d81dd644868473ecfb5ee4;p=linux-2.6-microblaze.git diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0b3db212b5a4..c3552079bb86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1224,6 +1224,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) } } + /* Don't post if we need to reset whole hive on init */ + if (adev->gmc.xgmi.pending_reset) + return false; + if (adev->has_hw_reset) { adev->has_hw_reset = false; return true; @@ -2154,6 +2158,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) continue; + if (!adev->ip_blocks[i].status.sw) + continue; + /* no need to do the fw loading again if already done*/ if (adev->ip_blocks[i].status.hw == true) break; @@ -2294,7 +2301,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(adev); - amdgpu_amdkfd_device_init(adev); + + /* Don't init kfd if whole hive need to be reset during init */ + if (!adev->gmc.xgmi.pending_reset) + amdgpu_amdkfd_device_init(adev); amdgpu_fru_get_product_info(adev); @@ -2739,6 +2749,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) adev->ip_blocks[i].status.hw = false; continue; } + + /* skip unnecessary suspend if we do not initialize them yet */ + if (adev->gmc.xgmi.pending_reset && + !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { + adev->ip_blocks[i].status.hw = false; + continue; + } /* XXX handle errors */ r = adev->ip_blocks[i].version->funcs->suspend(adev); /* XXX handle errors */ @@ -3414,10 +3434,28 @@ int amdgpu_device_init(struct amdgpu_device *adev, * E.g., driver was not cleanly unloaded previously, etc. */ if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { - r = amdgpu_asic_reset(adev); - if (r) { - dev_err(adev->dev, "asic reset on init failed\n"); - goto failed; + if (adev->gmc.xgmi.num_physical_nodes) { + dev_info(adev->dev, "Pending hive reset.\n"); + adev->gmc.xgmi.pending_reset = true; + /* Only need to init necessary block for SMU to handle the reset */ + for (i = 0; i < adev->num_ip_blocks; i++) { + if (!adev->ip_blocks[i].status.valid) + continue; + if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { + DRM_DEBUG("IP %s disabled for hw_init.\n", + adev->ip_blocks[i].version->funcs->name); + adev->ip_blocks[i].status.hw = true; + } + } + } else { + r = amdgpu_asic_reset(adev); + if (r) { + dev_err(adev->dev, "asic reset on init failed\n"); + goto failed; + } } } @@ -3548,19 +3586,19 @@ fence_driver_init: /* enable clockgating, etc. after ib tests, etc. since some blocks require * explicit gating rather than handling it automatically. */ - r = amdgpu_device_ip_late_init(adev); - if (r) { - dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); - goto failed; + if (!adev->gmc.xgmi.pending_reset) { + r = amdgpu_device_ip_late_init(adev); + if (r) { + dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); + amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); + goto failed; + } + /* must succeed. */ + amdgpu_ras_resume(adev); + queue_delayed_work(system_wq, &adev->delayed_init_work, + msecs_to_jiffies(AMDGPU_RESUME_MS)); } - /* must succeed. */ - amdgpu_ras_resume(adev); - - queue_delayed_work(system_wq, &adev->delayed_init_work, - msecs_to_jiffies(AMDGPU_RESUME_MS)); - if (amdgpu_sriov_vf(adev)) flush_delayed_work(&adev->delayed_init_work); @@ -3577,6 +3615,14 @@ fence_driver_init: if (amdgpu_device_cache_pci_state(adev->pdev)) pci_restore_state(pdev); + /* Enable lightSBR on SMU in passthrough + xgmi configuration */ + if (amdgpu_passthrough(adev) && adev->gmc.xgmi.num_physical_nodes > 1) + smu_set_light_sbr(&adev->smu, true); + + if (adev->gmc.xgmi.pending_reset) + queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, + msecs_to_jiffies(AMDGPU_RESUME_MS)); + return 0; failed: @@ -4287,14 +4333,16 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) return ret; } -static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, - struct amdgpu_job *job, - bool *need_full_reset_arg) +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, + struct amdgpu_job *job, + bool *need_full_reset_arg) { int i, r = 0; bool need_full_reset = *need_full_reset_arg; - amdgpu_debugfs_wait_dump(adev); + /* no need to dump if device is not in good state during probe period */ + if (!adev->gmc.xgmi.pending_reset) + amdgpu_debugfs_wait_dump(adev); if (amdgpu_sriov_vf(adev)) { /* stop the data exchange thread */ @@ -4340,10 +4388,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, return r; } -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, - struct list_head *device_list_handle, - bool *need_full_reset_arg, - bool skip_hw_reset) +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, + struct list_head *device_list_handle, + bool *need_full_reset_arg, + bool skip_hw_reset) { struct amdgpu_device *tmp_adev = NULL; bool need_full_reset = *need_full_reset_arg, vram_lost = false; @@ -4357,6 +4405,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, list_for_each_entry(tmp_adev, device_list_handle, reset_list) { /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { + tmp_adev->gmc.xgmi.pending_reset = false; if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) r = -EALREADY; } else @@ -4395,10 +4444,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, list_for_each_entry(tmp_adev, device_list_handle, reset_list) { if (need_full_reset) { /* post card */ - if (amdgpu_device_asic_init(tmp_adev)) + r = amdgpu_device_asic_init(tmp_adev); + if (r) { dev_warn(tmp_adev->dev, "asic atom init failed!"); - - if (!r) { + } else { dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); r = amdgpu_device_ip_resume_phase1(tmp_adev); if (r) @@ -4431,6 +4480,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, */ amdgpu_register_gpu_instance(tmp_adev); + if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) + amdgpu_xgmi_add_device(tmp_adev); + r = amdgpu_device_ip_late_init(tmp_adev); if (r) goto out;