X-Git-Url: http://git.monstr.eu/?a=blobdiff_plain;f=drivers%2Fgpu%2Fdrm%2Famd%2Famdgpu%2Famdgpu_device.c;h=c3552079bb8621e3628c4a219b28494aec562d75;hb=751f43e75d63103cec3b6be4d451b6a3e569e87b;hp=0ee6514ee55c0cf74e39df0cb86ff79092d5fc6b;hpb=a8d3d80a8ca3df47a846937809fc1e1d8e8fbce2;p=linux-2.6-microblaze.git diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0ee6514ee55c..c3552079bb86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -110,6 +110,7 @@ const char *amdgpu_asic_name[] = { "RAVEN", "ARCTURUS", "RENOIR", + "ALDEBARAN", "NAVI10", "NAVI14", "NAVI12", @@ -1223,6 +1224,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) } } + /* Don't post if we need to reset whole hive on init */ + if (adev->gmc.xgmi.pending_reset) + return false; + if (adev->has_hw_reset) { adev->has_hw_reset = false; return true; @@ -1810,6 +1815,7 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) case CHIP_CARRIZO: case CHIP_STONEY: case CHIP_VEGA20: + case CHIP_ALDEBARAN: case CHIP_SIENNA_CICHLID: case CHIP_NAVY_FLOUNDER: case CHIP_DIMGREY_CAVEFISH: @@ -2010,6 +2016,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) case CHIP_RAVEN: case CHIP_ARCTURUS: case CHIP_RENOIR: + case CHIP_ALDEBARAN: if (adev->flags & AMD_IS_APU) adev->family = AMDGPU_FAMILY_RV; else @@ -2045,6 +2052,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) adev->pm.pp_feature = amdgpu_pp_feature_mask; if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; + if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) + adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; for (i = 0; i < adev->num_ip_blocks; i++) { if ((amdgpu_ip_block_mask & (1 << i)) == 0) { @@ -2149,6 +2158,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) if (adev->ip_blocks[i].version->type 
!= AMD_IP_BLOCK_TYPE_PSP) continue; + if (!adev->ip_blocks[i].status.sw) + continue; + /* no need to do the fw loading again if already done*/ if (adev->ip_blocks[i].status.hw == true) break; @@ -2289,7 +2301,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(adev); - amdgpu_amdkfd_device_init(adev); + + /* Don't init kfd if whole hive need to be reset during init */ + if (!adev->gmc.xgmi.pending_reset) + amdgpu_amdkfd_device_init(adev); amdgpu_fru_get_product_info(adev); @@ -2678,7 +2693,8 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) { int i, r; - if (!amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) { + if (adev->in_poweroff_reboot_com || + !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) { amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); } @@ -2733,6 +2749,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) adev->ip_blocks[i].status.hw = false; continue; } + + /* skip unnecessary suspend if we do not initialize them yet */ + if (adev->gmc.xgmi.pending_reset && + !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { + adev->ip_blocks[i].status.hw = false; + continue; + } /* XXX handle errors */ r = adev->ip_blocks[i].version->funcs->suspend(adev); /* XXX handle errors */ @@ -2773,8 +2799,10 @@ int amdgpu_device_ip_suspend(struct amdgpu_device *adev) { int r; - if (amdgpu_sriov_vf(adev)) + if (amdgpu_sriov_vf(adev)) { + amdgpu_virt_fini_data_exchange(adev); amdgpu_virt_request_full_gpu(adev, false); + } r = amdgpu_device_ip_suspend_phase1(adev); if (r) @@ -3289,6 +3317,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, 
INIT_LIST_HEAD(&adev->shadow_list); mutex_init(&adev->shadow_list_lock); + INIT_LIST_HEAD(&adev->reset_list); + INIT_DELAYED_WORK(&adev->delayed_init_work, amdgpu_device_delayed_init_work_handler); INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, @@ -3404,10 +3434,28 @@ int amdgpu_device_init(struct amdgpu_device *adev, * E.g., driver was not cleanly unloaded previously, etc. */ if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { - r = amdgpu_asic_reset(adev); - if (r) { - dev_err(adev->dev, "asic reset on init failed\n"); - goto failed; + if (adev->gmc.xgmi.num_physical_nodes) { + dev_info(adev->dev, "Pending hive reset.\n"); + adev->gmc.xgmi.pending_reset = true; + /* Only need to init necessary block for SMU to handle the reset */ + for (i = 0; i < adev->num_ip_blocks; i++) { + if (!adev->ip_blocks[i].status.valid) + continue; + if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { + DRM_DEBUG("IP %s disabled for hw_init.\n", + adev->ip_blocks[i].version->funcs->name); + adev->ip_blocks[i].status.hw = true; + } + } + } else { + r = amdgpu_asic_reset(adev); + if (r) { + dev_err(adev->dev, "asic reset on init failed\n"); + goto failed; + } } } @@ -3538,19 +3586,19 @@ fence_driver_init: /* enable clockgating, etc. after ib tests, etc. since some blocks require * explicit gating rather than handling it automatically. 
*/ - r = amdgpu_device_ip_late_init(adev); - if (r) { - dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); - goto failed; + if (!adev->gmc.xgmi.pending_reset) { + r = amdgpu_device_ip_late_init(adev); + if (r) { + dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); + amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); + goto failed; + } + /* must succeed. */ + amdgpu_ras_resume(adev); + queue_delayed_work(system_wq, &adev->delayed_init_work, + msecs_to_jiffies(AMDGPU_RESUME_MS)); } - /* must succeed. */ - amdgpu_ras_resume(adev); - - queue_delayed_work(system_wq, &adev->delayed_init_work, - msecs_to_jiffies(AMDGPU_RESUME_MS)); - if (amdgpu_sriov_vf(adev)) flush_delayed_work(&adev->delayed_init_work); @@ -3567,6 +3615,14 @@ fence_driver_init: if (amdgpu_device_cache_pci_state(adev->pdev)) pci_restore_state(pdev); + /* Enable lightSBR on SMU in passthrough + xgmi configuration */ + if (amdgpu_passthrough(adev) && adev->gmc.xgmi.num_physical_nodes > 1) + smu_set_light_sbr(&adev->smu, true); + + if (adev->gmc.xgmi.pending_reset) + queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, + msecs_to_jiffies(AMDGPU_RESUME_MS)); + return 0; failed: @@ -3593,6 +3649,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev) { dev_info(adev->dev, "amdgpu: finishing device.\n"); flush_delayed_work(&adev->delayed_init_work); + ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); adev->shutdown = true; kfree(adev->pci_state); @@ -3734,14 +3791,15 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) r = amdgpu_device_ip_suspend_phase1(adev); - amdgpu_amdkfd_suspend(adev, !fbcon); + amdgpu_amdkfd_suspend(adev, adev->in_runpm); /* evict vram memory */ amdgpu_bo_evict_vram(adev); amdgpu_fence_driver_suspend(adev); - if (!amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) + if (adev->in_poweroff_reboot_com || + !amdgpu_acpi_is_s0ix_supported(adev) || 
amdgpu_in_reset(adev)) r = amdgpu_device_ip_suspend_phase2(adev); else amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); @@ -3818,7 +3876,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) } } } - r = amdgpu_amdkfd_resume(adev, !fbcon); + r = amdgpu_amdkfd_resume(adev, adev->in_runpm); if (r) return r; @@ -4221,6 +4279,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) case CHIP_NAVI12: case CHIP_SIENNA_CICHLID: case CHIP_NAVY_FLOUNDER: + case CHIP_DIMGREY_CAVEFISH: break; default: goto disabled; @@ -4234,15 +4293,56 @@ disabled: return false; } +int amdgpu_device_mode1_reset(struct amdgpu_device *adev) +{ /* Mode1 reset of adev: goes through amdgpu_dpm_mode1_reset() when amdgpu_dpm_is_mode1_reset_supported() reports support, otherwise through psp_gpu_reset(). Returns that call's result; the PCI state reload and the memsize poll below still run when it failed, and a poll that exhausts usec_timeout is not reported to the caller. */ + u32 i; + int ret = 0; + + amdgpu_atombios_scratch_regs_engine_hung(adev, true); + + dev_info(adev->dev, "GPU mode1 reset\n"); + + /* disable BM (PCI bus mastering) before the reset is issued */ + pci_clear_master(adev->pdev); + + amdgpu_device_cache_pci_state(adev->pdev); + + if (amdgpu_dpm_is_mode1_reset_supported(adev)) { + dev_info(adev->dev, "GPU smu mode1 reset\n"); + ret = amdgpu_dpm_mode1_reset(adev); + } else { + dev_info(adev->dev, "GPU psp mode1 reset\n"); + ret = psp_gpu_reset(adev); + } + + if (ret) + dev_err(adev->dev, "GPU mode1 reset failed\n"); -static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, - struct amdgpu_job *job, - bool *need_full_reset_arg) + amdgpu_device_load_pci_state(adev->pdev); + + /* wait for asic to come out of reset: poll get_memsize() up to usec_timeout times, 1us apart, until it stops reading back 0xffffffff */ + for (i = 0; i < adev->usec_timeout; i++) { + u32 memsize = adev->nbio.funcs->get_memsize(adev); + + if (memsize != 0xffffffff) + break; + udelay(1); + } + + amdgpu_atombios_scratch_regs_engine_hung(adev, false); + return ret; +} + +int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, + struct amdgpu_job *job, + bool *need_full_reset_arg) { int i, r = 0; bool need_full_reset = *need_full_reset_arg; - amdgpu_debugfs_wait_dump(adev); + /* no need to dump if device is not in good state during probe period */ + if (!adev->gmc.xgmi.pending_reset) + amdgpu_debugfs_wait_dump(adev); if 
(amdgpu_sriov_vf(adev)) { /* stop the data exchange thread */ @@ -4288,23 +4388,24 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, return r; } -static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, - struct list_head *device_list_handle, - bool *need_full_reset_arg, - bool skip_hw_reset) +int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, + struct list_head *device_list_handle, + bool *need_full_reset_arg, + bool skip_hw_reset) { struct amdgpu_device *tmp_adev = NULL; bool need_full_reset = *need_full_reset_arg, vram_lost = false; int r = 0; /* - * ASIC reset has to be done on all HGMI hive nodes ASAP + * ASIC reset has to be done on all XGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) */ if (!skip_hw_reset && need_full_reset) { - list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { + tmp_adev->gmc.xgmi.pending_reset = false; if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) r = -EALREADY; } else @@ -4319,8 +4420,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, /* For XGMI wait for all resets to complete before proceed */ if (!r) { - list_for_each_entry(tmp_adev, device_list_handle, - gmc.xgmi.head) { + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { flush_work(&tmp_adev->xgmi_reset_work); r = tmp_adev->asic_reset_res; @@ -4332,7 +4432,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, } if (!r && amdgpu_ras_intr_triggered()) { - list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { if (tmp_adev->mmhub.funcs && tmp_adev->mmhub.funcs->reset_ras_error_count) tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); @@ 
-4341,13 +4441,13 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, amdgpu_ras_intr_cleared(); } - list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { if (need_full_reset) { /* post card */ - if (amdgpu_device_asic_init(tmp_adev)) + r = amdgpu_device_asic_init(tmp_adev); + if (r) { dev_warn(tmp_adev->dev, "asic atom init failed!"); - - if (!r) { + } else { dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); r = amdgpu_device_ip_resume_phase1(tmp_adev); if (r) @@ -4380,6 +4480,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, */ amdgpu_register_gpu_instance(tmp_adev); + if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) + amdgpu_xgmi_add_device(tmp_adev); + r = amdgpu_device_ip_late_init(tmp_adev); if (r) goto out; @@ -4396,7 +4499,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, * bad_page_threshold value to fix this once * probing driver again. */ - if (!amdgpu_ras_check_err_threshold(tmp_adev)) { + if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { /* must succeed. 
*/ amdgpu_ras_resume(tmp_adev); } else { @@ -4446,7 +4549,6 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, down_write(&adev->reset_sem); } - atomic_inc(&adev->gpu_reset_counter); switch (amdgpu_asic_reset_method(adev)) { case AMD_RESET_METHOD_MODE1: adev->mp1_state = PP_MP1_STATE_SHUTDOWN; @@ -4653,16 +4755,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, */ INIT_LIST_HEAD(&device_list); if (adev->gmc.xgmi.num_physical_nodes > 1) { - if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) - list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); - device_list_handle = &hive->device_list; + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) + list_add_tail(&tmp_adev->reset_list, &device_list); + if (!list_is_first(&adev->reset_list, &device_list)) + list_rotate_to_front(&adev->reset_list, &device_list); + device_list_handle = &device_list; } else { - list_add_tail(&adev->gmc.xgmi.head, &device_list); + list_add_tail(&adev->reset_list, &device_list); device_list_handle = &device_list; } /* block all schedulers and reset given job's ring */ - list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { /* * Try to put the audio codec into suspend state * before gpu reset started. @@ -4707,6 +4811,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (need_emergency_restart) amdgpu_job_stop_all_jobs_on_sched(&ring->sched); } + atomic_inc(&tmp_adev->gpu_reset_counter); } if (need_emergency_restart) @@ -4726,7 +4831,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, } retry: /* Rest of adevs pre asic reset from XGMI hive. */ - list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { r = amdgpu_device_pre_asic_reset(tmp_adev, (tmp_adev == adev) ? 
job : NULL, &need_full_reset); @@ -4753,7 +4858,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ skip_hw_reset: /* Post ASIC reset for all devs .*/ - list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; @@ -4784,10 +4889,17 @@ skip_hw_reset: } skip_sched_resume: - list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - /*unlock kfd: SRIOV would do it separately */ + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + /* unlock kfd: SRIOV would do it separately */ if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) amdgpu_amdkfd_post_reset(tmp_adev); + + /* kfd_post_reset will do nothing if kfd device is not initialized, + * need to bring up kfd here if it has not been initialized before. NOTE(review): the check and the init call below use adev, not the loop variable tmp_adev that every other call in this loop uses - confirm this is intended. + */ + if (!adev->kfd.init_complete) + amdgpu_amdkfd_device_init(adev); + if (audio_suspended) amdgpu_device_resume_display_audio(tmp_adev); amdgpu_device_unlock_adev(tmp_adev); @@ -5049,6 +5161,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta drm_sched_stop(&ring->sched, NULL); } + atomic_inc(&adev->gpu_reset_counter); return PCI_ERS_RESULT_NEED_RESET; case pci_channel_io_perm_failure: /* Permanent error, prepare for device removal */ @@ -5097,7 +5210,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) DRM_INFO("PCI error: slot reset callback!!\n"); INIT_LIST_HEAD(&device_list); - list_add_tail(&adev->gmc.xgmi.head, &device_list); + list_add_tail(&adev->reset_list, &device_list); /* wait for asic to come out of reset */ msleep(500);