X-Git-Url: http://git.monstr.eu/?a=blobdiff_plain;f=drivers%2Fgpu%2Fdrm%2Famd%2Famdgpu%2Famdgpu_device.c;h=c3552079bb8621e3628c4a219b28494aec562d75;hb=751f43e75d63103cec3b6be4d451b6a3e569e87b;hp=00b6ba5740f3a78978e10526393e8502ba80d51a;hpb=c106c5e2fd3bcde85a1ce4a7bd237e815b117cc8;p=linux-2.6-microblaze.git

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 00b6ba5740f3..c3552079bb86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -110,6 +110,7 @@ const char *amdgpu_asic_name[] = {
 	"RAVEN",
 	"ARCTURUS",
 	"RENOIR",
+	"ALDEBARAN",
 	"NAVI10",
 	"NAVI14",
 	"NAVI12",
@@ -929,6 +930,18 @@ void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 }
 
+/**
+ * amdgpu_device_pci_reset - reset the GPU using generic PCI means
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
+ */
+int amdgpu_device_pci_reset(struct amdgpu_device *adev)
+{
+	return pci_reset_function(adev->pdev);
+}
+
 /*
  * GPU doorbell aperture helpers function.
  */
@@ -1211,6 +1224,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
 		}
 	}
 
+	/* Don't post if we need to reset whole hive on init */
+	if (adev->gmc.xgmi.pending_reset)
+		return false;
+
 	if (adev->has_hw_reset) {
 		adev->has_hw_reset = false;
 		return true;
@@ -1433,10 +1450,8 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
 		amdgpu_device_resume(dev, true);
 
 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
-		drm_kms_helper_poll_enable(dev);
 	} else {
 		pr_info("switched off\n");
-		drm_kms_helper_poll_disable(dev);
 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
 		amdgpu_device_suspend(dev, true);
 		amdgpu_device_cache_pci_state(pdev);
@@ -1800,6 +1815,7 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
 	case CHIP_CARRIZO:
 	case CHIP_STONEY:
 	case CHIP_VEGA20:
+	case CHIP_ALDEBARAN:
 	case CHIP_SIENNA_CICHLID:
 	case CHIP_NAVY_FLOUNDER:
 	case CHIP_DIMGREY_CAVEFISH:
@@ -2000,6 +2016,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
 	case CHIP_RAVEN:
 	case CHIP_ARCTURUS:
 	case CHIP_RENOIR:
+	case CHIP_ALDEBARAN:
 		if (adev->flags & AMD_IS_APU)
 			adev->family = AMDGPU_FAMILY_RV;
 		else
@@ -2035,6 +2052,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
+	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
+		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
 
 	for (i = 0; i < adev->num_ip_blocks; i++) {
 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
@@ -2139,6 +2158,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
 			continue;
 
+		if (!adev->ip_blocks[i].status.sw)
+			continue;
+
 		/* no need to do the fw loading again if already done*/
 		if (adev->ip_blocks[i].status.hw == true)
 			break;
@@ -2279,7 +2301,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
 	if (adev->gmc.xgmi.num_physical_nodes > 1)
 		amdgpu_xgmi_add_device(adev);
-	amdgpu_amdkfd_device_init(adev);
+
+	/* Don't init kfd if whole hive needs to be reset during init */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_amdkfd_device_init(adev);
 
 	amdgpu_fru_get_product_info(adev);
 
@@ -2668,7 +2693,8 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
 {
 	int i, r;
 
-	if (!amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) {
+	if (adev->in_poweroff_reboot_com ||
+	    !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) {
 		amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
 		amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
 	}
@@ -2723,6 +2749,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
 			adev->ip_blocks[i].status.hw = false;
 			continue;
 		}
+
+		/* skip unnecessary suspend if we have not initialized them yet */
+		if (adev->gmc.xgmi.pending_reset &&
+		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+			adev->ip_blocks[i].status.hw = false;
+			continue;
+		}
 		/* XXX handle errors */
 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
 		/* XXX handle errors */
@@ -2763,8 +2799,10 @@ int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
 {
 	int r;
 
-	if (amdgpu_sriov_vf(adev))
+	if (amdgpu_sriov_vf(adev)) {
+		amdgpu_virt_fini_data_exchange(adev);
 		amdgpu_virt_request_full_gpu(adev, false);
+	}
 
 	r = amdgpu_device_ip_suspend_phase1(adev);
 	if (r)
@@ -3279,6 +3317,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	INIT_LIST_HEAD(&adev->shadow_list);
 	mutex_init(&adev->shadow_list_lock);
 
+	INIT_LIST_HEAD(&adev->reset_list);
+
 	INIT_DELAYED_WORK(&adev->delayed_init_work,
 			  amdgpu_device_delayed_init_work_handler);
 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
@@ -3394,10 +3434,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 
 	 *  E.g., driver was not cleanly unloaded previously, etc.
 	 */
 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-		r = amdgpu_asic_reset(adev);
-		if (r) {
-			dev_err(adev->dev, "asic reset on init failed\n");
-			goto failed;
+		if (adev->gmc.xgmi.num_physical_nodes) {
+			dev_info(adev->dev, "Pending hive reset.\n");
+			adev->gmc.xgmi.pending_reset = true;
+			/* Only need to init necessary blocks for SMU to handle the reset */
+			for (i = 0; i < adev->num_ip_blocks; i++) {
+				if (!adev->ip_blocks[i].status.valid)
+					continue;
+				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
+					DRM_DEBUG("IP %s disabled for hw_init.\n",
+						  adev->ip_blocks[i].version->funcs->name);
+					adev->ip_blocks[i].status.hw = true;
+				}
+			}
+		} else {
+			r = amdgpu_asic_reset(adev);
+			if (r) {
+				dev_err(adev->dev, "asic reset on init failed\n");
+				goto failed;
+			}
 		}
 	}
@@ -3528,19 +3586,19 @@ fence_driver_init:
 
 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
 	 * explicit gating rather than handling it automatically.
 	 */
-	r = amdgpu_device_ip_late_init(adev);
-	if (r) {
-		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
-		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
-		goto failed;
+	if (!adev->gmc.xgmi.pending_reset) {
+		r = amdgpu_device_ip_late_init(adev);
+		if (r) {
+			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
+			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
+			goto failed;
+		}
+		/* must succeed. */
+		amdgpu_ras_resume(adev);
+		queue_delayed_work(system_wq, &adev->delayed_init_work,
+				   msecs_to_jiffies(AMDGPU_RESUME_MS));
 	}
 
-	/* must succeed. */
-	amdgpu_ras_resume(adev);
-
-	queue_delayed_work(system_wq, &adev->delayed_init_work,
-			   msecs_to_jiffies(AMDGPU_RESUME_MS));
-
 	if (amdgpu_sriov_vf(adev))
 		flush_delayed_work(&adev->delayed_init_work);
@@ -3557,6 +3615,14 @@ fence_driver_init:
 	if (amdgpu_device_cache_pci_state(adev->pdev))
 		pci_restore_state(pdev);
 
+	/* Enable lightSBR on SMU in passthrough + xgmi configuration */
+	if (amdgpu_passthrough(adev) && adev->gmc.xgmi.num_physical_nodes > 1)
+		smu_set_light_sbr(&adev->smu, true);
+
+	if (adev->gmc.xgmi.pending_reset)
+		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
+				   msecs_to_jiffies(AMDGPU_RESUME_MS));
+
 	return 0;
 
 failed:
@@ -3583,6 +3649,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
 {
 	dev_info(adev->dev, "amdgpu: finishing device.\n");
 	flush_delayed_work(&adev->delayed_init_work);
+	ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
 	adev->shutdown = true;
 
 	kfree(adev->pci_state);
@@ -3724,14 +3791,15 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 
 	r = amdgpu_device_ip_suspend_phase1(adev);
 
-	amdgpu_amdkfd_suspend(adev, !fbcon);
+	amdgpu_amdkfd_suspend(adev, adev->in_runpm);
 
 	/* evict vram memory */
 	amdgpu_bo_evict_vram(adev);
 
 	amdgpu_fence_driver_suspend(adev);
 
-	if (!amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev))
+	if (adev->in_poweroff_reboot_com ||
+	    !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev))
 		r = amdgpu_device_ip_suspend_phase2(adev);
 	else
 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
@@ -3808,7 +3876,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 			}
 		}
 	}
-	r = amdgpu_amdkfd_resume(adev, !fbcon);
+	r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
 	if (r)
 		return r;
 
@@ -4211,7 +4279,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
 		case CHIP_NAVI12:
 		case CHIP_SIENNA_CICHLID:
 		case CHIP_NAVY_FLOUNDER:
-		case CHIP_VANGOGH:
+		case CHIP_DIMGREY_CAVEFISH:
 			break;
 		default:
 			goto disabled;
@@ -4225,15 +4293,56 @@ disabled:
 	return false;
 }
 
+int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
+{
+	u32 i;
+	int ret = 0;
+
+	amdgpu_atombios_scratch_regs_engine_hung(adev, true);
+
+	dev_info(adev->dev, "GPU mode1 reset\n");
+
+	/* disable BM */
+	pci_clear_master(adev->pdev);
+
+	amdgpu_device_cache_pci_state(adev->pdev);
+
+	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
+		dev_info(adev->dev, "GPU smu mode1 reset\n");
+		ret = amdgpu_dpm_mode1_reset(adev);
+	} else {
+		dev_info(adev->dev, "GPU psp mode1 reset\n");
+		ret = psp_gpu_reset(adev);
+	}
 
-static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
-					struct amdgpu_job *job,
-					bool *need_full_reset_arg)
+	if (ret)
+		dev_err(adev->dev, "GPU mode1 reset failed\n");
+
+	amdgpu_device_load_pci_state(adev->pdev);
+
+	/* wait for asic to come out of reset */
+	for (i = 0; i < adev->usec_timeout; i++) {
+		u32 memsize = adev->nbio.funcs->get_memsize(adev);
+
+		if (memsize != 0xffffffff)
+			break;
+		udelay(1);
+	}
+
+	amdgpu_atombios_scratch_regs_engine_hung(adev, false);
+	return ret;
+}
+
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+				 struct amdgpu_job *job,
+				 bool *need_full_reset_arg)
 {
 	int i, r = 0;
 	bool need_full_reset = *need_full_reset_arg;
 
-	amdgpu_debugfs_wait_dump(adev);
+	/* no need to dump if device is not in good state during probe period */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_debugfs_wait_dump(adev);
 
 	if (amdgpu_sriov_vf(adev)) {
 		/* stop the data exchange thread */
@@ -4279,23 +4388,24 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
-static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
-			       struct list_head *device_list_handle,
-			       bool *need_full_reset_arg,
-			       bool skip_hw_reset)
+int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+			 struct list_head *device_list_handle,
+			 bool *need_full_reset_arg,
+			 bool skip_hw_reset)
 {
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
 	int r = 0;
 
 	/*
-	 * ASIC reset has to be done on all HGMI hive nodes ASAP
+	 * ASIC reset has to be done on all XGMI hive nodes ASAP
 	 * to allow proper links negotiation in FW (within 1 sec)
 	 */
 	if (!skip_hw_reset && need_full_reset) {
-		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+				tmp_adev->gmc.xgmi.pending_reset = false;
 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
 			} else
@@ -4310,8 +4420,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 
 	/* For XGMI wait for all resets to complete before proceed */
 	if (!r) {
-		list_for_each_entry(tmp_adev, device_list_handle,
-				    gmc.xgmi.head) {
+		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
 				flush_work(&tmp_adev->xgmi_reset_work);
 				r = tmp_adev->asic_reset_res;
@@ -4323,7 +4432,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	}
 
 	if (!r && amdgpu_ras_intr_triggered()) {
-		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 			if (tmp_adev->mmhub.funcs &&
 			    tmp_adev->mmhub.funcs->reset_ras_error_count)
 				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
@@ -4332,13 +4441,13 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 		amdgpu_ras_intr_cleared();
 	}
 
-	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 		if (need_full_reset) {
 			/* post card */
-			if (amdgpu_device_asic_init(tmp_adev))
+			r = amdgpu_device_asic_init(tmp_adev);
+			if (r) {
 				dev_warn(tmp_adev->dev, "asic atom init failed!");
-
-			if (!r) {
+			} else {
 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
 				if (r)
@@ -4371,6 +4480,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 					 */
 					amdgpu_register_gpu_instance(tmp_adev);
 
+					if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
+						amdgpu_xgmi_add_device(tmp_adev);
+
 					r = amdgpu_device_ip_late_init(tmp_adev);
 					if (r)
 						goto out;
@@ -4387,7 +4499,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 					 * bad_page_threshold value to fix this once
 					 * probing driver again.
 					 */
-					if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
+					if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
 						/* must succeed. */
 						amdgpu_ras_resume(tmp_adev);
 					} else {
@@ -4437,7 +4549,6 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
 		down_write(&adev->reset_sem);
 	}
 
-	atomic_inc(&adev->gpu_reset_counter);
 	switch (amdgpu_asic_reset_method(adev)) {
 	case AMD_RESET_METHOD_MODE1:
 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
@@ -4461,6 +4572,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	up_write(&adev->reset_sem);
 }
 
+/*
+ * To lock a list of amdgpu devices in a hive safely. If this is not a
+ * hive with multiple nodes, it behaves like amdgpu_device_lock_adev.
+ *
+ * Unlocking won't require a roll back.
+ */
+static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
+{
+	struct amdgpu_device *tmp_adev = NULL;
+
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		if (!hive) {
+			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
+			return -ENODEV;
+		}
+		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+			if (!amdgpu_device_lock_adev(tmp_adev, hive))
+				goto roll_back;
+		}
+	} else if (!amdgpu_device_lock_adev(adev, hive))
+		return -EAGAIN;
+
+	return 0;
+roll_back:
+	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
+		/*
+		 * If the lock iteration breaks in the middle of a hive,
+		 * it may mean there is a race issue, or that a hive
+		 * device locked up independently.
+		 * We may or may not be in trouble, so try to roll back
+		 * the locks and give out a warning.
+		 */
+		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
+		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+			amdgpu_device_unlock_adev(tmp_adev);
+		}
+	}
+	return -EAGAIN;
+}
+
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
 {
 	struct pci_dev *p = NULL;
@@ -4574,11 +4725,29 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
 				job ? job->base.id : -1, hive->hive_id);
 			amdgpu_put_xgmi_hive(hive);
+			if (job)
+				drm_sched_increase_karma(&job->base);
 			return 0;
 		}
 		mutex_lock(&hive->hive_lock);
 	}
 
+	/*
+	 * Lock the device before we try to operate on the linked list;
+	 * if we didn't get the device lock, don't touch the linked list
+	 * since others may be iterating it.
+	 */
+	r = amdgpu_device_lock_hive_adev(adev, hive);
+	if (r) {
+		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
+			 job ? job->base.id : -1);
+
+		/* even though we skipped this reset, still need to set the job to guilty */
+		if (job)
+			drm_sched_increase_karma(&job->base);
+		goto skip_recovery;
+	}
+
 	/*
 	 * Build list of devices to reset.
 	 * In case we are in XGMI hive mode, resort the device list
@@ -4586,25 +4755,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 */
 	INIT_LIST_HEAD(&device_list);
 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
-		if (!hive)
-			return -ENODEV;
-		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
-			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
-		device_list_handle = &hive->device_list;
+		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+			list_add_tail(&tmp_adev->reset_list, &device_list);
+		if (!list_is_first(&adev->reset_list, &device_list))
+			list_rotate_to_front(&adev->reset_list, &device_list);
+		device_list_handle = &device_list;
 	} else {
-		list_add_tail(&adev->gmc.xgmi.head, &device_list);
+		list_add_tail(&adev->reset_list, &device_list);
 		device_list_handle = &device_list;
 	}
 
 	/* block all schedulers and reset given job's ring */
-	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
-			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
-				 job ? job->base.id : -1);
-			r = 0;
-			goto skip_recovery;
-		}
-
+	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 		/*
 		 * Try to put the audio codec into suspend state
 		 * before gpu reset started.
@@ -4649,6 +4811,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			if (need_emergency_restart)
 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
 		}
+		atomic_inc(&tmp_adev->gpu_reset_counter);
 	}
 
 	if (need_emergency_restart)
@@ -4668,7 +4831,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	}
 
retry:	/* Rest of adevs pre asic reset from XGMI hive. */
-	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 		r = amdgpu_device_pre_asic_reset(tmp_adev,
 						 (tmp_adev == adev) ? job : NULL,
 						 &need_full_reset);
@@ -4695,7 +4858,7 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 skip_hw_reset:
 
 	/* Post ASIC reset for all devs .*/
-	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
@@ -4726,10 +4889,17 @@ skip_hw_reset:
 	}
 
 skip_sched_resume:
-	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-		/*unlock kfd: SRIOV would do it separately */
+	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+		/* unlock kfd: SRIOV would do it separately */
 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
 			amdgpu_amdkfd_post_reset(tmp_adev);
+
+		/* kfd_post_reset will do nothing if kfd device is not initialized,
+		 * need to bring up kfd here if it was not initialized before
+		 */
+		if (!adev->kfd.init_complete)
+			amdgpu_amdkfd_device_init(adev);
+
 		if (audio_suspended)
 			amdgpu_device_resume_display_audio(tmp_adev);
 		amdgpu_device_unlock_adev(tmp_adev);
@@ -4742,7 +4912,7 @@ skip_recovery:
 		amdgpu_put_xgmi_hive(hive);
 	}
 
-	if (r)
+	if (r && r != -EAGAIN)
 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
 	return r;
 }
@@ -4792,7 +4962,13 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
 		} else {
-			if (speed_cap == PCIE_SPEED_16_0GT)
+			if (speed_cap == PCIE_SPEED_32_0GT)
+				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
+							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
+							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
+							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
+							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
+			else if (speed_cap == PCIE_SPEED_16_0GT)
 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
@@ -4812,7 +4988,13 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
 		} else {
-			if (platform_speed_cap == PCIE_SPEED_16_0GT)
+			if (platform_speed_cap == PCIE_SPEED_32_0GT)
+				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
+							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
+							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
+							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
+							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
+			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
@@ -4979,6 +5161,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 
 			drm_sched_stop(&ring->sched, NULL);
 		}
+		atomic_inc(&adev->gpu_reset_counter);
 		return PCI_ERS_RESULT_NEED_RESET;
 	case pci_channel_io_perm_failure:
 		/* Permanent error, prepare for device removal */
@@ -5027,7 +5210,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 	DRM_INFO("PCI error: slot reset callback!!\n");
 
 	INIT_LIST_HEAD(&device_list);
-	list_add_tail(&adev->gmc.xgmi.head, &device_list);
+	list_add_tail(&adev->reset_list, &device_list);
 
 	/* wait for asic to come out of reset */
 	msleep(500);