drm/amdgpu: Fix spelling mistake "disabed" -> "disabled"

[linux-2.6-microblaze.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_device.c
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 00b6ba5..c355207 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -110,6 +110,7 @@ const char *amdgpu_asic_name[] = {
         "RAVEN",
         "ARCTURUS",
         "RENOIR",
         "RAVEN",
         "ARCTURUS",
         "RENOIR",
+       "ALDEBARAN",
         "NAVI10",
         "NAVI14",
         "NAVI12",
         "NAVI10",
         "NAVI14",
         "NAVI12",
@@ -929,6 +930,18 @@ void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
  }
  
         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
  }
  
+/**
+ * amdgpu_device_pci_reset - reset the GPU using generic PCI means
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.) by
+ * handing the device's struct pci_dev (adev->pdev) to pci_reset_function().
+ *
+ * Returns: whatever pci_reset_function() returns; the value is passed
+ * through unchanged.
+ */
+int amdgpu_device_pci_reset(struct amdgpu_device *adev)
+{
+       return pci_reset_function(adev->pdev);
+}
+
  /*
   * GPU doorbell aperture helpers function.
   */
  /*
   * GPU doorbell aperture helpers function.
   */
@@ -1211,6 +1224,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
                 }
         }
  
                 }
         }
  
+       /* Don't post if we need to reset whole hive on init */
+       if (adev->gmc.xgmi.pending_reset)
+               return false;
+
         if (adev->has_hw_reset) {
                 adev->has_hw_reset = false;
                 return true;
         if (adev->has_hw_reset) {
                 adev->has_hw_reset = false;
                 return true;
@@ -1433,10 +1450,8 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
                 amdgpu_device_resume(dev, true);
  
                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
                 amdgpu_device_resume(dev, true);
  
                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
-               drm_kms_helper_poll_enable(dev);
         } else {
                 pr_info("switched off\n");
         } else {
                 pr_info("switched off\n");
-               drm_kms_helper_poll_disable(dev);
                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
                 amdgpu_device_suspend(dev, true);
                 amdgpu_device_cache_pci_state(pdev);
                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
                 amdgpu_device_suspend(dev, true);
                 amdgpu_device_cache_pci_state(pdev);
@@ -1800,6 +1815,7 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
         case CHIP_CARRIZO:
         case CHIP_STONEY:
         case CHIP_VEGA20:
         case CHIP_CARRIZO:
         case CHIP_STONEY:
         case CHIP_VEGA20:
+       case CHIP_ALDEBARAN:
         case CHIP_SIENNA_CICHLID:
         case CHIP_NAVY_FLOUNDER:
         case CHIP_DIMGREY_CAVEFISH:
         case CHIP_SIENNA_CICHLID:
         case CHIP_NAVY_FLOUNDER:
         case CHIP_DIMGREY_CAVEFISH:
@@ -2000,6 +2016,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
         case CHIP_RAVEN:
         case CHIP_ARCTURUS:
         case CHIP_RENOIR:
         case CHIP_RAVEN:
         case CHIP_ARCTURUS:
         case CHIP_RENOIR:
+       case CHIP_ALDEBARAN:
                 if (adev->flags & AMD_IS_APU)
                         adev->family = AMDGPU_FAMILY_RV;
                 else
                 if (adev->flags & AMD_IS_APU)
                         adev->family = AMDGPU_FAMILY_RV;
                 else
@@ -2035,6 +2052,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
         adev->pm.pp_feature = amdgpu_pp_feature_mask;
         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
         adev->pm.pp_feature = amdgpu_pp_feature_mask;
         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
+       if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
+               adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
  
         for (i = 0; i < adev->num_ip_blocks; i++) {
                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
  
         for (i = 0; i < adev->num_ip_blocks; i++) {
                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
@@ -2139,6 +2158,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
                                 continue;
  
                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
                                 continue;
  
+                       if (!adev->ip_blocks[i].status.sw)
+                               continue;
+
                         /* no need to do the fw loading again if already done*/
                         if (adev->ip_blocks[i].status.hw == true)
                                 break;
                         /* no need to do the fw loading again if already done*/
                         if (adev->ip_blocks[i].status.hw == true)
                                 break;
@@ -2279,7 +2301,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
  
         if (adev->gmc.xgmi.num_physical_nodes > 1)
                 amdgpu_xgmi_add_device(adev);
  
         if (adev->gmc.xgmi.num_physical_nodes > 1)
                 amdgpu_xgmi_add_device(adev);
-       amdgpu_amdkfd_device_init(adev);
+
+       /* Don't init kfd if the whole hive needs to be reset during init */
+       if (!adev->gmc.xgmi.pending_reset)
+               amdgpu_amdkfd_device_init(adev);
  
         amdgpu_fru_get_product_info(adev);
  
  
         amdgpu_fru_get_product_info(adev);
  
@@ -2668,7 +2693,8 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
  {
         int i, r;
  
  {
         int i, r;
  
-       if (!amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) {
+       if (adev->in_poweroff_reboot_com ||
+           !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) {
                 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
                 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
         }
                 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
                 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
         }
@@ -2723,6 +2749,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
                         adev->ip_blocks[i].status.hw = false;
                         continue;
                 }
                         adev->ip_blocks[i].status.hw = false;
                         continue;
                 }
+
+               /* skip unnecessary suspend of blocks we have not initialized yet */
+               if (adev->gmc.xgmi.pending_reset &&
+                   !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+                       adev->ip_blocks[i].status.hw = false;
+                       continue;
+               }
                 /* XXX handle errors */
                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
                 /* XXX handle errors */
                 /* XXX handle errors */
                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
                 /* XXX handle errors */
@@ -2763,8 +2799,10 @@ int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
  {
         int r;
  
  {
         int r;
  
-       if (amdgpu_sriov_vf(adev))
+       if (amdgpu_sriov_vf(adev)) {
+               amdgpu_virt_fini_data_exchange(adev);
                 amdgpu_virt_request_full_gpu(adev, false);
                 amdgpu_virt_request_full_gpu(adev, false);
+       }
  
         r = amdgpu_device_ip_suspend_phase1(adev);
         if (r)
  
         r = amdgpu_device_ip_suspend_phase1(adev);
         if (r)
@@ -3279,6 +3317,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
         INIT_LIST_HEAD(&adev->shadow_list);
         mutex_init(&adev->shadow_list_lock);
  
         INIT_LIST_HEAD(&adev->shadow_list);
         mutex_init(&adev->shadow_list_lock);
  
+       INIT_LIST_HEAD(&adev->reset_list);
+
         INIT_DELAYED_WORK(&adev->delayed_init_work,
                           amdgpu_device_delayed_init_work_handler);
         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
         INIT_DELAYED_WORK(&adev->delayed_init_work,
                           amdgpu_device_delayed_init_work_handler);
         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
@@ -3394,10 +3434,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
          *  E.g., driver was not cleanly unloaded previously, etc.
          */
         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
          *  E.g., driver was not cleanly unloaded previously, etc.
          */
         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-               r = amdgpu_asic_reset(adev);
-               if (r) {
-                       dev_err(adev->dev, "asic reset on init failed\n");
-                       goto failed;
+               if (adev->gmc.xgmi.num_physical_nodes) {
+                       dev_info(adev->dev, "Pending hive reset.\n");
+                       adev->gmc.xgmi.pending_reset = true;
+                       /* Only need to init the blocks necessary for SMU to handle the reset */
+                       for (i = 0; i < adev->num_ip_blocks; i++) {
+                               if (!adev->ip_blocks[i].status.valid)
+                                       continue;
+                               if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
+                                       DRM_DEBUG("IP %s disabled for hw_init.\n",
+                                               adev->ip_blocks[i].version->funcs->name);
+                                       adev->ip_blocks[i].status.hw = true;
+                               }
+                       }
+               } else {
+                       r = amdgpu_asic_reset(adev);
+                       if (r) {
+                               dev_err(adev->dev, "asic reset on init failed\n");
+                               goto failed;
+                       }
                 }
         }
  
                 }
         }
  
@@ -3528,19 +3586,19 @@ fence_driver_init:
         /* enable clockgating, etc. after ib tests, etc. since some blocks require
          * explicit gating rather than handling it automatically.
          */
         /* enable clockgating, etc. after ib tests, etc. since some blocks require
          * explicit gating rather than handling it automatically.
          */
-       r = amdgpu_device_ip_late_init(adev);
-       if (r) {
-               dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
-               amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
-               goto failed;
+       if (!adev->gmc.xgmi.pending_reset) {
+               r = amdgpu_device_ip_late_init(adev);
+               if (r) {
+                       dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
+                       amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
+                       goto failed;
+               }
+               /* must succeed. */
+               amdgpu_ras_resume(adev);
+               queue_delayed_work(system_wq, &adev->delayed_init_work,
+                                  msecs_to_jiffies(AMDGPU_RESUME_MS));
         }
  
         }
  
-       /* must succeed. */
-       amdgpu_ras_resume(adev);
-
-       queue_delayed_work(system_wq, &adev->delayed_init_work,
-                          msecs_to_jiffies(AMDGPU_RESUME_MS));
-
         if (amdgpu_sriov_vf(adev))
                 flush_delayed_work(&adev->delayed_init_work);
  
         if (amdgpu_sriov_vf(adev))
                 flush_delayed_work(&adev->delayed_init_work);
  
@@ -3557,6 +3615,14 @@ fence_driver_init:
         if (amdgpu_device_cache_pci_state(adev->pdev))
                 pci_restore_state(pdev);
  
         if (amdgpu_device_cache_pci_state(adev->pdev))
                 pci_restore_state(pdev);
  
+       /* Enable lightSBR on SMU in passthrough + xgmi configuration */
+       if (amdgpu_passthrough(adev) && adev->gmc.xgmi.num_physical_nodes > 1)
+               smu_set_light_sbr(&adev->smu, true);
+
+       if (adev->gmc.xgmi.pending_reset)
+               queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
+                                  msecs_to_jiffies(AMDGPU_RESUME_MS));
+
         return 0;
  
  failed:
         return 0;
  
  failed:
@@ -3583,6 +3649,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
  {
         dev_info(adev->dev, "amdgpu: finishing device.\n");
         flush_delayed_work(&adev->delayed_init_work);
  {
         dev_info(adev->dev, "amdgpu: finishing device.\n");
         flush_delayed_work(&adev->delayed_init_work);
+       ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
         adev->shutdown = true;
  
         kfree(adev->pci_state);
         adev->shutdown = true;
  
         kfree(adev->pci_state);
@@ -3724,14 +3791,15 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
  
         r = amdgpu_device_ip_suspend_phase1(adev);
  
  
         r = amdgpu_device_ip_suspend_phase1(adev);
  
-       amdgpu_amdkfd_suspend(adev, !fbcon);
+       amdgpu_amdkfd_suspend(adev, adev->in_runpm);
  
         /* evict vram memory */
         amdgpu_bo_evict_vram(adev);
  
         amdgpu_fence_driver_suspend(adev);
  
  
         /* evict vram memory */
         amdgpu_bo_evict_vram(adev);
  
         amdgpu_fence_driver_suspend(adev);
  
-       if (!amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev))
+       if (adev->in_poweroff_reboot_com ||
+           !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev))
                 r = amdgpu_device_ip_suspend_phase2(adev);
         else
                 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
                 r = amdgpu_device_ip_suspend_phase2(adev);
         else
                 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
@@ -3808,7 +3876,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
                         }
                 }
         }
                         }
                 }
         }
-       r = amdgpu_amdkfd_resume(adev, !fbcon);
+       r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
         if (r)
                 return r;
  
         if (r)
                 return r;
  
@@ -4211,7 +4279,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
                 case CHIP_NAVI12:
                 case CHIP_SIENNA_CICHLID:
                 case CHIP_NAVY_FLOUNDER:
                 case CHIP_NAVI12:
                 case CHIP_SIENNA_CICHLID:
                 case CHIP_NAVY_FLOUNDER:
-               case CHIP_VANGOGH:
+               case CHIP_DIMGREY_CAVEFISH:
                         break;
                 default:
                         goto disabled;
                         break;
                 default:
                         goto disabled;
@@ -4225,15 +4293,56 @@ disabled:
                 return false;
  }
  
                 return false;
  }
  
+int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
+{
+        u32 i;
+        int ret = 0;
+
+        amdgpu_atombios_scratch_regs_engine_hung(adev, true);
+
+        dev_info(adev->dev, "GPU mode1 reset\n");
+
+        /* disable BM */
+        pci_clear_master(adev->pdev);
+
+        amdgpu_device_cache_pci_state(adev->pdev);
+
+        if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
+                dev_info(adev->dev, "GPU smu mode1 reset\n");
+                ret = amdgpu_dpm_mode1_reset(adev);
+        } else {
+                dev_info(adev->dev, "GPU psp mode1 reset\n");
+                ret = psp_gpu_reset(adev);
+        }
  
  
-static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
-                                       struct amdgpu_job *job,
-                                       bool *need_full_reset_arg)
+        if (ret)
+                dev_err(adev->dev, "GPU mode1 reset failed\n");
+
+        amdgpu_device_load_pci_state(adev->pdev);
+
+        /* wait for asic to come out of reset */
+        for (i = 0; i < adev->usec_timeout; i++) {
+                u32 memsize = adev->nbio.funcs->get_memsize(adev);
+
+                if (memsize != 0xffffffff)
+                        break;
+                udelay(1);
+        }
+
+        amdgpu_atombios_scratch_regs_engine_hung(adev, false);
+        return ret;
+}
+
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+                                 struct amdgpu_job *job,
+                                 bool *need_full_reset_arg)
  {
         int i, r = 0;
         bool need_full_reset  = *need_full_reset_arg;
  
  {
         int i, r = 0;
         bool need_full_reset  = *need_full_reset_arg;
  
-       amdgpu_debugfs_wait_dump(adev);
+       /* no need to dump if device is not in good state during probe period */
+       if (!adev->gmc.xgmi.pending_reset)
+               amdgpu_debugfs_wait_dump(adev);
  
         if (amdgpu_sriov_vf(adev)) {
                 /* stop the data exchange thread */
  
         if (amdgpu_sriov_vf(adev)) {
                 /* stop the data exchange thread */
@@ -4279,23 +4388,24 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
         return r;
  }
  
         return r;
  }
  
-static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
-                              struct list_head *device_list_handle,
-                              bool *need_full_reset_arg,
-                              bool skip_hw_reset)
+int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+                         struct list_head *device_list_handle,
+                         bool *need_full_reset_arg,
+                         bool skip_hw_reset)
  {
         struct amdgpu_device *tmp_adev = NULL;
         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
         int r = 0;
  
         /*
  {
         struct amdgpu_device *tmp_adev = NULL;
         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
         int r = 0;
  
         /*
-        * ASIC reset has to be done on all HGMI hive nodes ASAP
+        * ASIC reset has to be done on all XGMI hive nodes ASAP
          * to allow proper links negotiation in FW (within 1 sec)
          */
         if (!skip_hw_reset && need_full_reset) {
          * to allow proper links negotiation in FW (within 1 sec)
          */
         if (!skip_hw_reset && need_full_reset) {
-               list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+               list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
                         /* For XGMI run all resets in parallel to speed up the process */
                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
                         /* For XGMI run all resets in parallel to speed up the process */
                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+                               tmp_adev->gmc.xgmi.pending_reset = false;
                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
                                         r = -EALREADY;
                         } else
                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
                                         r = -EALREADY;
                         } else
@@ -4310,8 +4420,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
  
                 /* For XGMI wait for all resets to complete before proceed */
                 if (!r) {
  
                 /* For XGMI wait for all resets to complete before proceed */
                 if (!r) {
-                       list_for_each_entry(tmp_adev, device_list_handle,
-                                           gmc.xgmi.head) {
+                       list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
                                         flush_work(&tmp_adev->xgmi_reset_work);
                                         r = tmp_adev->asic_reset_res;
                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
                                         flush_work(&tmp_adev->xgmi_reset_work);
                                         r = tmp_adev->asic_reset_res;
@@ -4323,7 +4432,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
         }
  
         if (!r && amdgpu_ras_intr_triggered()) {
         }
  
         if (!r && amdgpu_ras_intr_triggered()) {
-               list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+               list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
                         if (tmp_adev->mmhub.funcs &&
                             tmp_adev->mmhub.funcs->reset_ras_error_count)
                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
                         if (tmp_adev->mmhub.funcs &&
                             tmp_adev->mmhub.funcs->reset_ras_error_count)
                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
@@ -4332,13 +4441,13 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                 amdgpu_ras_intr_cleared();
         }
  
                 amdgpu_ras_intr_cleared();
         }
  
-       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+       list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
                 if (need_full_reset) {
                         /* post card */
                 if (need_full_reset) {
                         /* post card */
-                       if (amdgpu_device_asic_init(tmp_adev))
+                       r = amdgpu_device_asic_init(tmp_adev);
+                       if (r) {
                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
-
-                       if (!r) {
+                       } else {
                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
                                 if (r)
                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
                                 if (r)
@@ -4371,6 +4480,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                                  */
                                 amdgpu_register_gpu_instance(tmp_adev);
  
                                  */
                                 amdgpu_register_gpu_instance(tmp_adev);
  
+                               if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
+                                       amdgpu_xgmi_add_device(tmp_adev);
+
                                 r = amdgpu_device_ip_late_init(tmp_adev);
                                 if (r)
                                         goto out;
                                 r = amdgpu_device_ip_late_init(tmp_adev);
                                 if (r)
                                         goto out;
@@ -4387,7 +4499,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                                  * bad_page_threshold value to fix this once
                                  * probing driver again.
                                  */
                                  * bad_page_threshold value to fix this once
                                  * probing driver again.
                                  */
-                               if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
+                               if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
                                         /* must succeed. */
                                         amdgpu_ras_resume(tmp_adev);
                                 } else {
                                         /* must succeed. */
                                         amdgpu_ras_resume(tmp_adev);
                                 } else {
@@ -4437,7 +4549,6 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
                 down_write(&adev->reset_sem);
         }
  
                 down_write(&adev->reset_sem);
         }
  
-       atomic_inc(&adev->gpu_reset_counter);
         switch (amdgpu_asic_reset_method(adev)) {
         case AMD_RESET_METHOD_MODE1:
                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
         switch (amdgpu_asic_reset_method(adev)) {
         case AMD_RESET_METHOD_MODE1:
                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
@@ -4461,6 +4572,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
         up_write(&adev->reset_sem);
  }
  
         up_write(&adev->reset_sem);
  }
  
+/*
+ * Lock every amdgpu device of a hive. When @adev is not part of a hive
+ * with multiple nodes (num_physical_nodes <= 1) only @adev itself is
+ * locked, i.e. this behaves like amdgpu_device_lock_adev().
+ *
+ * Returns 0 when all locks were taken, -ENODEV when @adev reports multiple
+ * xgmi nodes but @hive is NULL, and -EAGAIN when amdgpu_device_lock_adev()
+ * fails for any device. On a failure part-way through the hive the devices
+ * locked so far are unlocked again before returning, so the caller has
+ * nothing to roll back (assuming a failed amdgpu_device_lock_adev() leaves
+ * that device unlocked - TODO confirm).
+ */
+static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
+{
+       struct amdgpu_device *tmp_adev = NULL;
+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               if (!hive) {
+                       dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
+                       return -ENODEV;
+               }
+               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+                       if (!amdgpu_device_lock_adev(tmp_adev, hive))
+                               goto roll_back;
+               }
+       } else if (!amdgpu_device_lock_adev(adev, hive))
+               return -EAGAIN;
+
+       return 0;
+roll_back:
+       if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
+               /*
+                * The locking loop stopped part-way through the hive, i.e.
+                * at least one earlier device was locked by this call.
+                * This may indicate a race, or a hive device that was
+                * locked independently. That may or may not be a problem,
+                * so warn and unlock the devices locked before the failing
+                * one, walking the list backwards from it.
+                */
+               dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
+               list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+                       amdgpu_device_unlock_adev(tmp_adev);
+               }
+       }
+       return -EAGAIN;
+}
+
  static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
  {
         struct pci_dev *p = NULL;
  static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
  {
         struct pci_dev *p = NULL;
@@ -4574,11 +4725,29 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
                                 job ? job->base.id : -1, hive->hive_id);
                         amdgpu_put_xgmi_hive(hive);
                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
                                 job ? job->base.id : -1, hive->hive_id);
                         amdgpu_put_xgmi_hive(hive);
+                       if (job)
+                               drm_sched_increase_karma(&job->base);
                         return 0;
                 }
                 mutex_lock(&hive->hive_lock);
         }
  
                         return 0;
                 }
                 mutex_lock(&hive->hive_lock);
         }
  
+       /*
+        * Lock the device before we try to operate on the linked list.
+        * If we didn't get the device lock, don't touch the linked list since
+        * others may be iterating it.
+        */
+       r = amdgpu_device_lock_hive_adev(adev, hive);
+       if (r) {
+               dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
+                                       job ? job->base.id : -1);
+
+               /* even if we skipped this reset, we still need to set the job to guilty */
+               if (job)
+                       drm_sched_increase_karma(&job->base);
+               goto skip_recovery;
+       }
+
         /*
          * Build list of devices to reset.
          * In case we are in XGMI hive mode, resort the device list
         /*
          * Build list of devices to reset.
          * In case we are in XGMI hive mode, resort the device list
@@ -4586,25 +4755,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
          */
         INIT_LIST_HEAD(&device_list);
         if (adev->gmc.xgmi.num_physical_nodes > 1) {
          */
         INIT_LIST_HEAD(&device_list);
         if (adev->gmc.xgmi.num_physical_nodes > 1) {
-               if (!hive)
-                       return -ENODEV;
-               if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
-                       list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
-               device_list_handle = &hive->device_list;
+               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+                       list_add_tail(&tmp_adev->reset_list, &device_list);
+               if (!list_is_first(&adev->reset_list, &device_list))
+                       list_rotate_to_front(&adev->reset_list, &device_list);
+               device_list_handle = &device_list;
         } else {
         } else {
-               list_add_tail(&adev->gmc.xgmi.head, &device_list);
+               list_add_tail(&adev->reset_list, &device_list);
                 device_list_handle = &device_list;
         }
  
         /* block all schedulers and reset given job's ring */
                 device_list_handle = &device_list;
         }
  
         /* block all schedulers and reset given job's ring */
-       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-               if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
-                       dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
-                                 job ? job->base.id : -1);
-                       r = 0;
-                       goto skip_recovery;
-               }
-
+       list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
                 /*
                  * Try to put the audio codec into suspend state
                  * before gpu reset started.
                 /*
                  * Try to put the audio codec into suspend state
                  * before gpu reset started.
@@ -4649,6 +4811,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                         if (need_emergency_restart)
                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
                 }
                         if (need_emergency_restart)
                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
                 }
+               atomic_inc(&tmp_adev->gpu_reset_counter);
         }
  
         if (need_emergency_restart)
         }
  
         if (need_emergency_restart)
@@ -4668,7 +4831,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         }
  
  retry: /* Rest of adevs pre asic reset from XGMI hive. */
         }
  
  retry: /* Rest of adevs pre asic reset from XGMI hive. */
-       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+       list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
                 r = amdgpu_device_pre_asic_reset(tmp_adev,
                                                  (tmp_adev == adev) ? job : NULL,
                                                  &need_full_reset);
                 r = amdgpu_device_pre_asic_reset(tmp_adev,
                                                  (tmp_adev == adev) ? job : NULL,
                                                  &need_full_reset);
@@ -4695,7 +4858,7 @@ retry:    /* Rest of adevs pre asic reset from XGMI hive. */
  skip_hw_reset:
  
         /* Post ASIC reset for all devs .*/
  skip_hw_reset:
  
         /* Post ASIC reset for all devs .*/
-       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+       list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
  
                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                         struct amdgpu_ring *ring = tmp_adev->rings[i];
  
                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                         struct amdgpu_ring *ring = tmp_adev->rings[i];
@@ -4726,10 +4889,17 @@ skip_hw_reset:
         }
  
  skip_sched_resume:
         }
  
  skip_sched_resume:
-       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-               /*unlock kfd: SRIOV would do it separately */
+       list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+               /* unlock kfd: SRIOV would do it separately */
                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
                         amdgpu_amdkfd_post_reset(tmp_adev);
                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
                         amdgpu_amdkfd_post_reset(tmp_adev);
+
+               /* kfd_post_reset will do nothing if kfd device is not initialized,
+                * need to bring up kfd here if it has not been initialized before
+                */
+               if (!adev->kfd.init_complete)
+                       amdgpu_amdkfd_device_init(adev);
+
                 if (audio_suspended)
                         amdgpu_device_resume_display_audio(tmp_adev);
                 amdgpu_device_unlock_adev(tmp_adev);
                 if (audio_suspended)
                         amdgpu_device_resume_display_audio(tmp_adev);
                 amdgpu_device_unlock_adev(tmp_adev);
@@ -4742,7 +4912,7 @@ skip_recovery:
                 amdgpu_put_xgmi_hive(hive);
         }
  
                 amdgpu_put_xgmi_hive(hive);
         }
  
-       if (r)
+       if (r && r != -EAGAIN)
                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
         return r;
  }
                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
         return r;
  }
@@ -4792,7 +4962,13 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
                 } else {
                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
                 } else {
-                       if (speed_cap == PCIE_SPEED_16_0GT)
+                       if (speed_cap == PCIE_SPEED_32_0GT)
+                               adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
+                                                         CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
+                                                         CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
+                                                         CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
+                                                         CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
+                       else if (speed_cap == PCIE_SPEED_16_0GT)
                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
@@ -4812,7 +4988,13 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
                 } else {
                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
                 } else {
-                       if (platform_speed_cap == PCIE_SPEED_16_0GT)
+                       if (platform_speed_cap == PCIE_SPEED_32_0GT)
+                               adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
+                                                          CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
+                                                          CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
+                                                          CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
+                                                          CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
+                       else if (platform_speed_cap == PCIE_SPEED_16_0GT)
                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
@@ -4979,6 +5161,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
  
                         drm_sched_stop(&ring->sched, NULL);
                 }
  
                         drm_sched_stop(&ring->sched, NULL);
                 }
+               atomic_inc(&adev->gpu_reset_counter);
                 return PCI_ERS_RESULT_NEED_RESET;
         case pci_channel_io_perm_failure:
                 /* Permanent error, prepare for device removal */
                 return PCI_ERS_RESULT_NEED_RESET;
         case pci_channel_io_perm_failure:
                 /* Permanent error, prepare for device removal */
@@ -5027,7 +5210,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
         DRM_INFO("PCI error: slot reset callback!!\n");
  
         INIT_LIST_HEAD(&device_list);
         DRM_INFO("PCI error: slot reset callback!!\n");
  
         INIT_LIST_HEAD(&device_list);
-       list_add_tail(&adev->gmc.xgmi.head, &device_list);
+       list_add_tail(&adev->reset_list, &device_list);
  
         /* wait for asic to come out of reset */
         msleep(500);
  
         /* wait for asic to come out of reset */
         msleep(500);