drm/amdgpu: Fix spelling mistake "disabed" -> "disabled"
[linux-2.6-microblaze.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_device.c
index 0b3db21..c355207 100644 (file)
@@ -1224,6 +1224,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
                }
        }
 
+       /* Don't post if we need to reset whole hive on init */
+       if (adev->gmc.xgmi.pending_reset)
+               return false;
+
        if (adev->has_hw_reset) {
                adev->has_hw_reset = false;
                return true;
@@ -2154,6 +2158,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
                        if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
                                continue;
 
+                       if (!adev->ip_blocks[i].status.sw)
+                               continue;
+
                        /* no need to do the fw loading again if already done*/
                        if (adev->ip_blocks[i].status.hw == true)
                                break;
@@ -2294,7 +2301,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
        if (adev->gmc.xgmi.num_physical_nodes > 1)
                amdgpu_xgmi_add_device(adev);
-       amdgpu_amdkfd_device_init(adev);
+
+       /* Don't init kfd if whole hive need to be reset during init */
+       if (!adev->gmc.xgmi.pending_reset)
+               amdgpu_amdkfd_device_init(adev);
 
        amdgpu_fru_get_product_info(adev);
 
@@ -2739,6 +2749,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
                        adev->ip_blocks[i].status.hw = false;
                        continue;
                }
+
+               /* skip unnecessary suspend if we do not initialize them yet */
+               if (adev->gmc.xgmi.pending_reset &&
+                   !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+                       adev->ip_blocks[i].status.hw = false;
+                       continue;
+               }
                /* XXX handle errors */
                r = adev->ip_blocks[i].version->funcs->suspend(adev);
                /* XXX handle errors */
@@ -3414,10 +3434,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
         *  E.g., driver was not cleanly unloaded previously, etc.
         */
        if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-               r = amdgpu_asic_reset(adev);
-               if (r) {
-                       dev_err(adev->dev, "asic reset on init failed\n");
-                       goto failed;
+               if (adev->gmc.xgmi.num_physical_nodes) {
+                       dev_info(adev->dev, "Pending hive reset.\n");
+                       adev->gmc.xgmi.pending_reset = true;
+                       /* Only need to init necessary block for SMU to handle the reset */
+                       for (i = 0; i < adev->num_ip_blocks; i++) {
+                               if (!adev->ip_blocks[i].status.valid)
+                                       continue;
+                               if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+                                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
+                                       DRM_DEBUG("IP %s disabled for hw_init.\n",
+                                               adev->ip_blocks[i].version->funcs->name);
+                                       adev->ip_blocks[i].status.hw = true;
+                               }
+                       }
+               } else {
+                       r = amdgpu_asic_reset(adev);
+                       if (r) {
+                               dev_err(adev->dev, "asic reset on init failed\n");
+                               goto failed;
+                       }
                }
        }
 
@@ -3548,19 +3586,19 @@ fence_driver_init:
        /* enable clockgating, etc. after ib tests, etc. since some blocks require
         * explicit gating rather than handling it automatically.
         */
-       r = amdgpu_device_ip_late_init(adev);
-       if (r) {
-               dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
-               amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
-               goto failed;
+       if (!adev->gmc.xgmi.pending_reset) {
+               r = amdgpu_device_ip_late_init(adev);
+               if (r) {
+                       dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
+                       amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
+                       goto failed;
+               }
+               /* must succeed. */
+               amdgpu_ras_resume(adev);
+               queue_delayed_work(system_wq, &adev->delayed_init_work,
+                                  msecs_to_jiffies(AMDGPU_RESUME_MS));
        }
 
-       /* must succeed. */
-       amdgpu_ras_resume(adev);
-
-       queue_delayed_work(system_wq, &adev->delayed_init_work,
-                          msecs_to_jiffies(AMDGPU_RESUME_MS));
-
        if (amdgpu_sriov_vf(adev))
                flush_delayed_work(&adev->delayed_init_work);
 
@@ -3577,6 +3615,14 @@ fence_driver_init:
        if (amdgpu_device_cache_pci_state(adev->pdev))
                pci_restore_state(pdev);
 
+       /* Enable lightSBR on SMU in passthrough + xgmi configuration */
+       if (amdgpu_passthrough(adev) && adev->gmc.xgmi.num_physical_nodes > 1)
+               smu_set_light_sbr(&adev->smu, true);
+
+       if (adev->gmc.xgmi.pending_reset)
+               queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
+                                  msecs_to_jiffies(AMDGPU_RESUME_MS));
+
        return 0;
 
 failed:
@@ -4287,14 +4333,16 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
         return ret;
 }
 
-static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
-                                       struct amdgpu_job *job,
-                                       bool *need_full_reset_arg)
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+                                 struct amdgpu_job *job,
+                                 bool *need_full_reset_arg)
 {
        int i, r = 0;
        bool need_full_reset  = *need_full_reset_arg;
 
-       amdgpu_debugfs_wait_dump(adev);
+       /* no need to dump if device is not in good state during probe period */
+       if (!adev->gmc.xgmi.pending_reset)
+               amdgpu_debugfs_wait_dump(adev);
 
        if (amdgpu_sriov_vf(adev)) {
                /* stop the data exchange thread */
@@ -4340,10 +4388,10 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
        return r;
 }
 
-static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
-                              struct list_head *device_list_handle,
-                              bool *need_full_reset_arg,
-                              bool skip_hw_reset)
+int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+                         struct list_head *device_list_handle,
+                         bool *need_full_reset_arg,
+                         bool skip_hw_reset)
 {
        struct amdgpu_device *tmp_adev = NULL;
        bool need_full_reset = *need_full_reset_arg, vram_lost = false;
@@ -4357,6 +4405,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
                        /* For XGMI run all resets in parallel to speed up the process */
                        if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+                               tmp_adev->gmc.xgmi.pending_reset = false;
                                if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
                                        r = -EALREADY;
                        } else
@@ -4395,10 +4444,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
        list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
                if (need_full_reset) {
                        /* post card */
-                       if (amdgpu_device_asic_init(tmp_adev))
+                       r = amdgpu_device_asic_init(tmp_adev);
+                       if (r) {
                                dev_warn(tmp_adev->dev, "asic atom init failed!");
-
-                       if (!r) {
+                       } else {
                                dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
                                r = amdgpu_device_ip_resume_phase1(tmp_adev);
                                if (r)
@@ -4431,6 +4480,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                                 */
                                amdgpu_register_gpu_instance(tmp_adev);
 
+                               if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
+                                       amdgpu_xgmi_add_device(tmp_adev);
+
                                r = amdgpu_device_ip_late_init(tmp_adev);
                                if (r)
                                        goto out;