drm/amdkfd: Avoid hanging hardware in stop_cpsch
author: Felix Kuehling <Felix.Kuehling@amd.com>
Fri, 20 Dec 2019 07:46:55 +0000 (02:46 -0500)
committer: Alex Deucher <alexander.deucher@amd.com>
Tue, 7 Jan 2020 16:55:04 +0000 (11:55 -0500)
Don't use the HWS if it's known to be hanging. During a reset, also
don't try to destroy the HIQ, because that may hang on SRIOV if the
KIQ is unresponsive.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Tested-by: Emily Deng <Emily.Deng@amd.com>
Reviewed-by: shaoyunl <shaoyun.liu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c

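diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a7e9ec1..d7eb6ac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c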
@@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm)
 static int stop_nocpsch(struct device_queue_manager *dqm)
 {
        if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
-               pm_uninit(&dqm->packets);
+               pm_uninit(&dqm->packets, false);
        dqm->sched_running = false;
 
        return 0;
@@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
        return 0;
 fail_allocate_vidmem:
 fail_set_sched_resources:
-       pm_uninit(&dqm->packets);
+       pm_uninit(&dqm->packets, false);
 fail_packet_manager_init:
        return retval;
 }
 
 static int stop_cpsch(struct device_queue_manager *dqm)
 {
+       bool hanging;
+
        dqm_lock(dqm);
-       unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+       if (!dqm->is_hws_hang)
+               unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+       hanging = dqm->is_hws_hang || dqm->is_resetting;
        dqm->sched_running = false;
        dqm_unlock(dqm);
 
        kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
-       pm_uninit(&dqm->packets);
+       pm_uninit(&dqm->packets, hanging);
 
        return 0;
 }
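diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 2d56dc5..bae7064 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c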
@@ -195,9 +195,9 @@ err_get_kernel_doorbell:
 }
 
 /* Uninitialize a kernel queue and free all its memory usages. */
-static void kq_uninitialize(struct kernel_queue *kq)
+static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
 {
-       if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
+       if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
                kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
                                        kq->queue->mqd,
                                        KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
        return NULL;
 }
 
-void kernel_queue_uninit(struct kernel_queue *kq)
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
 {
-       kq_uninitialize(kq);
+       kq_uninitialize(kq, hanging);
        kfree(kq);
 }
 
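diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 6cabed0..dc406e6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c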
@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
        return 0;
 }
 
-void pm_uninit(struct packet_manager *pm)
+void pm_uninit(struct packet_manager *pm, bool hanging)
 {
        mutex_destroy(&pm->lock);
-       kernel_queue_uninit(pm->priv_queue);
+       kernel_queue_uninit(pm->priv_queue, hanging);
 }
 
 int pm_send_set_resources(struct packet_manager *pm,
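diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index fc61b5e..6af1b58 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h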
@@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
 void device_queue_manager_uninit(struct device_queue_manager *dqm);
 struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
                                        enum kfd_queue_type type);
-void kernel_queue_uninit(struct kernel_queue *kq);
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
 int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
 
 /* Process Queue Manager */
@@ -972,7 +972,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs;
 extern const struct packet_manager_funcs kfd_v9_pm_funcs;
 
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
-void pm_uninit(struct packet_manager *pm);
+void pm_uninit(struct packet_manager *pm, bool hanging);
 int pm_send_set_resources(struct packet_manager *pm,
                                struct scheduling_resources *res);
 int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues);
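diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 1152490..31fcd1b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c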
@@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
                /* destroy kernel queue (DIQ) */
                dqm = pqn->kq->dev->dqm;
                dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
-               kernel_queue_uninit(pqn->kq);
+               kernel_queue_uninit(pqn->kq, false);
        }
 
        if (pqn->q) {