drm/amdkfd: Skip packet submission on fatal error
authorLijo Lazar <lijo.lazar@amd.com>
Thu, 22 Feb 2024 09:24:50 +0000 (14:54 +0530)
committerAlex Deucher <alexander.deucher@amd.com>
Mon, 26 Feb 2024 16:14:31 +0000 (11:14 -0500)
If a fatal error is detected, packet submission won't go through. Return
an error in such cases. Also, avoid waiting for the fence when a fatal
error is detected.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c

index 190039f..f5f2945 100644 (file)
@@ -742,6 +742,11 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
        amdgpu_device_flush_hdp(adev, NULL);
 }
 
+bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
+{
+       return amdgpu_ras_get_fed_status(adev);
+}
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
        enum amdgpu_ras_block block, bool reset)
 {
index e60f63c..4fb32d8 100644 (file)
@@ -337,6 +337,7 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
                                struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
                        enum amdgpu_ras_block block, bool reset);
+bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
index c0e7154..f4d395e 100644 (file)
@@ -1903,6 +1903,10 @@ int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm,
        uint64_t *fence_addr =  dqm->fence_addr;
 
        while (*fence_addr != fence_value) {
+               /* Fatal err detected, this response won't come */
+               if (amdgpu_amdkfd_is_fed(dqm->dev->adev))
+                       return -EIO;
+
                if (time_after(jiffies, end_jiffies)) {
                        dev_err(dev, "qcm fence wait loop timeout expired\n");
                        /* In HWS case, this is used to halt the driver thread
index 1bea629..32c9269 100644 (file)
@@ -286,7 +286,7 @@ err_no_space:
        return -ENOMEM;
 }
 
-void kq_submit_packet(struct kernel_queue *kq)
+int kq_submit_packet(struct kernel_queue *kq)
 {
 #ifdef DEBUG
        int i;
@@ -298,6 +298,10 @@ void kq_submit_packet(struct kernel_queue *kq)
        }
        pr_debug("\n");
 #endif
+       /* Fatal err detected, packet submission won't go through */
+       if (amdgpu_amdkfd_is_fed(kq->dev->adev))
+               return -EIO;
+
        if (kq->dev->kfd->device_info.doorbell_size == 8) {
                *kq->wptr64_kernel = kq->pending_wptr64;
                write_kernel_doorbell64(kq->queue->properties.doorbell_ptr,
@@ -307,6 +311,8 @@ void kq_submit_packet(struct kernel_queue *kq)
                write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
                                        kq->pending_wptr);
        }
+
+       return 0;
 }
 
 void kq_rollback_packet(struct kernel_queue *kq)
index 9a62444..e24ee50 100644 (file)
@@ -47,7 +47,7 @@
 int kq_acquire_packet_buffer(struct kernel_queue *kq,
                                size_t packet_size_in_dwords,
                                unsigned int **buffer_ptr);
-void kq_submit_packet(struct kernel_queue *kq);
+int kq_submit_packet(struct kernel_queue *kq);
 void kq_rollback_packet(struct kernel_queue *kq);
 
 
index 401096c..d6f65f3 100644 (file)
@@ -288,7 +288,7 @@ int pm_send_set_resources(struct packet_manager *pm,
 
        retval = pm->pmf->set_resources(pm, buffer, res);
        if (!retval)
-               kq_submit_packet(pm->priv_queue);
+               retval = kq_submit_packet(pm->priv_queue);
        else
                kq_rollback_packet(pm->priv_queue);
 
@@ -325,7 +325,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
        if (retval)
                goto fail_create_runlist;
 
-       kq_submit_packet(pm->priv_queue);
+       retval = kq_submit_packet(pm->priv_queue);
 
        mutex_unlock(&pm->lock);
 
@@ -361,7 +361,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
 
        retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value);
        if (!retval)
-               kq_submit_packet(pm->priv_queue);
+               retval = kq_submit_packet(pm->priv_queue);
        else
                kq_rollback_packet(pm->priv_queue);
 
@@ -392,7 +392,7 @@ int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period)
 
                retval = pm->pmf->set_grace_period(pm, buffer, grace_period);
                if (!retval)
-                       kq_submit_packet(pm->priv_queue);
+                       retval = kq_submit_packet(pm->priv_queue);
                else
                        kq_rollback_packet(pm->priv_queue);
        }
@@ -421,7 +421,7 @@ int pm_send_unmap_queue(struct packet_manager *pm,
 
        retval = pm->pmf->unmap_queues(pm, buffer, filter, filter_param, reset);
        if (!retval)
-               kq_submit_packet(pm->priv_queue);
+               retval = kq_submit_packet(pm->priv_queue);
        else
                kq_rollback_packet(pm->priv_queue);