drm/amdgpu: fix using the reserved VMID with gang submit
authorChristian König <christian.koenig@amd.com>
Thu, 18 Jan 2024 12:28:55 +0000 (13:28 +0100)
committerAlex Deucher <alexander.deucher@amd.com>
Wed, 19 Jun 2024 16:48:00 +0000 (12:48 -0400)
We need to ensure that, even when using a reserved VMID, the gang
members can still run in parallel.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c

index d942c90..7dab476 100644 (file)
@@ -1441,6 +1441,7 @@ u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
                                u32 reg);
 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
                                u32 reg, u32 v);
+struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev);
 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
                                            struct dma_fence *gang);
 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
index 3fb02f5..2de3688 100644 (file)
@@ -6522,6 +6522,22 @@ void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 }
 
+/**
+ * amdgpu_device_get_gang - return a reference to the current gang
+ * @adev: amdgpu_device pointer
+ *
+ * Returns: A new reference to the current gang leader.
+ */
+struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
+{
+       struct dma_fence *fence;
+
+       rcu_read_lock();
+       fence = dma_fence_get_rcu_safe(&adev->gang_submit);
+       rcu_read_unlock();
+       return fence;
+}
+
 /**
  * amdgpu_device_switch_gang - switch to a new gang
  * @adev: amdgpu_device pointer
@@ -6538,10 +6554,7 @@ struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
 
        do {
                dma_fence_put(old);
-               rcu_read_lock();
-               old = dma_fence_get_rcu_safe(&adev->gang_submit);
-               rcu_read_unlock();
-
+               old = amdgpu_device_get_gang(adev);
                if (old == gang)
                        break;
 
index 3d7fcde..b5b9d4f 100644 (file)
@@ -290,18 +290,36 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm,
             !dma_fence_is_signaled((*id)->last_flush))) {
                struct dma_fence *tmp;
 
-               /* Don't use per engine and per process VMID at the same time */
-               if (adev->vm_manager.concurrent_flush)
-                       ring = NULL;
-
-               /* to prevent one context starved by another context */
-               (*id)->pd_gpu_addr = 0;
-               tmp = amdgpu_sync_peek_fence(&(*id)->active, ring);
-               if (tmp) {
+               /* Wait for the gang to be assembled before using a
+                * reserved VMID or otherwise the gang could deadlock.
+                */
+               tmp = amdgpu_device_get_gang(adev);
+               if (!dma_fence_is_signaled(tmp) && tmp != job->gang_submit) {
                        *id = NULL;
-                       *fence = dma_fence_get(tmp);
+                       *fence = tmp;
                        return 0;
                }
+               dma_fence_put(tmp);
+
+               /* Make sure the id is owned by the gang before proceeding */
+               if (!job->gang_submit ||
+                   (*id)->owner != vm->immediate.fence_context) {
+
+                       /* Don't use per engine and per process VMID at the
+                        * same time
+                        */
+                       if (adev->vm_manager.concurrent_flush)
+                               ring = NULL;
+
+                       /* to prevent one context starved by another context */
+                       (*id)->pd_gpu_addr = 0;
+                       tmp = amdgpu_sync_peek_fence(&(*id)->active, ring);
+                       if (tmp) {
+                               *id = NULL;
+                               *fence = dma_fence_get(tmp);
+                               return 0;
+                       }
+               }
                needs_flush = true;
        }