drm/amdkfd: Handle queue destroy buffer access race
author Philip Yang <Philip.Yang@amd.com>
Fri, 2 Aug 2024 15:28:45 +0000 (11:28 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 13 Aug 2024 14:43:16 +0000 (10:43 -0400)
Add a helper function, kfd_queue_unref_bo_vas, to decrease the queue buffer
refcounts, keeping that step separate from releasing the queue buffers.

Because taking the vm lock while holding dqm_lock would be circular locking,
kfd_ioctl_destroy_queue must take the vm lock and unreference the queue
buffers first, without releasing them, so that a failure to take the vm lock
can still be handled as an error. Only then does it take dqm_lock to remove
the queue from the queue list, and finally it releases the queue buffers.

The restore-process worker restores queues while holding dqm_lock, so it
will always find queues with valid queue buffers.
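
To illustrate why (pseudocode only, not the actual restore-worker source;
dqm_lock()/dqm_unlock() and qpd->queues_list are the real names):

	dqm_lock(dqm);
	list_for_each_entry(q, &qpd->queues_list, list) {
		/* restore the MQD using q->properties.ring_bo,
		 * wptr_bo, rptr_bo, eop_buf_bo, cwsr_bo
		 */
	}
	dqm_unlock(dqm);

Because pqm_destroy_queue() now removes the queue from this list under
dqm_lock before kfd_queue_release_buffers() drops the BO references, any
queue the worker can still find owns valid buffer references.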

v2 (Felix):
- renamed kfd_queue_unreference_buffer(s) to kfd_queue_unref_bo_va(s)
- added two FIXME comments for follow-up

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Felix Kuehling <felix.kuehling@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_queue.c

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 0622ebd..00350ec 100644
@@ -400,6 +400,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
        return 0;
 
 err_create_queue:
+       kfd_queue_unref_bo_vas(pdd, &q_properties);
        kfd_queue_release_buffers(pdd, &q_properties);
 err_acquire_queue_buf:
 err_sdma_engine_id:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 057d204..f7c12d4 100644
@@ -1298,9 +1298,12 @@ void print_queue_properties(struct queue_properties *q);
 void print_queue(struct queue *q);
 int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo,
                         u64 expected_size);
-void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
+void kfd_queue_buffer_put(struct amdgpu_bo **bo);
 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
+void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
+int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd,
+                          struct queue_properties *properties);
 void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev);
 
 struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index f732ee3..20ea745 100644
@@ -217,6 +217,7 @@ void pqm_uninit(struct process_queue_manager *pqm)
        list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
                if (pqn->q) {
                        pdd = kfd_get_process_device_data(pqn->q->device, pqm->process);
+                       kfd_queue_unref_bo_vas(pdd, &pqn->q->properties);
                        kfd_queue_release_buffers(pdd, &pqn->q->properties);
                        pqm_clean_queue_resource(pqm, pqn);
                }
@@ -512,7 +513,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
        }
 
        if (pqn->q) {
-               retval = kfd_queue_release_buffers(pdd, &pqn->q->properties);
+               retval = kfd_queue_unref_bo_vas(pdd, &pqn->q->properties);
                if (retval)
                        goto err_destroy_queue;
 
@@ -526,7 +527,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
                        if (retval != -ETIME)
                                goto err_destroy_queue;
                }
-
+               kfd_queue_release_buffers(pdd, &pqn->q->properties);
                pqm_clean_queue_resource(pqm, pqn);
                uninit_queue(pqn->q);
        }
@@ -579,7 +580,8 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm,
                        return -EFAULT;
                }
 
-               kfd_queue_buffer_put(vm, &pqn->q->properties.ring_bo);
+               kfd_queue_unref_bo_va(vm, &pqn->q->properties.ring_bo);
+               kfd_queue_buffer_put(&pqn->q->properties.ring_bo);
                amdgpu_bo_unreserve(vm->root.bo);
 
                pqn->q->properties.ring_bo = p->ring_bo;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index e0a073a..ad29634 100644
@@ -224,16 +224,9 @@ out_err:
        return -EINVAL;
 }
 
-void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
+/* FIXME: remove this function, just call amdgpu_bo_unref directly */
+void kfd_queue_buffer_put(struct amdgpu_bo **bo)
 {
-       if (*bo) {
-               struct amdgpu_bo_va *bo_va;
-
-               bo_va = amdgpu_vm_bo_find(vm, *bo);
-               if (bo_va)
-                       bo_va->queue_refcount--;
-       }
-
        amdgpu_bo_unref(bo);
 }
 
@@ -327,6 +320,10 @@ out_unreserve:
 out_err_unreserve:
        amdgpu_bo_unreserve(vm->root.bo);
 out_err_release:
+       /* FIXME: make a _locked version of this that can be called before
+        * dropping the VM reservation.
+        */
+       kfd_queue_unref_bo_vas(pdd, properties);
        kfd_queue_release_buffers(pdd, properties);
        return err;
 }
@@ -334,22 +331,13 @@ out_err_release:
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
 {
        struct kfd_topology_device *topo_dev;
-       struct amdgpu_vm *vm;
        u32 total_cwsr_size;
-       int err;
 
-       vm = drm_priv_to_vm(pdd->drm_priv);
-       err = amdgpu_bo_reserve(vm->root.bo, false);
-       if (err)
-               return err;
-
-       kfd_queue_buffer_put(vm, &properties->wptr_bo);
-       kfd_queue_buffer_put(vm, &properties->rptr_bo);
-       kfd_queue_buffer_put(vm, &properties->ring_bo);
-       kfd_queue_buffer_put(vm, &properties->eop_buf_bo);
-       kfd_queue_buffer_put(vm, &properties->cwsr_bo);
-
-       amdgpu_bo_unreserve(vm->root.bo);
+       kfd_queue_buffer_put(&properties->wptr_bo);
+       kfd_queue_buffer_put(&properties->rptr_bo);
+       kfd_queue_buffer_put(&properties->ring_bo);
+       kfd_queue_buffer_put(&properties->eop_buf_bo);
+       kfd_queue_buffer_put(&properties->cwsr_bo);
 
        topo_dev = kfd_topology_device_by_id(pdd->dev->id);
        if (!topo_dev)
@@ -362,6 +350,38 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope
        return 0;
 }
 
+void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
+{
+       if (*bo) {
+               struct amdgpu_bo_va *bo_va;
+
+               bo_va = amdgpu_vm_bo_find(vm, *bo);
+               if (bo_va && bo_va->queue_refcount)
+                       bo_va->queue_refcount--;
+       }
+}
+
+int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd,
+                          struct queue_properties *properties)
+{
+       struct amdgpu_vm *vm;
+       int err;
+
+       vm = drm_priv_to_vm(pdd->drm_priv);
+       err = amdgpu_bo_reserve(vm->root.bo, false);
+       if (err)
+               return err;
+
+       kfd_queue_unref_bo_va(vm, &properties->wptr_bo);
+       kfd_queue_unref_bo_va(vm, &properties->rptr_bo);
+       kfd_queue_unref_bo_va(vm, &properties->ring_bo);
+       kfd_queue_unref_bo_va(vm, &properties->eop_buf_bo);
+       kfd_queue_unref_bo_va(vm, &properties->cwsr_bo);
+
+       amdgpu_bo_unreserve(vm->root.bo);
+       return 0;
+}
+
 #define SGPR_SIZE_PER_CU       0x4000
 #define LDS_SIZE_PER_CU                0x10000
 #define HWREG_SIZE_PER_CU      0x1000