drm/amdkfd: Add user queue eviction restore SMI event
Author: Philip Yang <Philip.Yang@amd.com>
Fri, 14 Jan 2022 02:24:20 +0000 (21:24 -0500)
Committer: Alex Deucher <alexander.deucher@amd.com>
Thu, 30 Jun 2022 19:31:14 +0000 (15:31 -0400)
Output user queue eviction and restore events. A user queue eviction may be
triggered by the SVM or userptr MMU notifier, TTM eviction, device suspend,
or CRIU checkpoint and restore.

User queue restore may be rescheduled if another eviction happens while
restoring.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process.c
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
drivers/gpu/drm/amd/amdkfd/kfd_svm.c

index b25b41f..73bf8b5 100644 (file)
@@ -336,7 +336,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
 }
 #endif
 /* KGD2KFD callbacks */
-int kgd2kfd_quiesce_mm(struct mm_struct *mm);
+int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger);
 int kgd2kfd_resume_mm(struct mm_struct *mm);
 int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
                                                struct dma_fence *fence);
index 0036c9e..2fcc6e0 100644 (file)
@@ -32,6 +32,7 @@
 #include "amdgpu_dma_buf.h"
 #include <uapi/linux/kfd_ioctl.h>
 #include "amdgpu_xgmi.h"
+#include "kfd_smi_events.h"
 
 /* Userptr restore delay, just long enough to allow consecutive VM
  * changes to accumulate
@@ -2346,7 +2347,7 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
        evicted_bos = atomic_inc_return(&process_info->evicted_bos);
        if (evicted_bos == 1) {
                /* First eviction, stop the queues */
-               r = kgd2kfd_quiesce_mm(mm);
+               r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
                if (r)
                        pr_err("Failed to quiesce KFD\n");
                schedule_delayed_work(&process_info->restore_userptr_work,
@@ -2620,13 +2621,16 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
 
 unlock_out:
        mutex_unlock(&process_info->lock);
-       mmput(mm);
-       put_task_struct(usertask);
 
        /* If validation failed, reschedule another attempt */
-       if (evicted_bos)
+       if (evicted_bos) {
                schedule_delayed_work(&process_info->restore_userptr_work,
                        msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
+
+               kfd_smi_event_queue_restore_rescheduled(mm);
+       }
+       mmput(mm);
+       put_task_struct(usertask);
 }
 
 /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
index d075882..2b3d8bc 100644 (file)
@@ -2434,7 +2434,7 @@ static int criu_restore(struct file *filep,
         * Set the process to evicted state to avoid running any new queues before all the memory
         * mappings are ready.
         */
-       ret = kfd_process_evict_queues(p);
+       ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
        if (ret)
                goto exit_unlock;
 
@@ -2553,7 +2553,7 @@ static int criu_process_info(struct file *filep,
                goto err_unlock;
        }
 
-       ret = kfd_process_evict_queues(p);
+       ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
        if (ret)
                goto err_unlock;
 
index c8fee0d..6ec0e9f 100644 (file)
@@ -837,7 +837,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
        spin_unlock_irqrestore(&kfd->interrupt_lock, flags);
 }
 
-int kgd2kfd_quiesce_mm(struct mm_struct *mm)
+int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger)
 {
        struct kfd_process *p;
        int r;
@@ -851,7 +851,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
                return -ESRCH;
 
        WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
-       r = kfd_process_evict_queues(p);
+       r = kfd_process_evict_queues(p, trigger);
 
        kfd_unref_process(p);
        return r;
index 4c4bbd4..d03a3b9 100644 (file)
@@ -947,7 +947,7 @@ static inline struct kfd_process_device *kfd_process_device_from_gpuidx(
 }
 
 void kfd_unref_process(struct kfd_process *p);
-int kfd_process_evict_queues(struct kfd_process *p);
+int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
 int kfd_process_restore_queues(struct kfd_process *p);
 void kfd_suspend_all_processes(void);
 int kfd_resume_all_processes(void);
index a13e60d..fc38a4d 100644 (file)
@@ -43,6 +43,7 @@ struct mm_struct;
 #include "kfd_device_queue_manager.h"
 #include "kfd_iommu.h"
 #include "kfd_svm.h"
+#include "kfd_smi_events.h"
 
 /*
  * List of struct kfd_process (field kfd_process).
@@ -1736,7 +1737,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
  * Eviction is reference-counted per process-device. This means multiple
  * evictions from different sources can be nested safely.
  */
-int kfd_process_evict_queues(struct kfd_process *p)
+int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
 {
        int r = 0;
        int i;
@@ -1745,6 +1746,9 @@ int kfd_process_evict_queues(struct kfd_process *p)
        for (i = 0; i < p->n_pdds; i++) {
                struct kfd_process_device *pdd = p->pdds[i];
 
+               kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid,
+                                            trigger);
+
                r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
                                                            &pdd->qpd);
                /* evict return -EIO if HWS is hang or asic is resetting, in this case
@@ -1769,6 +1773,9 @@ fail:
 
                if (n_evicted == 0)
                        break;
+
+               kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
+
                if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
                                                              &pdd->qpd))
                        pr_err("Failed to restore queues\n");
@@ -1788,6 +1795,8 @@ int kfd_process_restore_queues(struct kfd_process *p)
        for (i = 0; i < p->n_pdds; i++) {
                struct kfd_process_device *pdd = p->pdds[i];
 
+               kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
+
                r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
                                                              &pdd->qpd);
                if (r) {
@@ -1849,7 +1858,7 @@ static void evict_process_worker(struct work_struct *work)
        flush_delayed_work(&p->restore_work);
 
        pr_debug("Started evicting pasid 0x%x\n", p->pasid);
-       ret = kfd_process_evict_queues(p);
+       ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
        if (!ret) {
                dma_fence_signal(p->ef);
                dma_fence_put(p->ef);
@@ -1916,7 +1925,7 @@ void kfd_suspend_all_processes(void)
                cancel_delayed_work_sync(&p->eviction_work);
                cancel_delayed_work_sync(&p->restore_work);
 
-               if (kfd_process_evict_queues(p))
+               if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
                        pr_err("Failed to suspend process 0x%x\n", p->pasid);
                dma_fence_signal(p->ef);
                dma_fence_put(p->ef);
index ec4d278..3917c38 100644 (file)
@@ -283,6 +283,41 @@ void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
                          from, to, trigger);
 }
 
+void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
+                                 uint32_t trigger)
+{
+       kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_EVICTION,
+                         "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
+                         dev->id, trigger);
+}
+
+void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid)
+{
+       kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_RESTORE,
+                         "%lld -%d %x\n", ktime_get_boottime_ns(), pid,
+                         dev->id);
+}
+
+void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
+{
+       struct kfd_process *p;
+       int i;
+
+       p = kfd_lookup_process_by_mm(mm);
+       if (!p)
+               return;
+
+       for (i = 0; i < p->n_pdds; i++) {
+               struct kfd_process_device *pdd = p->pdds[i];
+
+               kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
+                                 KFD_SMI_EVENT_QUEUE_RESTORE,
+                                 "%lld -%d %x %c\n", ktime_get_boottime_ns(),
+                                 p->lead_thread->pid, pdd->dev->id, 'R');
+       }
+       kfd_unref_process(p);
+}
+
 int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
 {
        struct kfd_smi_client *client;
index ec5d74a..b232926 100644 (file)
@@ -42,4 +42,8 @@ void kfd_smi_event_migration_start(struct kfd_dev *dev, pid_t pid,
 void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
                             unsigned long start, unsigned long end,
                             uint32_t from, uint32_t to, uint32_t trigger);
+void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
+                                 uint32_t trigger);
+void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid);
+void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm);
 #endif
index e8ded7a..8bfb7b9 100644 (file)
@@ -1730,14 +1730,16 @@ out_reschedule:
        mutex_unlock(&svms->lock);
        mmap_write_unlock(mm);
        mutex_unlock(&process_info->lock);
-       mmput(mm);
 
        /* If validation failed, reschedule another attempt */
        if (evicted_ranges) {
                pr_debug("reschedule to restore svm range\n");
                schedule_delayed_work(&svms->restore_work,
                        msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
+
+               kfd_smi_event_queue_restore_rescheduled(mm);
        }
+       mmput(mm);
 }
 
 /**
@@ -1793,7 +1795,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
                         prange->svms, prange->start, prange->last);
 
                /* First eviction, stop the queues */
-               r = kgd2kfd_quiesce_mm(mm);
+               r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
                if (r)
                        pr_debug("failed to quiesce KFD\n");