drm/amdkfd: Add GPU reset SMI event
author Mukul Joshi <mukul.joshi@amd.com>
Fri, 28 Aug 2020 22:50:42 +0000 (18:50 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 31 Aug 2020 18:40:03 +0000 (14:40 -0400)
Add support for reporting GPU reset events through SMI. KFD
would report both pre and post GPU reset events.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
include/uapi/linux/kfd_ioctl.h

index e1cd659..0e71a05 100644 (file)
@@ -812,6 +812,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
        if (!kfd->init_complete)
                return 0;
 
+       kfd_smi_event_update_gpu_reset(kfd, false);
+
        kfd->dqm->ops.pre_reset(kfd->dqm);
 
        kgd2kfd_suspend(kfd, false);
@@ -840,6 +842,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 
        atomic_set(&kfd->sram_ecc_flag, 0);
 
+       kfd_smi_event_update_gpu_reset(kfd, true);
+
        return 0;
 }
 
index f14beb9..023629f 100644 (file)
@@ -312,6 +312,8 @@ struct kfd_dev {
        /* Clients watching SMI events */
        struct list_head smi_clients;
        spinlock_t smi_lock;
+
+       uint32_t reset_seq_num;
 };
 
 enum kfd_mempool {
index 4d4b6e3..17d1736 100644 (file)
@@ -174,6 +174,36 @@ static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event,
        rcu_read_unlock();
 }
 
+void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset)
+{
+       /*
+        * GPU reset msg = "<event> <seq num>\n", both in hex. The seq num
+        * is incremented on pre-reset only; post-reset sends it unchanged.
+        * 1 byte event + 1 byte space + 8 bytes seq num +
+        * 1 byte \n + 1 byte \0 = 12
+        */
+       char fifo_in[12];
+       int len;
+       unsigned int event;
+
+       if (list_empty(&dev->smi_clients))
+               return;
+
+       memset(fifo_in, 0x0, sizeof(fifo_in));
+
+       if (post_reset) {
+               event = KFD_SMI_EVENT_GPU_POST_RESET;
+       } else {
+               event = KFD_SMI_EVENT_GPU_PRE_RESET;
+               ++(dev->reset_seq_num);
+       }
+
+       len = snprintf(fifo_in, sizeof(fifo_in), "%x %x\n", event,
+                                               dev->reset_seq_num);
+
+       add_event_to_kfifo(dev, event, fifo_in, len);
+}
+
 void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
                                             uint32_t throttle_bitmask)
 {
@@ -191,7 +221,7 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
        if (list_empty(&dev->smi_clients))
                return;
 
-       len = snprintf(fifo_in, 29, "%x %x:%llx\n",
+       len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%llx\n",
                       KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask,
                       atomic64_read(&adev->smu.throttle_int_counter));
 
@@ -218,7 +248,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
        if (!task_info.pid)
                return;
 
-       len = snprintf(fifo_in, 29, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
+       len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
                task_info.pid, task_info.task_name);
 
        add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
index 15537b2..b9b0438 100644 (file)
@@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
 void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
 void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
                                             uint32_t throttle_bitmask);
+void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);
 
 #endif
index cb1f963..8b7368b 100644 (file)
@@ -453,6 +453,8 @@ enum kfd_smi_event {
         KFD_SMI_EVENT_NONE = 0, /* not used */
         KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */
         KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
+       KFD_SMI_EVENT_GPU_PRE_RESET = 3,
+       KFD_SMI_EVENT_GPU_POST_RESET = 4,
 };
 
 #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))