drm/amdgpu: Show warning message if IH ring overflow
author Philip Yang <Philip.Yang@amd.com>
Tue, 3 Dec 2024 15:00:25 +0000 (10:00 -0500)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 18 Dec 2024 17:39:07 +0000 (12:39 -0500)
If the IH primary ring and the KFD ih fifo overflow, we may miss CP and
SDMA interrupts, causing an application soft hang. Show a warning message
with the ring name when an overflow happens.

Add a function to get the ih ring name to avoid duplicating it. To keep
the warning message consistent between GPU generations, change all
*_ih.c files except those for ASICs older than Vega, which have only one
ih ring.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
drivers/gpu/drm/amd/amdgpu/navi10_ih.c
drivers/gpu/drm/amd/amdgpu/vega10_ih.c
drivers/gpu/drm/amd/amdgpu/vega20_ih.c
drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c

index f3b0aaf..901f8b1 100644 (file)
@@ -298,3 +298,9 @@ uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr,
        dw2 = le32_to_cpu(ih->ring[ring_index + 2]);
        return dw1 | ((u64)(dw2 & 0xffff) << 32);
 }
+
+/**
+ * amdgpu_ih_ring_name - get a printable name for an IH ring
+ * @adev: amdgpu device whose irq.ih, irq.ih_soft, irq.ih1 and irq.ih2
+ *        rings @ih is compared against
+ * @ih: IH ring to identify
+ *
+ * The ring is identified purely by comparing its address with the rings
+ * embedded in @adev->irq.
+ *
+ * Returns "ih", "sw ih", "ih1" or "ih2" for the matching ring, or
+ * "unknown" if @ih is none of them.
+ */
+const char *amdgpu_ih_ring_name(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih)
+{
+       if (ih == &adev->irq.ih)
+               return "ih";
+       if (ih == &adev->irq.ih_soft)
+               return "sw ih";
+       if (ih == &adev->irq.ih1)
+               return "ih1";
+       if (ih == &adev->irq.ih2)
+               return "ih2";
+
+       return "unknown";
+}
index 508f02e..7d4395a 100644 (file)
@@ -110,4 +110,5 @@ void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev,
                                struct amdgpu_iv_entry *entry);
 uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr,
                                       signed int offset);
+const char *amdgpu_ih_ring_name(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
 #endif
index ebc2ab9..62cdfe1 100644 (file)
@@ -434,9 +434,8 @@ static u32 navi10_ih_get_wptr(struct amdgpu_device *adev,
         * this should allow us to catch up.
         */
        tmp = (wptr + 32) & ih->ptr_mask;
-       dev_warn(adev->dev, "IH ring buffer overflow "
-                "(0x%08X, 0x%08X, 0x%08X)\n",
-                wptr, ih->rptr, tmp);
+       dev_warn_ratelimited(adev->dev, "%s ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
+                            amdgpu_ih_ring_name(adev, ih), wptr, ih->rptr, tmp);
        ih->rptr = tmp;
 
        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
index 378da88..98fc694 100644 (file)
@@ -364,9 +364,8 @@ static u32 vega10_ih_get_wptr(struct amdgpu_device *adev,
         * this should allow us to catchup.
         */
        tmp = (wptr + 32) & ih->ptr_mask;
-       dev_warn(adev->dev, "IH ring buffer overflow "
-                "(0x%08X, 0x%08X, 0x%08X)\n",
-                wptr, ih->rptr, tmp);
+       dev_warn_ratelimited(adev->dev, "%s ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
+                            amdgpu_ih_ring_name(adev, ih), wptr, ih->rptr, tmp);
        ih->rptr = tmp;
 
        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
index 2c1c4b7..e9e3b2e 100644 (file)
@@ -444,9 +444,8 @@ static u32 vega20_ih_get_wptr(struct amdgpu_device *adev,
         * this should allow us to catchup.
         */
        tmp = (wptr + 32) & ih->ptr_mask;
-       dev_warn(adev->dev, "IH ring buffer overflow "
-                "(0x%08X, 0x%08X, 0x%08X)\n",
-                wptr, ih->rptr, tmp);
+       dev_warn_ratelimited(adev->dev, "%s ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
+                            amdgpu_ih_ring_name(adev, ih), wptr, ih->rptr, tmp);
        ih->rptr = tmp;
 
        tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
index 6beb786..783c2f5 100644 (file)
@@ -108,8 +108,8 @@ void kfd_interrupt_exit(struct kfd_node *node)
 bool enqueue_ih_ring_entry(struct kfd_node *node, const void *ih_ring_entry)
 {
        if (kfifo_is_full(&node->ih_fifo)) {
-               dev_dbg_ratelimited(node->adev->dev,
-                                   "Interrupt ring overflow, dropping interrupt\n");
+               dev_warn_ratelimited(node->adev->dev, "KFD node %d ih_fifo overflow\n",
+                                    node->node_id);
                return false;
        }