drm/amdgpu/gfx9.4.3: Enable bad opcode interrupt
authorAlex Deucher <alexander.deucher@amd.com>
Fri, 12 Jul 2024 22:57:14 +0000 (18:57 -0400)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 23 Jul 2024 21:45:46 +0000 (17:45 -0400)
For the bad opcode case, it will cause CP/ME hang.
The firmware will prevent the ME side from hanging by raising a bad opcode interrupt.
And the driver needs to perform a vmid reset when receiving the interrupt.

Acked-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c

index 43a3ef2..98fe6c4 100644 (file)
@@ -901,6 +901,13 @@ static int gfx_v9_4_3_sw_init(void *handle)
        if (r)
                return r;
 
+       /* Bad opcode Event */
+       r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP,
+                             GFX_9_0__SRCID__CP_BAD_OPCODE_ERROR,
+                             &adev->gfx.bad_op_irq);
+       if (r)
+               return r;
+
        /* Privileged reg */
        r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP, GFX_9_0__SRCID__CP_PRIV_REG_FAULT,
                              &adev->gfx.priv_reg_irq);
@@ -2162,6 +2169,7 @@ static int gfx_v9_4_3_hw_fini(void *handle)
 
        amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
        amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
+       amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0);
 
        num_xcc = NUM_XCC(adev->gfx.xcc_mask);
        for (i = 0; i < num_xcc; i++) {
@@ -2327,6 +2335,10 @@ static int gfx_v9_4_3_late_init(void *handle)
        if (r)
                return r;
 
+       r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0);
+       if (r)
+               return r;
+
        if (adev->gfx.ras &&
            adev->gfx.ras->enable_watchdog_timer)
                adev->gfx.ras->enable_watchdog_timer(adev);
@@ -2964,6 +2976,46 @@ static int gfx_v9_4_3_set_priv_reg_fault_state(struct amdgpu_device *adev,
        return 0;
 }
 
+static int gfx_v9_4_3_set_bad_op_fault_state(struct amdgpu_device *adev,
+                                            struct amdgpu_irq_src *source,
+                                            unsigned type,
+                                            enum amdgpu_interrupt_state state)
+{
+       u32 mec_int_cntl_reg, mec_int_cntl;
+       int i, j, k, num_xcc;
+
+       num_xcc = NUM_XCC(adev->gfx.xcc_mask);
+       switch (state) {
+       case AMDGPU_IRQ_STATE_DISABLE:
+       case AMDGPU_IRQ_STATE_ENABLE:
+               for (i = 0; i < num_xcc; i++) {
+                       WREG32_FIELD15_PREREG(GC, GET_INST(GC, i), CP_INT_CNTL_RING0,
+                                             OPCODE_ERROR_INT_ENABLE,
+                                             state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
+                       for (j = 0; j < adev->gfx.mec.num_mec; j++) {
+                               for (k = 0; k < adev->gfx.mec.num_pipe_per_mec; k++) {
+                                       /* MECs start at 1 */
+                                       mec_int_cntl_reg = gfx_v9_4_3_get_cpc_int_cntl(adev, i, j + 1, k);
+
+                                       if (mec_int_cntl_reg) {
+                                               mec_int_cntl = RREG32_XCC(mec_int_cntl_reg, i);
+                                               mec_int_cntl = REG_SET_FIELD(mec_int_cntl, CP_ME1_PIPE0_INT_CNTL,
+                                                                            OPCODE_ERROR_INT_ENABLE,
+                                                                            state == AMDGPU_IRQ_STATE_ENABLE ?
+                                                                            1 : 0);
+                                               WREG32_XCC(mec_int_cntl_reg, mec_int_cntl, i);
+                                       }
+                               }
+                       }
+               }
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+}
+
 static int gfx_v9_4_3_set_priv_inst_fault_state(struct amdgpu_device *adev,
                                              struct amdgpu_irq_src *source,
                                              unsigned type,
@@ -3116,6 +3168,15 @@ static int gfx_v9_4_3_priv_reg_irq(struct amdgpu_device *adev,
        return 0;
 }
 
+static int gfx_v9_4_3_bad_op_irq(struct amdgpu_device *adev,
+                                struct amdgpu_irq_src *source,
+                                struct amdgpu_iv_entry *entry)
+{
+       DRM_ERROR("Illegal opcode in command stream\n");
+       gfx_v9_4_3_fault(adev, entry);
+       return 0;
+}
+
 static int gfx_v9_4_3_priv_inst_irq(struct amdgpu_device *adev,
                                  struct amdgpu_irq_src *source,
                                  struct amdgpu_iv_entry *entry)
@@ -4228,6 +4289,11 @@ static const struct amdgpu_irq_src_funcs gfx_v9_4_3_priv_reg_irq_funcs = {
        .process = gfx_v9_4_3_priv_reg_irq,
 };
 
+static const struct amdgpu_irq_src_funcs gfx_v9_4_3_bad_op_irq_funcs = {
+       .set = gfx_v9_4_3_set_bad_op_fault_state,
+       .process = gfx_v9_4_3_bad_op_irq,
+};
+
 static const struct amdgpu_irq_src_funcs gfx_v9_4_3_priv_inst_irq_funcs = {
        .set = gfx_v9_4_3_set_priv_inst_fault_state,
        .process = gfx_v9_4_3_priv_inst_irq,
@@ -4241,6 +4307,9 @@ static void gfx_v9_4_3_set_irq_funcs(struct amdgpu_device *adev)
        adev->gfx.priv_reg_irq.num_types = 1;
        adev->gfx.priv_reg_irq.funcs = &gfx_v9_4_3_priv_reg_irq_funcs;
 
+       adev->gfx.bad_op_irq.num_types = 1;
+       adev->gfx.bad_op_irq.funcs = &gfx_v9_4_3_bad_op_irq_funcs;
+
        adev->gfx.priv_inst_irq.num_types = 1;
        adev->gfx.priv_inst_irq.funcs = &gfx_v9_4_3_priv_inst_irq_funcs;
 }