accel/habanalabs: fix glbl error cause handling
author Tomer Tayar <ttayar@habana.ai>
Thu, 25 Jan 2024 20:59:02 +0000 (22:59 +0200)
committer Oded Gabbay <ogabbay@kernel.org>
Mon, 26 Feb 2024 07:47:00 +0000 (09:47 +0200)
The glbl error cause handling wrongly assumes that all error
bits are consecutive.
Fix the handling to check all relevant error bits per ASIC.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/accel/habanalabs/common/habanalabs.h
drivers/accel/habanalabs/common/security.c
drivers/accel/habanalabs/common/security.h
drivers/accel/habanalabs/gaudi2/gaudi2.c
drivers/accel/habanalabs/gaudi2/gaudi2P.h

index 2a32492..d85e1d1 100644 (file)
@@ -647,7 +647,7 @@ struct hl_hints_range {
  * @num_engine_cores: number of engine cpu cores.
  * @max_num_of_engines: maximum number of all engines in the ASIC.
  * @num_of_special_blocks: special_blocks array size.
- * @glbl_err_cause_num: global err cause number.
+ * @glbl_err_max_cause_num: global err max cause number.
  * @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is
  *                 not supported.
  * @reserved_fw_mem_size: size in MB of dram memory reserved for FW.
@@ -779,7 +779,7 @@ struct asic_fixed_properties {
        u32                             num_engine_cores;
        u32                             max_num_of_engines;
        u32                             num_of_special_blocks;
-       u32                             glbl_err_cause_num;
+       u32                             glbl_err_max_cause_num;
        u32                             hbw_flush_reg;
        u32                             reserved_fw_mem_size;
        u16                             collective_first_sob;
index fe91396..5402a3c 100644 (file)
@@ -7,15 +7,31 @@
 
 #include "habanalabs.h"
 
-static const char * const hl_glbl_error_cause[HL_MAX_NUM_OF_GLBL_ERR_CAUSE] = {
+static const char * const hl_glbl_error_cause[] = {
        "Error due to un-priv read",
        "Error due to un-secure read",
        "Error due to read from unmapped reg",
        "Error due to un-priv write",
        "Error due to un-secure write",
        "Error due to write to unmapped reg",
+       "N/A",
+       "N/A",
+       "N/A",
+       "N/A",
+       "N/A",
+       "N/A",
+       "N/A",
+       "N/A",
+       "N/A",
+       "N/A",
        "External I/F write sec violation",
        "External I/F write to un-mapped reg",
+       "N/A",
+       "N/A",
+       "N/A",
+       "N/A",
+       "N/A",
+       "N/A",
        "Read to write only",
        "Write to read only"
 };
@@ -671,10 +687,11 @@ static bool hl_check_block_range_exclusion(struct hl_device *hdev,
 static int hl_read_glbl_errors(struct hl_device *hdev,
                u32 blk_idx, u32 major, u32 minor, u32 sub_minor, void *data)
 {
-       struct hl_special_block_info *special_blocks = hdev->asic_prop.special_blocks;
+       struct asic_fixed_properties *prop = &hdev->asic_prop;
+       struct hl_special_block_info *special_blocks = prop->special_blocks;
        struct hl_special_block_info *current_block = &special_blocks[blk_idx];
        u32 glbl_err_addr, glbl_err_cause, addr_val, cause_val, block_base,
-               base = current_block->base_addr - lower_32_bits(hdev->asic_prop.cfg_base_address);
+               base = current_block->base_addr - lower_32_bits(prop->cfg_base_address);
        int i;
 
        block_base = base + major * current_block->major_offset +
@@ -689,13 +706,13 @@ static int hl_read_glbl_errors(struct hl_device *hdev,
        glbl_err_addr = block_base + HL_GLBL_ERR_ADDR_OFFSET;
        addr_val = RREG32(glbl_err_addr);
 
-       for (i = 0 ; i < hdev->asic_prop.glbl_err_cause_num ; i++) {
+       for (i = 0 ; i <= prop->glbl_err_max_cause_num ; i++) {
                if (cause_val & BIT(i))
                        dev_err_ratelimited(hdev->dev,
-                               "%s, addr %#llx\n",
-                               hl_glbl_error_cause[i],
-                               hdev->asic_prop.cfg_base_address + block_base +
-                               FIELD_GET(HL_GLBL_ERR_ADDRESS_MASK, addr_val));
+                                       "%s, addr %#llx\n",
+                                       hl_glbl_error_cause[i],
+                                       prop->cfg_base_address + block_base +
+                                               FIELD_GET(HL_GLBL_ERR_ADDRESS_MASK, addr_val));
        }
 
        WREG32(glbl_err_cause, cause_val);
index d7a3b3e..476f706 100644 (file)
@@ -13,8 +13,7 @@
 struct hl_device;
 
 /* special blocks */
-#define HL_MAX_NUM_OF_GLBL_ERR_CAUSE           10
-#define HL_GLBL_ERR_ADDRESS_MASK               GENMASK(11, 0)
+#define HL_GLBL_ERR_ADDRESS_MASK       GENMASK(11, 0)
 /* GLBL_ERR_ADDR register offset from the start of the block */
 #define HL_GLBL_ERR_ADDR_OFFSET                0xF44
 /* GLBL_ERR_CAUSE register offset from the start of the block */
index 9f033a7..e6395a8 100644 (file)
 #define RAZWI_INITIATOR_ID_X_Y(xl, yl, xh) \
        (RAZWI_INITIATOR_ID_X_Y_LOW(xl, yl) | RAZWI_INITIATOR_ID_X_HIGH(xh))
 
-#define PSOC_RAZWI_ENG_STR_SIZE 128
-#define PSOC_RAZWI_MAX_ENG_PER_RTR 5
+#define PSOC_RAZWI_ENG_STR_SIZE                        128
+#define PSOC_RAZWI_MAX_ENG_PER_RTR             5
 
 /* HW scrambles only bits 0-25 */
-#define HW_UNSCRAMBLED_BITS_MASK GENMASK_ULL(63, 26)
+#define HW_UNSCRAMBLED_BITS_MASK               GENMASK_ULL(63, 26)
+
+#define GAUDI2_GLBL_ERR_MAX_CAUSE_NUM          17
 
 struct gaudi2_razwi_info {
        u32 axuser_xy;
@@ -3587,7 +3589,7 @@ static int gaudi2_special_blocks_config(struct hl_device *hdev)
        int i, rc;
 
        /* Configure Special blocks */
-       prop->glbl_err_cause_num = GAUDI2_NUM_OF_GLBL_ERR_CAUSE;
+       prop->glbl_err_max_cause_num = GAUDI2_GLBL_ERR_MAX_CAUSE_NUM;
        prop->num_of_special_blocks = ARRAY_SIZE(gaudi2_special_blocks);
        prop->special_blocks = kmalloc_array(prop->num_of_special_blocks,
                        sizeof(*prop->special_blocks), GFP_KERNEL);
index bc508c9..eee4138 100644 (file)
 #define GAUDI2_SOB_INCREMENT_BY_ONE    (FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_VAL_MASK, 1) | \
                                        FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_INC_MASK, 1))
 
-#define GAUDI2_NUM_TESTED_QS (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0)
+#define GAUDI2_NUM_TESTED_QS           (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0)
 
-#define GAUDI2_NUM_OF_GLBL_ERR_CAUSE           8
 
 enum gaudi2_reserved_sob_id {
        GAUDI2_RESERVED_SOB_CS_COMPLETION_FIRST,