Merge branch 'hns3-RAS'
author David S. Miller <davem@davemloft.net>
Tue, 8 Jun 2021 21:43:31 +0000 (14:43 -0700)
committer David S. Miller <davem@davemloft.net>
Tue, 8 Jun 2021 21:43:31 +0000 (14:43 -0700)
Guangbin Huang says:

====================
net: hns3: add RAS compatibility adaptation solution

This patchset adds a RAS compatibility adaptation solution for new devices.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/hisilicon/hns3/hnae3.h
drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h

index dc9b5bc..e564aa3 100644 (file)
@@ -91,6 +91,7 @@ enum HNAE3_DEV_CAP_BITS {
        HNAE3_DEV_SUPPORT_STASH_B,
        HNAE3_DEV_SUPPORT_UDP_TUNNEL_CSUM_B,
        HNAE3_DEV_SUPPORT_PAUSE_B,
+       HNAE3_DEV_SUPPORT_RAS_IMP_B,
        HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B,
        HNAE3_DEV_SUPPORT_PORT_VLAN_BYPASS_B,
        HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B,
@@ -129,6 +130,9 @@ enum HNAE3_DEV_CAP_BITS {
 #define hnae3_dev_phy_imp_supported(hdev) \
        test_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, (hdev)->ae_dev->caps)
 
+#define hnae3_dev_ras_imp_supported(hdev) \
+       test_bit(HNAE3_DEV_SUPPORT_RAS_IMP_B, (hdev)->ae_dev->caps)
+
 #define hnae3_dev_tqp_txrx_indep_supported(hdev) \
        test_bit(HNAE3_DEV_SUPPORT_TQP_TXRX_INDEP_B, (hdev)->ae_dev->caps)
 
index cf1efd2..a0edca8 100644 (file)
@@ -349,6 +349,9 @@ static struct hns3_dbg_cap_info hns3_dbg_cap[] = {
        }, {
                .name = "support imp-controlled PHY",
                .cap_bit = HNAE3_DEV_SUPPORT_PHY_IMP_B,
+       }, {
+               .name = "support imp-controlled RAS",
+               .cap_bit = HNAE3_DEV_SUPPORT_RAS_IMP_B,
        }, {
                .name = "support rxd advanced layout",
                .cap_bit = HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B,
index 8f6ed85..887297e 100644 (file)
@@ -178,7 +178,8 @@ static bool hclge_is_special_opcode(u16 opcode)
                             HCLGE_QUERY_CLEAR_MPF_RAS_INT,
                             HCLGE_QUERY_CLEAR_PF_RAS_INT,
                             HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT,
-                            HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT};
+                            HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT,
+                            HCLGE_QUERY_ALL_ERR_INFO};
        int i;
 
        for (i = 0; i < ARRAY_SIZE(spec_opcode); i++) {
@@ -386,6 +387,8 @@ static void hclge_parse_capability(struct hclge_dev *hdev,
                set_bit(HNAE3_DEV_SUPPORT_PAUSE_B, ae_dev->caps);
        if (hnae3_get_bit(caps, HCLGE_CAP_PHY_IMP_B))
                set_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, ae_dev->caps);
+       if (hnae3_get_bit(caps, HCLGE_CAP_RAS_IMP_B))
+               set_bit(HNAE3_DEV_SUPPORT_RAS_IMP_B, ae_dev->caps);
        if (hnae3_get_bit(caps, HCLGE_CAP_RXD_ADV_LAYOUT_B))
                set_bit(HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B, ae_dev->caps);
        if (hnae3_get_bit(caps, HCLGE_CAP_PORT_VLAN_BYPASS_B)) {
index da78a64..221811a 100644 (file)
@@ -293,6 +293,8 @@ enum hclge_opcode_type {
        HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513,
        HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT      = 0x1514,
        HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT       = 0x1515,
+       HCLGE_QUERY_ALL_ERR_BD_NUM              = 0x1516,
+       HCLGE_QUERY_ALL_ERR_INFO                = 0x1517,
        HCLGE_CONFIG_ROCEE_RAS_INT_EN   = 0x1580,
        HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581,
        HCLGE_ROCEE_PF_RAS_INT_CMD      = 0x1584,
@@ -390,6 +392,7 @@ enum HCLGE_CAP_BITS {
        HCLGE_CAP_HW_PAD_B,
        HCLGE_CAP_STASH_B,
        HCLGE_CAP_UDP_TUNNEL_CSUM_B,
+       HCLGE_CAP_RAS_IMP_B = 12,
        HCLGE_CAP_FEC_B = 13,
        HCLGE_CAP_PAUSE_B = 14,
        HCLGE_CAP_RXD_ADV_LAYOUT_B = 15,
index f125aa4..bad9fda 100644 (file)
@@ -631,6 +631,134 @@ static const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int[] = {
        { /* sentinel */ }
 };
 
+/* Module id -> name table; the name is what gets printed in the error log. */
+static const struct hclge_hw_module_id hclge_hw_module_id_st[] = {
+       { .module_id = MODULE_NONE, .msg = "MODULE_NONE" },
+       { .module_id = MODULE_BIOS_COMMON, .msg = "MODULE_BIOS_COMMON" },
+       { .module_id = MODULE_GE, .msg = "MODULE_GE" },
+       { .module_id = MODULE_IGU_EGU, .msg = "MODULE_IGU_EGU" },
+       { .module_id = MODULE_LGE, .msg = "MODULE_LGE" },
+       { .module_id = MODULE_NCSI, .msg = "MODULE_NCSI" },
+       { .module_id = MODULE_PPP, .msg = "MODULE_PPP" },
+       { .module_id = MODULE_QCN, .msg = "MODULE_QCN" },
+       { .module_id = MODULE_RCB_RX, .msg = "MODULE_RCB_RX" },
+       { .module_id = MODULE_RTC, .msg = "MODULE_RTC" },
+       { .module_id = MODULE_SSU, .msg = "MODULE_SSU" },
+       { .module_id = MODULE_TM, .msg = "MODULE_TM" },
+       { .module_id = MODULE_RCB_TX, .msg = "MODULE_RCB_TX" },
+       { .module_id = MODULE_TXDMA, .msg = "MODULE_TXDMA" },
+       { .module_id = MODULE_MASTER, .msg = "MODULE_MASTER" },
+       { .module_id = MODULE_ROCEE_TOP, .msg = "MODULE_ROCEE_TOP" },
+       { .module_id = MODULE_ROCEE_TIMER, .msg = "MODULE_ROCEE_TIMER" },
+       { .module_id = MODULE_ROCEE_MDB, .msg = "MODULE_ROCEE_MDB" },
+       { .module_id = MODULE_ROCEE_TSP, .msg = "MODULE_ROCEE_TSP" },
+       { .module_id = MODULE_ROCEE_TRP, .msg = "MODULE_ROCEE_TRP" },
+       { .module_id = MODULE_ROCEE_SCC, .msg = "MODULE_ROCEE_SCC" },
+       { .module_id = MODULE_ROCEE_CAEP, .msg = "MODULE_ROCEE_CAEP" },
+       { .module_id = MODULE_ROCEE_GEN_AC, .msg = "MODULE_ROCEE_GEN_AC" },
+       { .module_id = MODULE_ROCEE_QMM, .msg = "MODULE_ROCEE_QMM" },
+       { .module_id = MODULE_ROCEE_LSAN, .msg = "MODULE_ROCEE_LSAN" },
+};
+
+/* Error type id -> name table; the name is what gets printed in the error
+ * log.
+ */
+static const struct hclge_hw_type_id hclge_hw_type_id_st[] = {
+       { .type_id = NONE_ERROR, .msg = "none_error" },
+       { .type_id = FIFO_ERROR, .msg = "fifo_error" },
+       { .type_id = MEMORY_ERROR, .msg = "memory_error" },
+       { .type_id = POISON_ERROR, .msg = "poison_error" },
+       { .type_id = MSIX_ECC_ERROR, .msg = "msix_ecc_error" },
+       { .type_id = TQP_INT_ECC_ERROR, .msg = "tqp_int_ecc_error" },
+       { .type_id = PF_ABNORMAL_INT_ERROR, .msg = "pf_abnormal_int_error" },
+       { .type_id = MPF_ABNORMAL_INT_ERROR, .msg = "mpf_abnormal_int_error" },
+       { .type_id = COMMON_ERROR, .msg = "common_error" },
+       { .type_id = PORT_ERROR, .msg = "port_error" },
+       { .type_id = ETS_ERROR, .msg = "ets_error" },
+       { .type_id = NCSI_ERROR, .msg = "ncsi_error" },
+       { .type_id = GLB_ERROR, .msg = "glb_error" },
+       { .type_id = ROCEE_NORMAL_ERR, .msg = "rocee_normal_error" },
+       { .type_id = ROCEE_OVF_ERR, .msg = "rocee_ovf_error" },
+};
+
 static void hclge_log_error(struct device *dev, char *reg,
                            const struct hclge_hw_error *err,
                            u32 err_sts, unsigned long *reset_requests)
@@ -1611,11 +1739,27 @@ static const struct hclge_hw_blk hw_blk[] = {
        { /* sentinel */ }
 };
 
+/* Set or clear the HCLGE_VECTOR0_ALL_MSIX_ERR_B bit of
+ * HCLGE_PF_OTHER_INT_REG with a read-modify-write, leaving the other bits
+ * of the register as they were read.
+ */
+static void hclge_config_all_msix_error(struct hclge_dev *hdev, bool enable)
+{
+       const u32 err_bit = BIT(HCLGE_VECTOR0_ALL_MSIX_ERR_B);
+       u32 val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG);
+
+       val = enable ? (val | err_bit) : (val & ~err_bit);
+
+       hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG, val);
+}
+
 int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state)
 {
        const struct hclge_hw_blk *module = hw_blk;
        int ret = 0;
 
+       hclge_config_all_msix_error(hdev, state);
+
        while (module->name) {
                if (module->config_err_int) {
                        ret = module->config_err_int(hdev, state);
@@ -1876,11 +2020,8 @@ static int hclge_handle_pf_msix_error(struct hclge_dev *hdev,
 static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev,
                                          unsigned long *reset_requests)
 {
-       struct hclge_mac_tnl_stats mac_tnl_stats;
-       struct device *dev = &hdev->pdev->dev;
        u32 mpf_bd_num, pf_bd_num, bd_num;
        struct hclge_desc *desc;
-       u32 status;
        int ret;
 
        /* query the number of bds for the MSIx int status */
@@ -1903,29 +2044,7 @@ static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev,
        if (ret)
                goto msi_error;
 
-       /* query and clear mac tnl interruptions */
-       hclge_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_QUERY_MAC_TNL_INT,
-                                  true);
-       ret = hclge_cmd_send(&hdev->hw, &desc[0], 1);
-       if (ret) {
-               dev_err(dev, "query mac tnl int cmd failed (%d)\n", ret);
-               goto msi_error;
-       }
-
-       status = le32_to_cpu(desc->data[0]);
-       if (status) {
-               /* When mac tnl interrupt occurs, we record current time and
-                * register status here in a fifo, then clear the status. So
-                * that if link status changes suddenly at some time, we can
-                * query them by debugfs.
-                */
-               mac_tnl_stats.time = local_clock();
-               mac_tnl_stats.status = status;
-               kfifo_put(&hdev->mac_tnl_log, mac_tnl_stats);
-               ret = hclge_clear_mac_tnl_int(hdev);
-               if (ret)
-                       dev_err(dev, "clear mac tnl int failed (%d)\n", ret);
-       }
+       ret = hclge_handle_mac_tnl(hdev);
 
 msi_error:
        kfree(desc);
@@ -1947,10 +2066,43 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
        return hclge_handle_all_hw_msix_error(hdev, reset_requests);
 }
 
-void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev)
+int hclge_handle_mac_tnl(struct hclge_dev *hdev)
 {
-#define HCLGE_DESC_NO_DATA_LEN 8
+       struct hclge_mac_tnl_stats mac_tnl_stats;
+       struct device *dev = &hdev->pdev->dev;
+       struct hclge_desc desc;
+       u32 status;
+       int ret;
+
+       /* query and clear mac tnl interruptions */
+       hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_MAC_TNL_INT, true);
+       ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+       if (ret) {
+               dev_err(dev, "failed to query mac tnl int, ret = %d.\n", ret);
+               return ret;
+       }
 
+       status = le32_to_cpu(desc.data[0]);
+       if (status) {
+               /* When mac tnl interrupt occurs, we record current time and
+                * register status here in a fifo, then clear the status. So
+                * that if link status changes suddenly at some time, we can
+                * query them by debugfs.
+                */
+               mac_tnl_stats.time = local_clock();
+               mac_tnl_stats.status = status;
+               kfifo_put(&hdev->mac_tnl_log, mac_tnl_stats);
+               ret = hclge_clear_mac_tnl_int(hdev);
+               if (ret)
+                       dev_err(dev, "failed to clear mac tnl int, ret = %d.\n",
+                               ret);
+       }
+
+       return ret;
+}
+
+void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev)
+{
        struct hclge_dev *hdev = ae_dev->priv;
        struct device *dev = &hdev->pdev->dev;
        u32 mpf_bd_num, pf_bd_num, bd_num;
@@ -1999,3 +2151,205 @@ void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev)
 msi_error:
        kfree(desc);
 }
+
+/* Report whether an error is pending: true when either the misc vector
+ * status register has a bit of HCLGE_VECTOR0_REG_MSIX_MASK set or the RAS
+ * status register has a bit of HCLGE_RAS_REG_ERR_MASK set.
+ */
+bool hclge_find_error_source(struct hclge_dev *hdev)
+{
+       u32 msix_sts = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
+       u32 ras_sts = hclge_read_dev(&hdev->hw,
+                                    HCLGE_RAS_PF_OTHER_INT_STS_REG);
+
+       if (msix_sts & HCLGE_VECTOR0_REG_MSIX_MASK)
+               return true;
+
+       return !!(ras_sts & HCLGE_RAS_REG_ERR_MASK);
+}
+
+/* Query and log the error records, but only when an error source is
+ * currently flagged in the status registers.
+ */
+void hclge_handle_occurred_error(struct hclge_dev *hdev)
+{
+       struct hnae3_ae_dev *ae_dev;
+
+       if (!hclge_find_error_source(hdev))
+               return;
+
+       ae_dev = pci_get_drvdata(hdev->pdev);
+       hclge_handle_error_info_log(ae_dev);
+}
+
+/* Log one error record: resolve the module and error type names from the
+ * lookup tables, print them, then dump the reg_num register values that
+ * belong to the record.
+ */
+static void
+hclge_handle_error_type_reg_log(struct device *dev,
+                               struct hclge_mod_err_info *mod_info,
+                               struct hclge_type_reg_err_info *type_reg_info)
+{
+#define HCLGE_ERR_TYPE_MASK 0x7F
+#define HCLGE_ERR_TYPE_IS_RAS_OFFSET 7
+
+       u8 mod_id = mod_info->mod_id;
+       /* low 7 bits carry the error type, the top bit tells ras from msix */
+       u8 type_id = type_reg_info->type_id & HCLGE_ERR_TYPE_MASK;
+       u8 is_ras = type_reg_info->type_id >> HCLGE_ERR_TYPE_IS_RAS_OFFSET;
+       u8 mod_idx = MODULE_NONE;
+       u8 type_idx = NONE_ERROR;
+       u32 k;
+
+       /* Entry 0 of both tables is the "none" entry, so a lookup that
+        * fails and a lookup that hits entry 0 are both reported as unknown.
+        */
+       for (k = 0; k < ARRAY_SIZE(hclge_hw_module_id_st); k++)
+               if (hclge_hw_module_id_st[k].module_id == mod_id)
+                       break;
+       if (k < ARRAY_SIZE(hclge_hw_module_id_st))
+               mod_idx = k;
+
+       for (k = 0; k < ARRAY_SIZE(hclge_hw_type_id_st); k++)
+               if (hclge_hw_type_id_st[k].type_id == type_id)
+                       break;
+       if (k < ARRAY_SIZE(hclge_hw_type_id_st))
+               type_idx = k;
+
+       if (mod_idx == MODULE_NONE || type_idx == NONE_ERROR)
+               dev_err(dev,
+                       "unknown module[%u] or type[%u].\n", mod_id, type_id);
+       else
+               dev_err(dev,
+                       "found %s %s, is %s error.\n",
+                       hclge_hw_module_id_st[mod_idx].msg,
+                       hclge_hw_type_id_st[type_idx].msg,
+                       is_ras ? "ras" : "msix");
+
+       dev_err(dev, "reg_value:\n");
+       for (k = 0; k < type_reg_info->reg_num; k++)
+               dev_err(dev, "0x%08x\n", type_reg_info->hclge_reg[k]);
+}
+
+/* Walk the error buffer and log every record in it.
+ *
+ * @ae_dev:   device whose hw_err_reset_req collects the requested reset
+ * @buf:      error data, already converted to CPU byte order
+ * @buf_size: number of u32 words in @buf
+ *
+ * Layout, in u32 words: one hclge_sum_err_info, then mod_num module
+ * records. Each module record is one hclge_mod_err_info followed by
+ * err_num error records, and each error record is one header word followed
+ * by reg_num register words.
+ *
+ * Parsing stops with an error message as soon as a record would start or
+ * extend beyond @buf_size, so a bad count in the buffer cannot make the
+ * register dump read past the end of @buf.
+ */
+static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
+                                         const u32 *buf, u32 buf_size)
+{
+       struct hclge_type_reg_err_info *type_reg_info;
+       struct hclge_dev *hdev = ae_dev->priv;
+       struct device *dev = &hdev->pdev->dev;
+       struct hclge_mod_err_info *mod_info;
+       struct hclge_sum_err_info *sum_info;
+       u8 mod_num, err_num, i;
+       u32 offset = 0;
+
+       sum_info = (struct hclge_sum_err_info *)&buf[offset++];
+       if (sum_info->reset_type &&
+           sum_info->reset_type != HNAE3_NONE_RESET)
+               set_bit(sum_info->reset_type, &ae_dev->hw_err_reset_req);
+       mod_num = sum_info->mod_num;
+
+       while (mod_num--) {
+               if (offset >= buf_size) {
+                       dev_err(dev, "The offset(%u) exceeds buf's size(%u).\n",
+                               offset, buf_size);
+                       return;
+               }
+               mod_info = (struct hclge_mod_err_info *)&buf[offset++];
+               err_num = mod_info->err_num;
+
+               for (i = 0; i < err_num; i++) {
+                       if (offset >= buf_size) {
+                               dev_err(dev,
+                                       "The offset(%u) exceeds buf size(%u).\n",
+                                       offset, buf_size);
+                               return;
+                       }
+
+                       type_reg_info = (struct hclge_type_reg_err_info *)
+                                           &buf[offset++];
+
+                       /* offset <= buf_size here, so the subtraction cannot
+                        * wrap; refuse records whose registers do not fit.
+                        */
+                       if (type_reg_info->reg_num > buf_size - offset) {
+                               dev_err(dev,
+                                       "The reg_num(%u) at offset(%u) exceeds buf size(%u).\n",
+                                       type_reg_info->reg_num, offset,
+                                       buf_size);
+                               return;
+                       }
+
+                       hclge_handle_error_type_reg_log(dev, mod_info,
+                                                       type_reg_info);
+
+                       offset += type_reg_info->reg_num;
+               }
+       }
+}
+
+/* Query how many descriptors are needed to read all error info.
+ *
+ * The count is stored in *bd_num. Returns 0 on success, the command error
+ * if the query fails, or -EINVAL when the reported count is 0.
+ */
+static int hclge_query_all_err_bd_num(struct hclge_dev *hdev, u32 *bd_num)
+{
+       struct hclge_desc bd_desc;
+       int rc;
+
+       hclge_cmd_setup_basic_desc(&bd_desc, HCLGE_QUERY_ALL_ERR_BD_NUM, true);
+       rc = hclge_cmd_send(&hdev->hw, &bd_desc, 1);
+       if (rc) {
+               dev_err(&hdev->pdev->dev,
+                       "failed to query error bd_num, ret = %d.\n", rc);
+               return rc;
+       }
+
+       *bd_num = le32_to_cpu(bd_desc.data[0]);
+       if (*bd_num)
+               return 0;
+
+       dev_err(&hdev->pdev->dev, "The value of bd_num is 0!\n");
+       return -EINVAL;
+}
+
+/* Send the HCLGE_QUERY_ALL_ERR_INFO command over bd_num descriptors.
+ *
+ * Returns 0 on success or the error from hclge_cmd_send(), which is also
+ * logged.
+ */
+static int hclge_query_all_err_info(struct hclge_dev *hdev,
+                                   struct hclge_desc *desc, u32 bd_num)
+{
+       int rc;
+
+       hclge_cmd_setup_basic_desc(desc, HCLGE_QUERY_ALL_ERR_INFO, true);
+       rc = hclge_cmd_send(&hdev->hw, desc, bd_num);
+       if (!rc)
+               return 0;
+
+       dev_err(&hdev->pdev->dev,
+               "failed to query error info, ret = %d.\n", rc);
+       return rc;
+}
+
+/* Query all pending error records and log them.
+ *
+ * The descriptor count is queried first, then all descriptors are fetched
+ * with a single command. The data starting at desc[0].data[0] (the total
+ * descriptor size minus HCLGE_DESC_NO_DATA_LEN bytes) is converted from
+ * little endian into a u32 buffer and handed to
+ * hclge_handle_error_module_log().
+ *
+ * Return: 0 on success, -ENOMEM if a buffer cannot be allocated, or the
+ * error returned by one of the query commands.
+ */
+int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev)
+{
+       u32 bd_num, desc_len, buf_len, buf_size, i;
+       struct hclge_dev *hdev = ae_dev->priv;
+       struct hclge_desc *desc;
+       __le32 *desc_data;
+       u32 *buf;
+       int ret;
+
+       ret = hclge_query_all_err_bd_num(hdev, &bd_num);
+       if (ret)
+               goto out;
+
+       desc_len = bd_num * sizeof(struct hclge_desc);
+       desc = kzalloc(desc_len, GFP_KERNEL);
+       if (!desc) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ret = hclge_query_all_err_info(hdev, desc, bd_num);
+       if (ret)
+               goto err_desc;
+
+       buf_len = bd_num * sizeof(struct hclge_desc) - HCLGE_DESC_NO_DATA_LEN;
+       buf_size = buf_len / sizeof(u32);
+
+       desc_data = kzalloc(buf_len, GFP_KERNEL);
+       if (!desc_data) {
+               /* unwind through err_desc so that desc is not leaked */
+               ret = -ENOMEM;
+               goto err_desc;
+       }
+
+       buf = kzalloc(buf_len, GFP_KERNEL);
+       if (!buf) {
+               ret = -ENOMEM;
+               goto err_buf_alloc;
+       }
+
+       memcpy(desc_data, &desc[0].data[0], buf_len);
+       for (i = 0; i < buf_size; i++)
+               buf[i] = le32_to_cpu(desc_data[i]);
+
+       hclge_handle_error_module_log(ae_dev, buf, buf_size);
+       kfree(buf);
+
+err_buf_alloc:
+       kfree(desc_data);
+err_desc:
+       kfree(desc);
+out:
+       return ret;
+}
index d647f3c..07987fb 100644 (file)
@@ -15,6 +15,8 @@
 #define HCLGE_RAS_PF_OTHER_INT_STS_REG   0x20B00
 #define HCLGE_RAS_REG_NFE_MASK   0xFF00
 #define HCLGE_RAS_REG_ROCEE_ERR_MASK   0x3000000
+#define HCLGE_RAS_REG_ERR_MASK \
+       (HCLGE_RAS_REG_NFE_MASK | HCLGE_RAS_REG_ROCEE_ERR_MASK)
 
 #define HCLGE_VECTOR0_REG_MSIX_MASK   0x1FF00
 
 #define HCLGE_ROCEE_OVF_ERR_INT_MASK           0x10000
 #define HCLGE_ROCEE_OVF_ERR_TYPE_MASK          0x3F
 
+#define HCLGE_DESC_DATA_MAX                    8
+#define HCLGE_REG_NUM_MAX                      256
+#define HCLGE_DESC_NO_DATA_LEN                 8
+
 enum hclge_err_int_type {
        HCLGE_ERR_INT_MSIX = 0,
        HCLGE_ERR_INT_RAS_CE = 1,
@@ -114,6 +120,56 @@ enum hclge_err_int_type {
        HCLGE_ERR_INT_RAS_FE = 3,
 };
 
+enum hclge_mod_name_list {     /* ids compared with hclge_mod_err_info.mod_id */
+       MODULE_NONE             = 0,
+       MODULE_BIOS_COMMON      = 1,
+       MODULE_GE               = 2,
+       MODULE_IGU_EGU          = 3,
+       MODULE_LGE              = 4,
+       MODULE_NCSI             = 5,
+       MODULE_PPP              = 6,
+       MODULE_QCN              = 7,
+       MODULE_RCB_RX           = 8,
+       MODULE_RTC              = 9,
+       MODULE_SSU              = 10,
+       MODULE_TM               = 11,
+       MODULE_RCB_TX           = 12,
+       MODULE_TXDMA            = 13,
+       MODULE_MASTER           = 14,
+       /* add new MODULE NAME for NIC here in order */
+       MODULE_ROCEE_TOP        = 40,
+       MODULE_ROCEE_TIMER      = 41,
+       MODULE_ROCEE_MDB        = 42,
+       MODULE_ROCEE_TSP        = 43,
+       MODULE_ROCEE_TRP        = 44,
+       MODULE_ROCEE_SCC        = 45,
+       MODULE_ROCEE_CAEP       = 46,
+       MODULE_ROCEE_GEN_AC     = 47,
+       MODULE_ROCEE_QMM        = 48,
+       MODULE_ROCEE_LSAN       = 49,
+       /* add new MODULE NAME for RoCEE here in order */
+};
+
+enum hclge_err_type_list {     /* ids compared with the low 7 bits of type_id */
+       NONE_ERROR              = 0,
+       FIFO_ERROR              = 1,
+       MEMORY_ERROR            = 2,
+       POISON_ERROR            = 3,
+       MSIX_ECC_ERROR          = 4,
+       TQP_INT_ECC_ERROR       = 5,
+       PF_ABNORMAL_INT_ERROR   = 6,
+       MPF_ABNORMAL_INT_ERROR  = 7,
+       COMMON_ERROR            = 8,
+       PORT_ERROR              = 9,
+       ETS_ERROR               = 10,
+       NCSI_ERROR              = 11,
+       GLB_ERROR               = 12,
+       /* add new ERROR TYPE for NIC here in order */
+       ROCEE_NORMAL_ERR        = 40,
+       ROCEE_OVF_ERR           = 41,
+       /* add new ERROR TYPE for ROCEE here in order */
+};
+
 struct hclge_hw_blk {
        u32 msk;
        const char *name;
@@ -126,11 +182,44 @@ struct hclge_hw_error {
        enum hnae3_reset_type reset_level;
 };
 
+struct hclge_hw_module_id {    /* one row of the module id -> name table */
+       enum hclge_mod_name_list module_id;
+       const char *msg;        /* name printed in the error log */
+};
+
+struct hclge_hw_type_id {      /* one row of the error type id -> name table */
+       enum hclge_err_type_list type_id;
+       const char *msg;        /* name printed in the error log */
+};
+
+struct hclge_sum_err_info {    /* first u32 word of the error buffer */
+       u8 reset_type;  /* set as a bit in hw_err_reset_req unless 0 or HNAE3_NONE_RESET */
+       u8 mod_num;     /* number of hclge_mod_err_info records that follow */
+       u8 rsv[2];
+};
+
+struct hclge_mod_err_info {    /* one u32 word heading a module's records */
+       u8 mod_id;      /* looked up in hclge_hw_module_id_st for the log */
+       u8 err_num;     /* number of hclge_type_reg_err_info records that follow */
+       u8 rsv[2];
+};
+
+struct hclge_type_reg_err_info {       /* one error record: header + registers */
+       u8 type_id;     /* bits 0-6: error type; bit 7: set for ras, clear for msix */
+       u8 reg_num;     /* number of hclge_reg[] words that belong to this record */
+       u8 rsv[2];
+       u32 hclge_reg[HCLGE_REG_NUM_MAX];
+};
+
 int hclge_config_mac_tnl_int(struct hclge_dev *hdev, bool en);
 int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state);
 int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en);
 void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev);
+bool hclge_find_error_source(struct hclge_dev *hdev);
+void hclge_handle_occurred_error(struct hclge_dev *hdev);
 pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev);
 int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
                               unsigned long *reset_requests);
+int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev);
+int hclge_handle_mac_tnl(struct hclge_dev *hdev);
 #endif
index 4510268..d960e08 100644 (file)
@@ -3307,11 +3307,13 @@ static int hclge_set_vf_link_state(struct hnae3_handle *handle, int vf,
 
 static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 {
-       u32 cmdq_src_reg, msix_src_reg;
+       u32 cmdq_src_reg, msix_src_reg, hw_err_src_reg;
 
        /* fetch the events from their corresponding regs */
        cmdq_src_reg = hclge_read_dev(&hdev->hw, HCLGE_VECTOR0_CMDQ_SRC_REG);
        msix_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
+       hw_err_src_reg = hclge_read_dev(&hdev->hw,
+                                       HCLGE_RAS_PF_OTHER_INT_STS_REG);
 
        /* Assumption: If by any chance reset and mailbox events are reported
         * together then we will only process reset event in this go and will
@@ -3339,11 +3341,10 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
                return HCLGE_VECTOR0_EVENT_RST;
        }
 
-       /* check for vector0 msix event source */
-       if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK) {
-               *clearval = msix_src_reg;
+       /* check for vector0 msix event and hardware error event source */
+       if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK ||
+           hw_err_src_reg & HCLGE_RAS_REG_ERR_MASK)
                return HCLGE_VECTOR0_EVENT_ERR;
-       }
 
        /* check for vector0 mailbox(=CMDQ RX) event source */
        if (BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B) & cmdq_src_reg) {
@@ -3354,9 +3355,8 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 
        /* print other vector0 event source */
        dev_info(&hdev->pdev->dev,
-                "CMDQ INT status:0x%x, other INT status:0x%x\n",
-                cmdq_src_reg, msix_src_reg);
-       *clearval = msix_src_reg;
+                "INT status: CMDQ(%#x) HW errors(%#x) other(%#x)\n",
+                cmdq_src_reg, hw_err_src_reg, msix_src_reg);
 
        return HCLGE_VECTOR0_EVENT_OTHER;
 }
@@ -3427,15 +3427,10 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data)
 
        hclge_clear_event_cause(hdev, event_cause, clearval);
 
-       /* Enable interrupt if it is not cause by reset. And when
-        * clearval equal to 0, it means interrupt status may be
-        * cleared by hardware before driver reads status register.
-        * For this case, vector0 interrupt also should be enabled.
-        */
-       if (!clearval ||
-           event_cause == HCLGE_VECTOR0_EVENT_MBX) {
+       /* Enable interrupt if it is not caused by reset event or error event */
+       if (event_cause == HCLGE_VECTOR0_EVENT_MBX ||
+           event_cause == HCLGE_VECTOR0_EVENT_OTHER)
                hclge_enable_vector(&hdev->misc_vector, true);
-       }
 
        return IRQ_HANDLED;
 }
@@ -4240,6 +4235,38 @@ static void hclge_reset_subtask(struct hclge_dev *hdev)
        hdev->reset_type = HNAE3_NONE_RESET;
 }
 
+/* Turn the reset bits collected in hw_err_reset_req into a default reset
+ * request, raise the reset event if one is pending, and re-enable the misc
+ * vector now that error handling is done.
+ */
+static void hclge_handle_err_reset_request(struct hclge_dev *hdev)
+{
+       struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
+
+       if (ae_dev->hw_err_reset_req) {
+               enum hnae3_reset_type level;
+
+               level = hclge_get_reset_level(ae_dev,
+                                             &ae_dev->hw_err_reset_req);
+               hclge_set_def_reset_request(ae_dev, level);
+       }
+
+       if (hdev->default_reset_request) {
+               if (ae_dev->ops->reset_event)
+                       ae_dev->ops->reset_event(hdev->pdev, NULL);
+       }
+
+       /* enable interrupt after error handling complete */
+       hclge_enable_vector(&hdev->misc_vector, true);
+}
+
+/* Error recovery path used when the device supports imp-controlled RAS:
+ * clear the pending reset request bits, log the error records and handle
+ * the mac tnl interrupt if an error source is flagged, then process any
+ * reset request that the logging produced.
+ */
+static void hclge_handle_err_recovery(struct hclge_dev *hdev)
+{
+       struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
+       bool err_pending;
+
+       ae_dev->hw_err_reset_req = 0;
+
+       err_pending = hclge_find_error_source(hdev);
+       if (err_pending) {
+               hclge_handle_error_info_log(ae_dev);
+               hclge_handle_mac_tnl(hdev);
+       }
+
+       hclge_handle_err_reset_request(hdev);
+}
+
 static void hclge_misc_err_recovery(struct hclge_dev *hdev)
 {
        struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
@@ -4247,19 +4274,16 @@ static void hclge_misc_err_recovery(struct hclge_dev *hdev)
        u32 msix_sts_reg;
 
        msix_sts_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
-
        if (msix_sts_reg & HCLGE_VECTOR0_REG_MSIX_MASK) {
-               if (hclge_handle_hw_msix_error(hdev,
-                                              &hdev->default_reset_request))
+               if (hclge_handle_hw_msix_error
+                               (hdev, &hdev->default_reset_request))
                        dev_info(dev, "received msix interrupt 0x%x\n",
                                 msix_sts_reg);
-
-               if (hdev->default_reset_request)
-                       if (ae_dev->ops->reset_event)
-                               ae_dev->ops->reset_event(hdev->pdev, NULL);
        }
 
-       hclge_enable_vector(&hdev->misc_vector, true);
+       hclge_handle_hw_ras_error(ae_dev);
+
+       hclge_handle_err_reset_request(hdev);
 }
 
 static void hclge_errhand_service_task(struct hclge_dev *hdev)
@@ -4267,7 +4291,10 @@ static void hclge_errhand_service_task(struct hclge_dev *hdev)
        if (!test_and_clear_bit(HCLGE_STATE_ERR_SERVICE_SCHED, &hdev->state))
                return;
 
-       hclge_misc_err_recovery(hdev);
+       if (hnae3_dev_ras_imp_supported(hdev))
+               hclge_handle_err_recovery(hdev);
+       else
+               hclge_misc_err_recovery(hdev);
 }
 
 static void hclge_reset_service_task(struct hclge_dev *hdev)
@@ -11524,7 +11551,10 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
        hclge_clear_resetting_state(hdev);
 
        /* Log and clear the hw errors those already occurred */
-       hclge_handle_all_hns_hw_errors(ae_dev);
+       if (hnae3_dev_ras_imp_supported(hdev))
+               hclge_handle_occurred_error(hdev);
+       else
+               hclge_handle_all_hns_hw_errors(ae_dev);
 
        /* request delayed reset for the error recovery because an immediate
         * global reset on a PF affecting pending initialization of other PFs
@@ -11877,7 +11907,10 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
        }
 
        /* Log and clear the hw errors those already occurred */
-       hclge_handle_all_hns_hw_errors(ae_dev);
+       if (hnae3_dev_ras_imp_supported(hdev))
+               hclge_handle_occurred_error(hdev);
+       else
+               hclge_handle_all_hns_hw_errors(ae_dev);
 
        /* Re-enable the hw error interrupts because
         * the interrupts get disabled on global reset.
index 9b8abb5..582972a 100644 (file)
@@ -190,6 +190,7 @@ enum HLCGE_PORT_TYPE {
 #define HCLGE_VECTOR0_IMP_RESET_INT_B  1
 #define HCLGE_VECTOR0_IMP_CMDQ_ERR_B   4U
 #define HCLGE_VECTOR0_IMP_RD_POISON_B  5U
+#define HCLGE_VECTOR0_ALL_MSIX_ERR_B   6U
 
 #define HCLGE_MAC_DEFAULT_FRAME \
        (ETH_HLEN + ETH_FCS_LEN + 2 * VLAN_HLEN + ETH_DATA_LEN)