net: hns3: add vf fault detect support
authorJie Wang <wangjie125@huawei.com>
Sat, 7 Oct 2023 03:12:15 +0000 (11:12 +0800)
committerJakub Kicinski <kuba@kernel.org>
Wed, 11 Oct 2023 20:24:55 +0000 (13:24 -0700)
Currently the hns3 driver supports the VF fault detect feature. Several RAS
errors caused by VF resources don't need a PF function reset for recovery.
The driver only needs to reset the specified VF.

So this patch adds a new process to the RAS module. The new process gets
detailed information about the RAS error and takes the most appropriate
measures based on this accurate information.

Signed-off-by: Jie Wang <wangjie125@huawei.com>
Signed-off-by: Jijie Shao <shaojijie@huawei.com>
Link: https://lore.kernel.org/r/20231007031215.1067758-3-shaojijie@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
drivers/net/ethernet/hisilicon/hns3/hnae3.h
drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c

index 46062106fc6a212e50647eb40f9c88bdfd12efb4..d7e175a9cb49b5495b1cc995e238a519a2594251 100644 (file)
@@ -275,6 +275,7 @@ enum hnae3_reset_type {
        HNAE3_GLOBAL_RESET,
        HNAE3_IMP_RESET,
        HNAE3_NONE_RESET,
+       HNAE3_VF_EXP_RESET,
        HNAE3_MAX_RESET,
 };
 
index 92e73d44f0e5b48bb0c60c9c9d8ef3ea6b181451..533c19d25e4f6d0ff57f9d5c7a92ace3ba38045b 100644 (file)
@@ -93,6 +93,7 @@ enum hclge_opcode_type {
        HCLGE_OPC_DFX_SSU_REG_2         = 0x004F,
 
        HCLGE_OPC_QUERY_DEV_SPECS       = 0x0050,
+       HCLGE_OPC_GET_QUEUE_ERR_VF      = 0x0067,
 
        /* MAC command */
        HCLGE_OPC_CONFIG_MAC_MODE       = 0x0301,
index 3f35227ef1fab3e555b684a1d43cec5a8185bacf..d63e114f93d0bb25d0783200d019f1813748cc0e 100644 (file)
@@ -1301,10 +1301,12 @@ static const struct hclge_hw_type_id hclge_hw_type_id_st[] = {
                .msg = "tqp_int_ecc_error"
        }, {
                .type_id = PF_ABNORMAL_INT_ERROR,
-               .msg = "pf_abnormal_int_error"
+               .msg = "pf_abnormal_int_error",
+               .cause_by_vf = true
        }, {
                .type_id = MPF_ABNORMAL_INT_ERROR,
-               .msg = "mpf_abnormal_int_error"
+               .msg = "mpf_abnormal_int_error",
+               .cause_by_vf = true
        }, {
                .type_id = COMMON_ERROR,
                .msg = "common_error"
@@ -2759,7 +2761,7 @@ void hclge_handle_occurred_error(struct hclge_dev *hdev)
                hclge_handle_error_info_log(ae_dev);
 }
 
-static void
+static bool
 hclge_handle_error_type_reg_log(struct device *dev,
                                struct hclge_mod_err_info *mod_info,
                                struct hclge_type_reg_err_info *type_reg_info)
@@ -2770,6 +2772,7 @@ hclge_handle_error_type_reg_log(struct device *dev,
        u8 mod_id, total_module, type_id, total_type, i, is_ras;
        u8 index_module = MODULE_NONE;
        u8 index_type = NONE_ERROR;
+       bool cause_by_vf = false;
 
        mod_id = mod_info->mod_id;
        type_id = type_reg_info->type_id & HCLGE_ERR_TYPE_MASK;
@@ -2788,6 +2791,7 @@ hclge_handle_error_type_reg_log(struct device *dev,
        for (i = 0; i < total_type; i++) {
                if (type_id == hclge_hw_type_id_st[i].type_id) {
                        index_type = i;
+                       cause_by_vf = hclge_hw_type_id_st[i].cause_by_vf;
                        break;
                }
        }
@@ -2805,6 +2809,8 @@ hclge_handle_error_type_reg_log(struct device *dev,
        dev_err(dev, "reg_value:\n");
        for (i = 0; i < type_reg_info->reg_num; i++)
                dev_err(dev, "0x%08x\n", type_reg_info->hclge_reg[i]);
+
+       return cause_by_vf;
 }
 
 static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
@@ -2815,6 +2821,7 @@ static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
        struct device *dev = &hdev->pdev->dev;
        struct hclge_mod_err_info *mod_info;
        struct hclge_sum_err_info *sum_info;
+       bool cause_by_vf = false;
        u8 mod_num, err_num, i;
        u32 offset = 0;
 
@@ -2843,12 +2850,16 @@ static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
 
                        type_reg_info = (struct hclge_type_reg_err_info *)
                                            &buf[offset++];
-                       hclge_handle_error_type_reg_log(dev, mod_info,
-                                                       type_reg_info);
+                       if (hclge_handle_error_type_reg_log(dev, mod_info,
+                                                           type_reg_info))
+                               cause_by_vf = true;
 
                        offset += type_reg_info->reg_num;
                }
        }
+
+       if (hnae3_ae_dev_vf_fault_supported(hdev->ae_dev) && cause_by_vf)
+               set_bit(HNAE3_VF_EXP_RESET, &ae_dev->hw_err_reset_req);
 }
 
 static int hclge_query_all_err_bd_num(struct hclge_dev *hdev, u32 *bd_num)
@@ -2940,3 +2951,98 @@ err_desc:
 out:
        return ret;
 }
+
+/* Recover every VF whose bit is set in @bitmap.
+ *
+ * For each set bit the matching VF vport is looked up, hclge_reset_tqp() is
+ * called on its nic handle and hclge_inform_vf_reset() is called with
+ * HNAE3_VF_FUNC_RESET. A bit is cleared once its function has been handled.
+ *
+ * Return: true if at least one function was handled and no step failed;
+ * false if the lowest set bit is PF_VPORT_ID, if no bit is set, or as soon
+ * as a vport lookup, tqp reset or VF notification fails.
+ */
+static bool hclge_reset_vf_in_bitmap(struct hclge_dev *hdev,
+                                    unsigned long *bitmap)
+{
+       struct hclge_vport *vport;
+       bool exist_set = false;
+       int func_id;
+       int ret;
+
+       func_id = find_first_bit(bitmap, HCLGE_VPORT_NUM);
+       if (func_id == PF_VPORT_ID)
+               return false;
+
+       /* find_first_bit() returns HCLGE_VPORT_NUM when no bit is left */
+       while (func_id != HCLGE_VPORT_NUM) {
+               vport = hclge_get_vf_vport(hdev,
+                                          func_id - HCLGE_VF_VPORT_START_NUM);
+               if (!vport) {
+                       dev_err(&hdev->pdev->dev, "invalid func id(%d)\n",
+                               func_id);
+                       return false;
+               }
+
+               dev_info(&hdev->pdev->dev, "do function %d recovery.\n",
+                        func_id);
+
+               ret = hclge_reset_tqp(&vport->nic);
+               if (ret) {
+                       dev_err(&hdev->pdev->dev,
+                               "failed to reset tqp, ret = %d.\n", ret);
+                       return false;
+               }
+
+               ret = hclge_inform_vf_reset(vport, HNAE3_VF_FUNC_RESET);
+               if (ret) {
+                       dev_err(&hdev->pdev->dev,
+                               "failed to reset func %d, ret = %d.\n",
+                               func_id, ret);
+                       return false;
+               }
+
+               exist_set = true;
+               /* drop the handled bit so the next search moves forward */
+               clear_bit(func_id, bitmap);
+               func_id = find_first_bit(bitmap, HCLGE_VPORT_NUM);
+       }
+
+       return exist_set;
+}
+
+/* Assemble the faulty-function bitmap from a two-descriptor reply.
+ *
+ * The first HCLGE_FIR_FAULT_BYTES bytes of @bitmap are copied from
+ * desc[0].data and the following HCLGE_SEC_FAULT_BYTES bytes from
+ * desc[1].data; together they must cover exactly HCLGE_VPORT_NUM bits.
+ *
+ * NOTE(review): this is a raw byte copy into an unsigned long bitmap;
+ * confirm the resulting bit numbering on big-endian hosts.
+ */
+static void hclge_get_vf_fault_bitmap(struct hclge_desc *desc,
+                                     unsigned long *bitmap)
+{
+#define HCLGE_FIR_FAULT_BYTES  24
+#define HCLGE_SEC_FAULT_BYTES  8
+
+       u8 *dst = (u8 *)bitmap;
+
+       BUILD_BUG_ON(BITS_TO_BYTES(HCLGE_VPORT_NUM) !=
+                    HCLGE_FIR_FAULT_BYTES + HCLGE_SEC_FAULT_BYTES);
+
+       memcpy(dst, desc[0].data, HCLGE_FIR_FAULT_BYTES);
+       memcpy(dst + HCLGE_FIR_FAULT_BYTES, desc[1].data,
+              HCLGE_SEC_FAULT_BYTES);
+}
+
+/* Handle a pending HNAE3_VF_EXP_RESET request.
+ *
+ * The request bit is consumed first. If it was set and the device reports VF
+ * fault support, the faulty-function bitmap is read with
+ * HCLGE_OPC_GET_QUEUE_ERR_VF (two chained descriptors) and passed to
+ * hclge_reset_vf_in_bitmap(). When that reports success, all pending
+ * hw_err_reset_req bits are cleared.
+ *
+ * Return: 0 when there is nothing to do or after the bitmap was processed,
+ * otherwise the error returned by hclge_comm_cmd_send().
+ */
+int hclge_handle_vf_queue_err_ras(struct hclge_dev *hdev)
+{
+       unsigned long vf_fault_bitmap[BITS_TO_LONGS(HCLGE_VPORT_NUM)];
+       struct hnae3_ae_dev *ae_dev = hdev->ae_dev;
+       struct hclge_desc desc[2];
+       int ret;
+
+       /* always consume the request bit, even if the feature is absent */
+       if (!test_and_clear_bit(HNAE3_VF_EXP_RESET,
+                               &ae_dev->hw_err_reset_req))
+               return 0;
+
+       if (!hnae3_ae_dev_vf_fault_supported(ae_dev))
+               return 0;
+
+       hclge_comm_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_GET_QUEUE_ERR_VF,
+                                       true);
+       desc[0].flag |= cpu_to_le16(HCLGE_COMM_CMD_FLAG_NEXT);
+       hclge_comm_cmd_setup_basic_desc(&desc[1], HCLGE_OPC_GET_QUEUE_ERR_VF,
+                                       true);
+
+       ret = hclge_comm_cmd_send(&hdev->hw.hw, desc, 2);
+       if (ret) {
+               dev_err(&hdev->pdev->dev,
+                       "failed to get vf bitmap, ret = %d.\n", ret);
+               return ret;
+       }
+
+       hclge_get_vf_fault_bitmap(desc, vf_fault_bitmap);
+
+       if (hclge_reset_vf_in_bitmap(hdev, vf_fault_bitmap))
+               ae_dev->hw_err_reset_req = 0;
+
+       return 0;
+}
index 86be6fb329901755c0c409ffba3e9fdc2e023c33..68b738affa660a9bbcaa8c75c81d93a215615398 100644 (file)
@@ -196,6 +196,7 @@ struct hclge_hw_module_id {
 struct hclge_hw_type_id {
        enum hclge_err_type_list type_id;
        const char *msg;
+       bool cause_by_vf; /* true if the error may be caused by a VF exception */
 };
 
 struct hclge_sum_err_info {
@@ -228,4 +229,5 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
                               unsigned long *reset_requests);
 int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev);
 int hclge_handle_mac_tnl(struct hclge_dev *hdev);
+int hclge_handle_vf_queue_err_ras(struct hclge_dev *hdev);
 #endif
index c42574e297476bdd1209e4a3090345c9a4e13b3c..99c0576e6383aa3e9d8b318c995a7633c61d0864 100644 (file)
@@ -3424,7 +3424,7 @@ static int hclge_get_status(struct hnae3_handle *handle)
        return hdev->hw.mac.link;
 }
 
-static struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf)
+struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf)
 {
        if (!pci_num_vf(hdev->pdev)) {
                dev_err(&hdev->pdev->dev,
@@ -4468,6 +4468,7 @@ static void hclge_handle_err_recovery(struct hclge_dev *hdev)
        if (hclge_find_error_source(hdev)) {
                hclge_handle_error_info_log(ae_dev);
                hclge_handle_mac_tnl(hdev);
+               hclge_handle_vf_queue_err_ras(hdev);
        }
 
        hclge_handle_err_reset_request(hdev);
index 7bc2049b723daa387aba2a083dc526a2e68083ba..02c7aab3546e78c6ea652770cc1a7e167027c51d 100644 (file)
@@ -1146,4 +1146,6 @@ int hclge_dbg_dump_rst_info(struct hclge_dev *hdev, char *buf, int len);
 int hclge_push_vf_link_status(struct hclge_vport *vport);
 int hclge_enable_vport_vlan_filter(struct hclge_vport *vport, bool request_en);
 int hclge_mac_update_stats(struct hclge_dev *hdev);
+struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf);
+int hclge_inform_vf_reset(struct hclge_vport *vport, u16 reset_type);
 #endif
index 04ff9bf121853ab7a0f876c55b49d1e6d9199035..4b0d07ca2505e3adfeb4afcbbae502457a4ac3db 100644 (file)
@@ -124,7 +124,7 @@ static int hclge_send_mbx_msg(struct hclge_vport *vport, u8 *msg, u16 msg_len,
        return status;
 }
 
-static int hclge_inform_vf_reset(struct hclge_vport *vport, u16 reset_type)
+int hclge_inform_vf_reset(struct hclge_vport *vport, u16 reset_type)
 {
        __le16 msg_data;
        u8 dest_vfid;