drm/amdgpu: MCA supports recording umc address information
author YiPeng Chai <YiPeng.Chai@amd.com>
Tue, 12 Dec 2023 09:26:58 +0000 (17:26 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 19 Dec 2023 19:59:03 +0000 (14:59 -0500)
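MCA supports recording UMC address information: for UMC blocks, the
MCA bank's STATUS, IPID and ADDR register values are saved in the RAS
error info when the error count is logged.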

V2:
  Move err_addr variable from struct ras_err_node to
struct ras_err_info.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c

index 210aea590a52ed7bc87c98776e5fd5107e0e7aea..8911310f98df2e1fb000fe2fc356ddd2632f9e19 100644 (file)
@@ -218,6 +218,7 @@ static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, st
 int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data)
 {
        struct amdgpu_smuio_mcm_config_info mcm_info;
+       struct ras_err_addr err_addr = {0};
        struct mca_bank_set mca_set;
        struct mca_bank_node *node;
        struct mca_bank_entry *entry;
@@ -246,10 +247,18 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
                mcm_info.socket_id = entry->info.socket_id;
                mcm_info.die_id = entry->info.aid;
 
+               if (blk == AMDGPU_RAS_BLOCK__UMC) {
+                       err_addr.err_status = entry->regs[MCA_REG_IDX_STATUS];
+                       err_addr.err_ipid = entry->regs[MCA_REG_IDX_IPID];
+                       err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR];
+               }
+
                if (type == AMDGPU_MCA_ERROR_TYPE_UE)
-                       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, (uint64_t)count);
+                       amdgpu_ras_error_statistic_ue_count(err_data,
+                               &mcm_info, &err_addr, (uint64_t)count);
                else
-                       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, (uint64_t)count);
+                       amdgpu_ras_error_statistic_ce_count(err_data,
+                               &mcm_info, &err_addr, (uint64_t)count);
        }
 
 out_mca_release:
index bacb59d8b701a6813b795ed67d4d8f98be60a784..bad62141f7084a6aeb5851572d8d2091471a419e 100644 (file)
@@ -1156,8 +1156,10 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
                for_each_ras_error(err_node, err_data) {
                        err_info = &err_node->err_info;
 
-                       amdgpu_ras_error_statistic_ce_count(&obj->err_data, &err_info->mcm_info, err_info->ce_count);
-                       amdgpu_ras_error_statistic_ue_count(&obj->err_data, &err_info->mcm_info, err_info->ue_count);
+                       amdgpu_ras_error_statistic_ce_count(&obj->err_data,
+                                       &err_info->mcm_info, NULL, err_info->ce_count);
+                       amdgpu_ras_error_statistic_ue_count(&obj->err_data,
+                                       &err_info->mcm_info, NULL, err_info->ue_count);
                }
        } else {
                /* for legacy asic path which doesn't has error source info */
@@ -3691,7 +3693,8 @@ static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct
 }
 
 static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
-                                                     struct amdgpu_smuio_mcm_config_info *mcm_info)
+                               struct amdgpu_smuio_mcm_config_info *mcm_info,
+                               struct ras_err_addr *err_addr)
 {
        struct ras_err_node *err_node;
 
@@ -3705,6 +3708,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
 
        memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
 
+       if (err_addr)
+               memcpy(&err_node->err_info.err_addr, err_addr, sizeof(*err_addr));
+
        err_data->err_list_count++;
        list_add_tail(&err_node->node, &err_data->err_node_list);
        list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp);
@@ -3713,7 +3719,8 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
 }
 
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
-                                       struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
+               struct amdgpu_smuio_mcm_config_info *mcm_info,
+               struct ras_err_addr *err_addr, u64 count)
 {
        struct ras_err_info *err_info;
 
@@ -3723,7 +3730,7 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
        if (!count)
                return 0;
 
-       err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+       err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
        if (!err_info)
                return -EINVAL;
 
@@ -3734,7 +3741,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
 }
 
 int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
-                                       struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
+               struct amdgpu_smuio_mcm_config_info *mcm_info,
+               struct ras_err_addr *err_addr, u64 count)
 {
        struct ras_err_info *err_info;
 
@@ -3744,7 +3752,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
        if (!count)
                return 0;
 
-       err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+       err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
        if (!err_info)
                return -EINVAL;
 
index 6a941eb8fb8fd77e948e1bc170309e4a1418f2a2..76fb85628716f6302b3c02beb0965c85f2723a05 100644 (file)
@@ -452,10 +452,17 @@ struct ras_fs_data {
        char debugfs_name[32];
 };
 
+struct ras_err_addr {
+       uint64_t err_status;
+       uint64_t err_ipid;
+       uint64_t err_addr;
+};
+
 struct ras_err_info {
        struct amdgpu_smuio_mcm_config_info mcm_info;
        u64 ce_count;
        u64 ue_count;
+       struct ras_err_addr err_addr;
 };
 
 struct ras_err_node {
@@ -806,8 +813,10 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
 int amdgpu_ras_error_data_init(struct ras_err_data *err_data);
 void amdgpu_ras_error_data_fini(struct ras_err_data *err_data);
 int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
-                                       struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count);
+               struct amdgpu_smuio_mcm_config_info *mcm_info,
+               struct ras_err_addr *err_addr, u64 count);
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
-                                       struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count);
+               struct amdgpu_smuio_mcm_config_info *mcm_info,
+               struct ras_err_addr *err_addr, u64 count);
 
 #endif
index 9a95b9f226b85a5c711cd4f21e28dc36021611dd..a6c88f2fe6e5750ea42d153dcfb855046d1a5b25 100644 (file)
@@ -1313,10 +1313,10 @@ static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct a
 
        switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
        case AMDGPU_MCA_ERROR_TYPE_UE:
-               amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL);
+               amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, NULL, 1ULL);
                break;
        case AMDGPU_MCA_ERROR_TYPE_CE:
-               amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL);
+               amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, NULL, 1ULL);
                break;
        default:
                break;
index 00b21ece081f96b4470d4f4ba2163148c71354ea..131cddbdda0dc11716205307e51d72aa72b271bf 100644 (file)
@@ -3828,8 +3828,8 @@ static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev,
        /* the caller should make sure initialize value of
         * err_data->ue_count and err_data->ce_count
         */
-       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
-       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
+       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
+       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
 }
 
 static void gfx_v9_4_3_inst_reset_ras_err_count(struct amdgpu_device *adev,
index 9b0146732e13ced30b38336fc76e0d46922ff77e..fb53aacdcba20f01019a20d63c7bb07d60e1e8d1 100644 (file)
@@ -652,8 +652,8 @@ static void mmhub_v1_8_inst_query_ras_error_count(struct amdgpu_device *adev,
                                        AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
                                        &ue_count);
 
-       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
-       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
+       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
+       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
 }
 
 static void mmhub_v1_8_query_ras_error_count(struct amdgpu_device *adev,
index 0f24af6f28102bc490d6bc2ecdc890294e5f1905..2d688dca26bedba5018bd41c76fb09a65a38cd66 100644 (file)
@@ -2156,7 +2156,7 @@ static void sdma_v4_4_2_inst_query_ras_error_count(struct amdgpu_device *adev,
                                        AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
                                        &ue_count);
 
-       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
+       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
 }
 
 static void sdma_v4_4_2_query_ras_error_count(struct amdgpu_device *adev,
index e9c2ff74f0bc1d6f530a5433b2383072289b0940..8d60c39ae1c551a10a1f895f7300e2f402969bc3 100644 (file)
@@ -166,8 +166,8 @@ static int umc_v12_0_query_error_count(struct amdgpu_device *adev,
        umc_v12_0_query_correctable_error_count(adev, umc_reg_offset, &ce_count);
        umc_v12_0_query_uncorrectable_error_count(adev, umc_reg_offset, &ue_count);
 
-       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
-       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
+       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count);
+       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count);
 
        return 0;
 }