drm/amdgpu: retire bad pages for umc v12_0
authorYiPeng Chai <YiPeng.Chai@amd.com>
Mon, 18 Mar 2024 06:24:13 +0000 (14:24 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Fri, 26 Apr 2024 21:22:42 +0000 (17:22 -0400)
Retire bad pages for umc v12_0.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c

index 6c2b61ef5b5716b1546cda0116723a7220eeaafb..21dc51c013d18b649790dcd476a3a2eb930724a6 100644 (file)
@@ -28,6 +28,8 @@
 #include "umc/umc_12_0_0_sh_mask.h"
 #include "mp/mp_13_0_6_sh_mask.h"
 
+#define MAX_ECC_NUM_PER_RETIREMENT  32
+
 static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
                                            uint32_t node_inst,
                                            uint32_t umc_inst,
@@ -374,6 +376,7 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
        return 0;
 }
 
+#ifdef TO_BE_REMOVED
 static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
                                        void *ras_error_status)
 {
@@ -442,6 +445,7 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
                }
        }
 }
+#endif
 
 static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
                        enum amdgpu_mca_error_type type, void *ras_error_status)
@@ -633,6 +637,58 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
        return 0;
 }
 
+static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
+                               struct ras_ecc_err *ecc_err, void *ras_error_status)
+{
+       struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+       uint32_t i = 0;
+       int ret = 0;
+
+       if (!err_data || !ecc_err)
+               return -EINVAL;
+
+       for (i = 0; i < ecc_err->err_pages.count; i++) {
+               ret = amdgpu_umc_fill_error_record(err_data,
+                               ecc_err->addr,
+                               ecc_err->err_pages.pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
+                               MCA_IPID_2_UMC_CH(ecc_err->ipid),
+                               MCA_IPID_2_UMC_INST(ecc_err->ipid));
+               if (ret)
+                       break;
+       }
+
+       err_data->de_count++;
+
+       return ret;
+}
+
+static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
+                                       void *ras_error_status)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
+       struct radix_tree_root *ecc_tree;
+       int new_detected, ret, i;
+
+       ecc_tree = &con->umc_ecc_log.de_page_tree;
+
+       mutex_lock(&con->umc_ecc_log.lock);
+       new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
+                       0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
+       for (i = 0; i < new_detected; i++) {
+               if (!entries[i])
+                       continue;
+
+               ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
+               if (ret) {
+                       dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
+                       break;
+               }
+               radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, UMC_ECC_NEW_DETECTED_TAG);
+       }
+       mutex_unlock(&con->umc_ecc_log.lock);
+}
+
 struct amdgpu_umc_ras umc_v12_0_ras = {
        .ras_block = {
                .hw_ops = &umc_v12_0_ras_hw_ops,
@@ -640,8 +696,7 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
        },
        .err_cnt_init = umc_v12_0_err_cnt_init,
        .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
-       .ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
-       .ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
+       .ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
        .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
        .update_ecc_status = umc_v12_0_update_ecc_status,
 };