drm/amdgpu: add interface to check mca umc status
author YiPeng Chai <YiPeng.Chai@amd.com>
Mon, 15 Jan 2024 02:56:02 +0000 (10:56 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 22 Jan 2024 22:13:25 +0000 (17:13 -0500)
Add interface to check mca umc status.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c

index 666fd8fa39ad5e12adeeb8724ebf96aae0d93bf4..6452c09f22c6b4ee72927a943835d72edacd3442 100644 (file)
 #include "umc/umc_6_7_0_offset.h"
 #include "umc/umc_6_7_0_sh_mask.h"
 
+/*
+ * Ask the UMC RAS backend whether an MCA status value is a deferred error.
+ *
+ * @adev: device whose umc.ras callbacks are consulted
+ * @mc_status: MCA status register value to classify
+ *
+ * Returns the result of the check_ecc_err_status callback queried with
+ * AMDGPU_MCA_ERROR_TYPE_DE, or false when no UMC RAS backend or no such
+ * callback is registered.
+ */
+static bool amdgpu_mca_is_deferred_error(struct amdgpu_device *adev,
+                                       uint64_t mc_status)
+{
+       /* umc.ras is dereferenced below; do not assume it is always set. */
+       if (!adev->umc.ras || !adev->umc.ras->check_ecc_err_status)
+               return false;
+
+       /* The callback takes a void pointer, so pass the local copy's address. */
+       return adev->umc.ras->check_ecc_err_status(adev,
+                       AMDGPU_MCA_ERROR_TYPE_DE, &mc_status);
+}
+
 void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
                                              uint64_t mc_status_addr,
                                              unsigned long *error_count)
@@ -257,7 +267,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
                        amdgpu_ras_error_statistic_ue_count(err_data,
                                &mcm_info, &err_addr, (uint64_t)count);
                else {
-                       if (!!(MCA_REG__STATUS__DEFERRED(entry->regs[MCA_REG_IDX_STATUS])))
+                       if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS]))
                                amdgpu_ras_error_statistic_de_count(err_data,
                                        &mcm_info, &err_addr, (uint64_t)count);
                        else
index b399f1b62887a98432a5d8a9b9ac34e913384004..b964110ed1e05e4f1a55e2659837fe1c3cb601af 100644 (file)
@@ -65,6 +65,7 @@ enum amdgpu_mca_ip {
 enum amdgpu_mca_error_type {
        AMDGPU_MCA_ERROR_TYPE_UE = 0,   /* uncorrectable error */
        AMDGPU_MCA_ERROR_TYPE_CE,       /* correctable error */
+       AMDGPU_MCA_ERROR_TYPE_DE,       /* deferred error */
 };
 
 struct amdgpu_mca_ras_block {
index de2dc18536367f5afc585edaf13ba7cba1fd1073..83199296ed106f79ef7c40a63330885ffc5cf97e 100644 (file)
@@ -21,7 +21,7 @@
 #ifndef __AMDGPU_UMC_H__
 #define __AMDGPU_UMC_H__
 #include "amdgpu_ras.h"
-
+#include "amdgpu_mca.h"
 /*
  * (addr / 256) * 4096, the higher 26 bits in ErrorAddr
  * is the index of 4KB block
@@ -64,6 +64,8 @@ struct amdgpu_umc_ras {
                                      void *ras_error_status);
        void (*ecc_info_query_ras_error_address)(struct amdgpu_device *adev,
                                        void *ras_error_status);
+       bool (*check_ecc_err_status)(struct amdgpu_device *adev,
+                       enum amdgpu_mca_error_type type, void *ras_error_status);
        /* support different eeprom table version for different asic */
        void (*set_eeprom_table_version)(struct amdgpu_ras_eeprom_table_header *hdr);
 };
index d3d3844dc891e75e1508eb3a90103a76b21efe92..5ca73fefe35819d77f9fa7de1ce19c4c0d53eeac 100644 (file)
@@ -422,6 +422,25 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
        }
 }
 
+/*
+ * Test a UMC MCA status value against one requested error type.
+ *
+ * @adev: device handle, forwarded to the umc_v12_0_is_*_error helpers
+ * @type: which error class to test for (UE, CE or DE)
+ * @ras_error_status: read as a pointer to a uint64_t status value; it is
+ *                    dereferenced unconditionally, so it must not be NULL
+ *
+ * Returns what the helper matching @type reports for the status value,
+ * or false for any other @type.
+ */
+static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
+                       enum amdgpu_mca_error_type type, void *ras_error_status)
+{
+       uint64_t mc_umc_status = *(uint64_t *)ras_error_status;
+
+       /* Every arm returns, so nothing is needed after the switch. */
+       switch (type) {
+       case AMDGPU_MCA_ERROR_TYPE_UE:
+               return umc_v12_0_is_uncorrectable_error(adev, mc_umc_status);
+       case AMDGPU_MCA_ERROR_TYPE_CE:
+               return umc_v12_0_is_correctable_error(adev, mc_umc_status);
+       case AMDGPU_MCA_ERROR_TYPE_DE:
+               return umc_v12_0_is_deferred_error(adev, mc_umc_status);
+       default:
+               return false;
+       }
+}
+
 static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev)
 {
        amdgpu_umc_loop_channels(adev,
@@ -507,5 +526,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
        .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
        .ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
        .ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
+       .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
 };
 
index 952a983da49aca5f370914cc84c67b4deefea291..67fc01e0f9c60bd5a03165110129b5bc0528177e 100644 (file)
@@ -2557,9 +2557,9 @@ static int mca_umc_mca_get_err_count(const struct mca_ras_info *mca_ras, struct
                return 0;
        }
 
-       if ((type == AMDGPU_MCA_ERROR_TYPE_UE && umc_v12_0_is_uncorrectable_error(adev, status0)) ||
-           (type == AMDGPU_MCA_ERROR_TYPE_CE && (umc_v12_0_is_correctable_error(adev, status0) ||
-            umc_v12_0_is_deferred_error(adev, status0))))
+       if (umc_v12_0_is_deferred_error(adev, status0) ||
+           umc_v12_0_is_uncorrectable_error(adev, status0) ||
+           umc_v12_0_is_correctable_error(adev, status0))
                *count = 1;
 
        return 0;