drm/amdgpu: Add poison mode check error condition for umc v12_0
authorYiPeng Chai <YiPeng.Chai@amd.com>
Wed, 13 Dec 2023 08:14:40 +0000 (16:14 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 19 Dec 2023 19:59:03 +0000 (14:59 -0500)
Add poison mode check error condition for umc v12_0.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c

index 8d60c39ae1c551a10a1f895f7300e2f402969bc3..8430888760bae3cde5925cf5f4599f37ba7094a3 100644 (file)
@@ -88,16 +88,26 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device *adev)
                umc_v12_0_reset_error_count_per_channel, NULL);
 }
 
-bool umc_v12_0_is_uncorrectable_error(uint64_t mc_umc_status)
+bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
 {
+       if (amdgpu_ras_is_poison_mode_supported(adev) &&
+           (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
+           (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1))
+               return true;
+
        return ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
                (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
                REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
                REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1));
 }
 
-bool umc_v12_0_is_correctable_error(uint64_t mc_umc_status)
+bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
 {
+       if (amdgpu_ras_is_poison_mode_supported(adev) &&
+           (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
+           (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1))
+               return false;
+
        return (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
                (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1 ||
                (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 &&
@@ -105,7 +115,7 @@ bool umc_v12_0_is_correctable_error(uint64_t mc_umc_status)
                /* Identify data parity error in replay mode */
                ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 0x5 ||
                REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 0xb) &&
-               !(umc_v12_0_is_uncorrectable_error(mc_umc_status)))));
+               !(umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)))));
 }
 
 static void umc_v12_0_query_correctable_error_count(struct amdgpu_device *adev,
@@ -124,7 +134,7 @@ static void umc_v12_0_query_correctable_error_count(struct amdgpu_device *adev,
        mc_umc_status =
                RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
 
-       if (umc_v12_0_is_correctable_error(mc_umc_status))
+       if (umc_v12_0_is_correctable_error(adev, mc_umc_status))
                *error_count += 1;
 }
 
@@ -142,7 +152,7 @@ static void umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device *adev
        mc_umc_status =
                RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
 
-       if (umc_v12_0_is_uncorrectable_error(mc_umc_status))
+       if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status))
                *error_count += 1;
 }
 
index b34b1e358f8b823f439cfa437b870726aba6984b..17b4b52d6f1364ba8707e9d3bbfc54f99d81a435 100644 (file)
                (pa) |= (UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) << UMC_V12_0_PA_CH6_BIT); \
        } while (0)
 
-bool umc_v12_0_is_uncorrectable_error(uint64_t mc_umc_status);
-bool umc_v12_0_is_correctable_error(uint64_t mc_umc_status);
+bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
+bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
 
 extern const uint32_t
        umc_v12_0_channel_idx_tbl[]
index 3998c9b31d076bfa6a1d4cc473e49a1ad83078b7..6cbd335a804b9afdb1be146a41b92535dcb234dd 100644 (file)
@@ -2524,9 +2524,9 @@ static int mca_umc_mca_get_err_count(const struct mca_ras_info *mca_ras, struct
                return 0;
        }
 
-       if (type == AMDGPU_MCA_ERROR_TYPE_UE && umc_v12_0_is_uncorrectable_error(status0))
+       if (type == AMDGPU_MCA_ERROR_TYPE_UE && umc_v12_0_is_uncorrectable_error(adev, status0))
                *count = 1;
-       else if (type == AMDGPU_MCA_ERROR_TYPE_CE && umc_v12_0_is_correctable_error(status0))
+       else if (type == AMDGPU_MCA_ERROR_TYPE_CE && umc_v12_0_is_correctable_error(adev, status0))
                *count = 1;
 
        return 0;