drm/amdgpu/pm: support mca_ceumc_addr in ecctable
author Stanley.Yang <Stanley.Yang@amd.com>
Fri, 20 May 2022 10:22:21 +0000 (18:22 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 3 Jun 2022 20:43:36 +0000 (16:43 -0400)
The SMU adds a new variable, mca_ceumc_addr, to record the
UMC correctable error address in the EccInfo table; the
driver side adds EccInfo_V2_t to support this feature.

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h
drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c

index b9a6fac2b8b205ad634138d96dbbd14ff08ee908..28e603243b6729be6938b5c22fd88aa5379d6f18 100644 (file)
@@ -328,6 +328,7 @@ struct ecc_info_per_ch {
        uint16_t ce_count_hi_chip;
        uint64_t mca_umc_status;
        uint64_t mca_umc_addr;
+       uint64_t mca_ceumc_addr;
 };
 
 struct umc_ecc_info {
index 0f67c56c2863abcd6161ba679954565068b8346c..6f92038470ecf9a2bac3189019fefbadafb09f30 100644 (file)
@@ -519,7 +519,21 @@ typedef struct {
 } EccInfo_t;
 
 typedef struct {
-       EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
+       uint64_t mca_umc_status;
+       uint64_t mca_umc_addr;
+       uint64_t mca_ceumc_addr;
+
+       uint16_t ce_count_lo_chip;
+       uint16_t ce_count_hi_chip;
+
+       uint32_t eccPadding;
+} EccInfo_V2_t;
+
+typedef struct {
+       union {
+               EccInfo_t  EccInfo[ALDEBARAN_UMC_CHANNEL_NUM];
+               EccInfo_V2_t EccInfo_V2[ALDEBARAN_UMC_CHANNEL_NUM];
+       };
 } EccInfoTable_t;
 
 // These defines are used with the following messages:
index fb130409309c03bdf157bce8ae7718412913edce..bf124bc98b804d4baace560e6e90f706a3ae4a77 100644 (file)
  */
 #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
 
+/*
+ * SMU supports mca_ceumc_addr in ECCTABLE since version 68.55.0;
+ * use this to check whether the mca_ceumc_addr record is supported
+ */
+#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
+
 /*
  * SMU support BAD CHENNEL info MSG since version 68.51.00,
  * use this to check ECCTALE feature whether support
@@ -1803,7 +1809,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
        return sizeof(struct gpu_metrics_v1_3);
 }
 
-static int aldebaran_check_ecc_table_support(struct smu_context *smu)
+static int aldebaran_check_ecc_table_support(struct smu_context *smu,
+               int *ecctable_version)
 {
        uint32_t if_version = 0xff, smu_version = 0xff;
        int ret = 0;
@@ -1816,6 +1823,11 @@ static int aldebaran_check_ecc_table_support(struct smu_context *smu)
 
        if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
                ret = -EOPNOTSUPP;
+       else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION &&
+                       smu_version < SUPPORT_ECCTABLE_V2_SMU_VERSION)
+               *ecctable_version = 1;
+       else
+               *ecctable_version = 2;
 
        return ret;
 }
@@ -1827,9 +1839,10 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
        EccInfoTable_t *ecc_table = NULL;
        struct ecc_info_per_ch *ecc_info_per_channel = NULL;
        int i, ret = 0;
+       int table_version = 0;
        struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
 
-       ret = aldebaran_check_ecc_table_support(smu);
+       ret = aldebaran_check_ecc_table_support(smu, &table_version);
        if (ret)
                return ret;
 
@@ -1845,16 +1858,32 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
 
        ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
 
-       for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
-               ecc_info_per_channel = &(eccinfo->ecc[i]);
-               ecc_info_per_channel->ce_count_lo_chip =
-                       ecc_table->EccInfo[i].ce_count_lo_chip;
-               ecc_info_per_channel->ce_count_hi_chip =
-                       ecc_table->EccInfo[i].ce_count_hi_chip;
-               ecc_info_per_channel->mca_umc_status =
-                       ecc_table->EccInfo[i].mca_umc_status;
-               ecc_info_per_channel->mca_umc_addr =
-                       ecc_table->EccInfo[i].mca_umc_addr;
+       if (table_version == 1) {
+               for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+                       ecc_info_per_channel = &(eccinfo->ecc[i]);
+                       ecc_info_per_channel->ce_count_lo_chip =
+                               ecc_table->EccInfo[i].ce_count_lo_chip;
+                       ecc_info_per_channel->ce_count_hi_chip =
+                               ecc_table->EccInfo[i].ce_count_hi_chip;
+                       ecc_info_per_channel->mca_umc_status =
+                               ecc_table->EccInfo[i].mca_umc_status;
+                       ecc_info_per_channel->mca_umc_addr =
+                               ecc_table->EccInfo[i].mca_umc_addr;
+               }
+       } else if (table_version == 2) {
+               for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+                       ecc_info_per_channel = &(eccinfo->ecc[i]);
+                       ecc_info_per_channel->ce_count_lo_chip =
+                               ecc_table->EccInfo_V2[i].ce_count_lo_chip;
+                       ecc_info_per_channel->ce_count_hi_chip =
+                               ecc_table->EccInfo_V2[i].ce_count_hi_chip;
+                       ecc_info_per_channel->mca_umc_status =
+                               ecc_table->EccInfo_V2[i].mca_umc_status;
+                       ecc_info_per_channel->mca_umc_addr =
+                               ecc_table->EccInfo_V2[i].mca_umc_addr;
+                       ecc_info_per_channel->mca_ceumc_addr =
+                               ecc_table->EccInfo_V2[i].mca_ceumc_addr;
+               }
        }
 
        return ret;