drm/amd/pm: add smu_13_0_6 mca dump support
authorYang Wang <kevinyang.wang@amd.com>
Mon, 4 Sep 2023 08:32:30 +0000 (16:32 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Wed, 20 Sep 2023 20:24:06 +0000 (16:24 -0400)
v1:
implement smu_v13_0_6 mca bank interface.

v2:
- remove unnecessary lock
- move MCMP1_* macros to mp_13_0_6_sh_mask.h file

Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/include/asic_reg/mp/mp_13_0_6_sh_mask.h
drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c

index 780d9824d5edf30f828c9a06b901a83fbb58ce81..2684e396f548d31facf98f5dd9f771254705ae47 100644 (file)
 #define MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK                                                           0x00000001L
 #define MP1_FIRMWARE_FLAGS__RESERVED_MASK                                                                     0xFFFFFFFEL
 
+//MCMP1_IPIDT0
+#define MCMP1_IPIDT0__InstanceIdLo__SHIFT                                                                     0x0
+#define MCMP1_IPIDT0__HardwareID__SHIFT                                                                       0x20
+#define MCMP1_IPIDT0__InstanceIdHi__SHIFT                                                                     0x2c
+#define MCMP1_IPIDT0__McaType__SHIFT                                                                          0x30
+
+#define MCMP1_IPIDT0__InstanceIdLo_MASK                                                                       0x00000000FFFFFFFFL
+#define MCMP1_IPIDT0__HardwareID_MASK                                                                         0x00000FFF00000000L
+#define MCMP1_IPIDT0__InstanceIdHi_MASK                                                                       0x0000F00000000000L
+#define MCMP1_IPIDT0__McaType_MASK                                                                            0xFFFF000000000000L
+
+//MCMP1_STATUST0
+#define MCMP1_STATUST0__ErrorCode__SHIFT                                                                      0x0
+#define MCMP1_STATUST0__ErrorCodeExt__SHIFT                                                                   0x10
+#define MCMP1_STATUST0__PCC__SHIFT                                                                            0x39
+#define MCMP1_STATUST0__UC__SHIFT                                                                             0x3d
+#define MCMP1_STATUST0__Val__SHIFT                                                                            0x3f
+
+#define MCMP1_STATUST0__ErrorCode_MASK                                                                        0x000000000000FFFFL
+#define MCMP1_STATUST0__ErrorCodeExt_MASK                                                                     0x00000000003F0000L
+#define MCMP1_STATUST0__PCC_MASK                                                                              0x0200000000000000L
+#define MCMP1_STATUST0__UC_MASK                                                                               0x2000000000000000L
+#define MCMP1_STATUST0__Val_MASK                                                                              0x8000000000000000L
+
+//MCMP1_MISC0T0
+#define MCMP1_MISC0T0__ErrCnt__SHIFT                                                                          0x20
+
+#define MCMP1_MISC0T0__ErrCnt_MASK                                                                            0x00000FFF00000000L
 
 #endif
index 8888278654864d21223d35f2634f5ab0b0e1b3ee..df6a6b706c13e71bad653362e4ee52ac24c6e898 100644 (file)
@@ -44,6 +44,7 @@
 #include "amdgpu_xgmi.h"
 #include <linux/pci.h>
 #include "amdgpu_ras.h"
+#include "amdgpu_mca.h"
 #include "smu_cmn.h"
 #include "mp/mp_13_0_6_offset.h"
 #include "mp/mp_13_0_6_sh_mask.h"
 
 #define SMU_13_0_6_DSCLK_THRESHOLD 100
 
+#define MCA_BANK_IPID(_ip, _hwid, _type) \
+       [AMDGPU_MCA_IP_##_ip] = { .hwid = _hwid, .mcatype = _type, }
+
+enum mca_reg_idx {
+       MCA_REG_IDX_CONTROL             = 0,
+       MCA_REG_IDX_STATUS              = 1,
+       MCA_REG_IDX_ADDR                = 2,
+       MCA_REG_IDX_MISC0               = 3,
+       MCA_REG_IDX_CONFIG              = 4,
+       MCA_REG_IDX_IPID                = 5,
+       MCA_REG_IDX_SYND                = 6,
+       MCA_REG_IDX_COUNT               = 16,
+};
+
+struct mca_bank_ipid {
+       enum amdgpu_mca_ip ip;
+       uint16_t hwid;
+       uint16_t mcatype;
+};
+
+struct mca_ras_info {
+       enum amdgpu_ras_block blkid;
+       enum amdgpu_mca_ip ip;
+       int *err_code_array;
+       int err_code_count;
+       int (*get_err_count)(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
+                            enum amdgpu_mca_error_type type, int idx, uint32_t *count);
+};
+
 static const struct cmn2asic_msg_mapping smu_v13_0_6_message_map[SMU_MSG_MAX_COUNT] = {
        MSG_MAP(TestMessage,                         PPSMC_MSG_TestMessage,                     0),
        MSG_MAP(GetSmuVersion,                       PPSMC_MSG_GetSmuVersion,                   1),
@@ -2184,6 +2214,506 @@ static int smu_v13_0_6_post_init(struct smu_context *smu)
        return 0;
 }
 
+static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
+{
+       struct smu_context *smu = adev->powerplay.pp_handle;
+
+       return smu_v13_0_6_mca_set_debug_mode(smu, enable);
+}
+
+static int smu_v13_0_6_get_valid_mca_count(struct smu_context *smu, enum amdgpu_mca_error_type type, uint32_t *count)
+{
+       uint32_t msg;
+       int ret;
+
+       if (!count)
+               return -EINVAL;
+
+       switch (type) {
+       case AMDGPU_MCA_ERROR_TYPE_UE:
+               msg = SMU_MSG_QueryValidMcaCount;
+               break;
+       case AMDGPU_MCA_ERROR_TYPE_CE:
+               msg = SMU_MSG_QueryValidMcaCeCount;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       ret = smu_cmn_send_smc_msg(smu, msg, count);
+       if (ret) {
+               *count = 0;
+               return ret;
+       }
+
+       return 0;
+}
+
+static int __smu_v13_0_6_mca_dump_bank(struct smu_context *smu, enum amdgpu_mca_error_type type,
+                                      int idx, int offset, uint32_t *val)
+{
+       uint32_t msg, param;
+
+       switch (type) {
+       case AMDGPU_MCA_ERROR_TYPE_UE:
+               msg = SMU_MSG_McaBankDumpDW;
+               break;
+       case AMDGPU_MCA_ERROR_TYPE_CE:
+               msg = SMU_MSG_McaBankCeDumpDW;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       param = ((idx & 0xffff) << 16) | (offset & 0xfffc);
+
+       return smu_cmn_send_smc_msg_with_param(smu, msg, param, val);
+}
+
+static int smu_v13_0_6_mca_dump_bank(struct smu_context *smu, enum amdgpu_mca_error_type type,
+                                    int idx, int offset, uint32_t *val, int count)
+{
+       int ret, i;
+
+       if (!val)
+               return -EINVAL;
+
+       for (i = 0; i < count; i++) {
+               ret = __smu_v13_0_6_mca_dump_bank(smu, type, idx, offset + (i << 2), &val[i]);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static const struct mca_bank_ipid smu_v13_0_6_mca_ipid_table[AMDGPU_MCA_IP_COUNT] = {
+       MCA_BANK_IPID(UMC, 0x96, 0x0),
+       MCA_BANK_IPID(SMU, 0x01, 0x1),
+       MCA_BANK_IPID(MP5, 0x01, 0x2),
+};
+
+static void mca_bank_entry_info_decode(struct mca_bank_entry *entry, struct mca_bank_info *info)
+{
+       uint64_t ipid = entry->regs[MCA_REG_IDX_IPID];
+       uint32_t insthi;
+
+       /* NOTE: All MCA IPID register share the same format,
+        * so the driver can share the MCMP1 register header file.
+        * */
+
+       info->hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
+       info->mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
+
+       insthi = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi);
+       info->aid = ((insthi >> 2) & 0x03);
+       info->socket_id = insthi & 0x03;
+}
+
+static int mca_bank_read_reg(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
+                            int idx, int reg_idx, uint64_t *val)
+{
+       struct smu_context *smu = adev->powerplay.pp_handle;
+       uint32_t data[2] = {0, 0};
+       int ret;
+
+       if (!val || reg_idx >= MCA_REG_IDX_COUNT)
+               return -EINVAL;
+
+       ret = smu_v13_0_6_mca_dump_bank(smu, type, idx, reg_idx * 8, data, ARRAY_SIZE(data));
+       if (ret)
+               return ret;
+
+       *val = (uint64_t)data[1] << 32 | data[0];
+
+       dev_dbg(adev->dev, "mca read bank reg: type:%s, index: %d, reg_idx: %d, val: 0x%016llx\n",
+               type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE", idx, reg_idx, *val);
+
+       return 0;
+}
+
+static int mca_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
+                            int idx, struct mca_bank_entry *entry)
+{
+       int i, ret;
+
+       /* NOTE: populated all mca register by default */
+       for (i = 0; i < ARRAY_SIZE(entry->regs); i++) {
+               ret = mca_bank_read_reg(adev, type, idx, i, &entry->regs[i]);
+               if (ret)
+                       return ret;
+       }
+
+       if (ret)
+               return ret;
+
+       entry->idx = idx;
+       entry->type = type;
+
+       mca_bank_entry_info_decode(entry, &entry->info);
+
+       return 0;
+}
+
+static int mca_decode_mca_ipid(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, int idx, int *ip)
+{
+       const struct mca_bank_ipid *ipid;
+       uint64_t val;
+       uint16_t hwid, mcatype;
+       int i, ret;
+
+       ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_IPID, &val);
+       if (ret)
+               return ret;
+
+       hwid = REG_GET_FIELD(val, MCMP1_IPIDT0, HardwareID);
+       mcatype = REG_GET_FIELD(val, MCMP1_IPIDT0, McaType);
+
+       if (hwid) {
+               for (i = 0; i < ARRAY_SIZE(smu_v13_0_6_mca_ipid_table); i++) {
+                       ipid = &smu_v13_0_6_mca_ipid_table[i];
+
+                       if (!ipid->hwid)
+                               continue;
+
+                       if (ipid->hwid == hwid && ipid->mcatype == mcatype) {
+                               *ip = i;
+                               return 0;
+                       }
+               }
+       }
+
+       *ip = AMDGPU_MCA_IP_UNKNOW;
+
+       return 0;
+}
+
+static int mca_normal_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
+                                       enum amdgpu_mca_error_type type, int idx, uint32_t *count)
+{
+       uint64_t status0;
+       int ret;
+
+       ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_STATUS, &status0);
+       if (ret)
+               return ret;
+
+       if (REG_GET_FIELD(status0, MCMP1_STATUST0, Val))
+               *count = 1;
+       else
+               *count = 0;
+
+       return 0;
+}
+
+static bool mca_smu_check_error_code(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras,
+                                    uint32_t errcode)
+{
+       int i;
+
+       if (!mca_ras->err_code_count || !mca_ras->err_code_array)
+               return true;
+
+       for (i = 0; i < mca_ras->err_code_count; i++) {
+               if (errcode == mca_ras->err_code_array[i])
+                       return true;
+       }
+
+       return false;
+}
+
+static int mca_mp5_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
+                                    enum amdgpu_mca_error_type type, int idx, uint32_t *count)
+{
+       uint64_t status0 = 0, misc0 = 0;
+       uint32_t errcode;
+       int ret;
+
+       if (mca_ras->ip != AMDGPU_MCA_IP_MP5)
+               return -EINVAL;
+
+       ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_STATUS, &status0);
+       if (ret)
+               return ret;
+
+       if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) {
+               *count = 0;
+               return 0;
+       }
+
+       errcode = REG_GET_FIELD(status0, MCMP1_STATUST0, ErrorCode);
+       if (!mca_smu_check_error_code(adev, mca_ras, errcode))
+               return 0;
+
+       if (type == AMDGPU_MCA_ERROR_TYPE_UE &&
+           REG_GET_FIELD(status0, MCMP1_STATUST0, UC) == 1 &&
+           REG_GET_FIELD(status0, MCMP1_STATUST0, PCC) == 1) {
+               if (count)
+                       *count = 1;
+               return 0;
+       }
+
+       ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_MISC0, &misc0);
+       if (ret)
+               return ret;
+
+       if (count)
+               *count = REG_GET_FIELD(misc0, MCMP1_MISC0T0, ErrCnt);
+
+       return 0;
+}
+
+static int mca_smu_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev,
+                                    enum amdgpu_mca_error_type type, int idx, uint32_t *count)
+{
+       uint64_t status0 = 0, misc0 = 0;
+       uint32_t errcode;
+       int ret;
+
+       if (mca_ras->ip != AMDGPU_MCA_IP_SMU)
+               return -EINVAL;
+
+       ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_STATUS, &status0);
+       if (ret)
+               return ret;
+
+       if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) {
+               *count = 0;
+               return 0;
+       }
+
+       errcode = REG_GET_FIELD(status0, MCMP1_STATUST0, ErrorCode);
+       if (!mca_smu_check_error_code(adev, mca_ras, errcode))
+               return 0;
+
+       if (type == AMDGPU_MCA_ERROR_TYPE_UE &&
+           REG_GET_FIELD(status0, MCMP1_STATUST0, UC) == 1 &&
+           REG_GET_FIELD(status0, MCMP1_STATUST0, PCC) == 1) {
+               if (count)
+                       *count = 1;
+               return 0;
+       }
+
+       ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_MISC0, &misc0);
+       if (ret)
+               return ret;
+
+       if (count)
+               *count = REG_GET_FIELD(misc0, MCMP1_MISC0T0, ErrCnt);
+
+       return 0;
+}
+
+static int sdma_err_codes[] = { CODE_SDMA0, CODE_SDMA1, CODE_SDMA2, CODE_SDMA3 };
+static int mmhub_err_codes[] = {
+       CODE_DAGB0, CODE_DAGB0 + 1, CODE_DAGB0 + 2, CODE_DAGB0 + 3, CODE_DAGB0 + 4, /* DAGB0-4 */
+       CODE_EA0, CODE_EA0 + 1, CODE_EA0 + 2, CODE_EA0 + 3, CODE_EA0 + 4,       /* MMEA0-4*/
+       CODE_VML2, CODE_VML2_WALKER, CODE_MMCANE,
+};
+
+static const struct mca_ras_info mca_ras_table[] = {
+       {
+               .blkid = AMDGPU_RAS_BLOCK__UMC,
+               .ip = AMDGPU_MCA_IP_UMC,
+               .get_err_count = mca_normal_mca_get_err_count,
+       }, {
+               .blkid = AMDGPU_RAS_BLOCK__GFX,
+               .ip = AMDGPU_MCA_IP_MP5,
+               .get_err_count = mca_mp5_mca_get_err_count,
+       }, {
+               .blkid = AMDGPU_RAS_BLOCK__SDMA,
+               .ip = AMDGPU_MCA_IP_SMU,
+               .err_code_array = sdma_err_codes,
+               .err_code_count = ARRAY_SIZE(sdma_err_codes),
+               .get_err_count = mca_smu_mca_get_err_count,
+       }, {
+               .blkid = AMDGPU_RAS_BLOCK__MMHUB,
+               .ip = AMDGPU_MCA_IP_SMU,
+               .err_code_array = mmhub_err_codes,
+               .err_code_count = ARRAY_SIZE(mmhub_err_codes),
+               .get_err_count = mca_smu_mca_get_err_count,
+       },
+};
+
+static const struct mca_ras_info *mca_get_mca_ras_info(struct amdgpu_device *adev, enum amdgpu_ras_block blkid)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(mca_ras_table); i++) {
+               if (mca_ras_table[i].blkid == blkid)
+                       return &mca_ras_table[i];
+       }
+
+       return NULL;
+}
+
+static int mca_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count)
+{
+       struct smu_context *smu = adev->powerplay.pp_handle;
+       int ret;
+
+       switch (type) {
+       case AMDGPU_MCA_ERROR_TYPE_UE:
+       case AMDGPU_MCA_ERROR_TYPE_CE:
+               ret = smu_v13_0_6_get_valid_mca_count(smu, type, count);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       return ret;
+}
+
+static bool mca_bank_is_valid(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras,
+                             enum amdgpu_mca_error_type type, int idx)
+{
+       int ret, ip = AMDGPU_MCA_IP_UNKNOW;
+
+       ret = mca_decode_mca_ipid(adev, type, idx, &ip);
+       if (ret)
+               return false;
+
+       if (ip == AMDGPU_MCA_IP_UNKNOW)
+               return false;
+
+       return ip == mca_ras->ip;
+}
+
+static int mca_get_valid_mca_idx(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras,
+                                enum amdgpu_mca_error_type type,
+                                uint32_t mca_cnt, int *idx_array, int idx_array_size)
+{
+       int i, idx_cnt = 0;
+
+       for (i = 0; i < mca_cnt; i++) {
+               if (!mca_bank_is_valid(adev, mca_ras, type, i))
+                       continue;
+
+               if (idx_array) {
+                       if (idx_cnt < idx_array_size)
+                               idx_array[idx_cnt] = i;
+                       else
+                               return -EINVAL;
+               }
+
+               idx_cnt++;
+       }
+
+       return idx_cnt;
+}
+
+static int __mca_smu_get_error_count(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras, enum amdgpu_mca_error_type type, uint32_t *count)
+{
+       uint32_t result, mca_cnt, total = 0;
+       int idx_array[16];
+       int i, ret, idx_cnt = 0;
+
+       ret = mca_get_valid_mca_count(adev, type, &mca_cnt);
+       if (ret)
+               return ret;
+
+       /* if valid mca bank count is 0, the driver can return 0 directly */
+       if (!mca_cnt) {
+               *count = 0;
+               return 0;
+       }
+
+       if (!mca_ras->get_err_count)
+               return -EINVAL;
+
+       idx_cnt = mca_get_valid_mca_idx(adev, mca_ras, type, mca_cnt, idx_array, ARRAY_SIZE(idx_array));
+       if (idx_cnt < 0)
+               return -EINVAL;
+
+       for (i = 0; i < idx_cnt; i++) {
+               result = 0;
+               ret = mca_ras->get_err_count(mca_ras, adev, type, idx_array[i], &result);
+               if (ret)
+                       return ret;
+
+               total += result;
+       }
+
+       *count = total;
+
+       return 0;
+}
+
+static int mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+                                  enum amdgpu_mca_error_type type, uint32_t *count)
+{
+       const struct mca_ras_info *mca_ras;
+
+       if (!count)
+               return -EINVAL;
+
+       mca_ras = mca_get_mca_ras_info(adev, blk);
+       if (!mca_ras)
+               return -EOPNOTSUPP;
+
+       return __mca_smu_get_error_count(adev, mca_ras, type, count);
+}
+
+static int __mca_smu_get_ras_mca_idx_array(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras,
+                                          enum amdgpu_mca_error_type type, int *idx_array, int *idx_array_size)
+{
+       uint32_t mca_cnt = 0;
+       int ret, idx_cnt = 0;
+
+       ret = mca_get_valid_mca_count(adev, type, &mca_cnt);
+       if (ret)
+               return ret;
+
+       /* if valid mca bank count is 0, the driver can return 0 directly */
+       if (!mca_cnt) {
+               *idx_array_size = 0;
+               return 0;
+       }
+
+       idx_cnt = mca_get_valid_mca_idx(adev, mca_ras, type, mca_cnt, idx_array, *idx_array_size);
+       if (idx_cnt < 0)
+               return -EINVAL;
+
+       *idx_array_size = idx_cnt;
+
+       return 0;
+}
+
+static int mca_smu_get_ras_mca_idx_array(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+                                        enum amdgpu_mca_error_type type, int *idx_array, int *idx_array_size)
+{
+       const struct mca_ras_info *mca_ras;
+
+       mca_ras = mca_get_mca_ras_info(adev, blk);
+       if (!mca_ras)
+               return -EOPNOTSUPP;
+
+       return __mca_smu_get_ras_mca_idx_array(adev, mca_ras, type, idx_array, idx_array_size);
+}
+
+static int mca_smu_get_mca_entry(struct amdgpu_device *adev,
+                                enum amdgpu_mca_error_type type, int idx, struct mca_bank_entry *entry)
+{
+       return mca_get_mca_entry(adev, type, idx, entry);
+}
+
+static int mca_smu_get_valid_mca_count(struct amdgpu_device *adev,
+                                      enum amdgpu_mca_error_type type, uint32_t *count)
+{
+       return mca_get_valid_mca_count(adev, type, count);
+}
+
+static const struct amdgpu_mca_smu_funcs smu_v13_0_6_mca_smu_funcs = {
+       .max_ue_count = 12,
+       .max_ce_count = 12,
+       .mca_set_debug_mode = mca_smu_set_debug_mode,
+       .mca_get_error_count = mca_smu_get_error_count,
+       .mca_get_mca_entry = mca_smu_get_mca_entry,
+       .mca_get_valid_mca_count = mca_smu_get_valid_mca_count,
+       .mca_get_ras_mca_idx_array = mca_smu_get_ras_mca_idx_array,
+};
+
 static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
        /* init dpm */
        .get_allowed_feature_mask = smu_v13_0_6_get_allowed_feature_mask,
@@ -2249,4 +2779,5 @@ void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)
        smu->table_map = smu_v13_0_6_table_map;
        smu->smc_driver_if_version = SMU13_0_6_DRIVER_IF_VERSION;
        smu_v13_0_set_smu_mailbox_registers(smu);
+       amdgpu_mca_smu_init_funcs(smu->adev, &smu_v13_0_6_mca_smu_funcs);
 }