EDAC/amd64: Add support for family 0x19, models 0x90-9f devices
authorMuralidhara M K <muralidhara.mk@amd.com>
Thu, 2 Nov 2023 11:42:25 +0000 (11:42 +0000)
committerBorislav Petkov (AMD) <bp@alien8.de>
Wed, 29 Nov 2023 10:21:05 +0000 (11:21 +0100)
AMD Models 90h-9fh are APUs. They have built-in HBM3 memory. ECC support
is enabled by default.

APU models have a single Data Fabric (DF) per Package. Each DF is
visible to the OS in the same way as chiplet-based systems like Zen2
CPUs and later. However, the Unified Memory Controllers (UMCs) are
arranged in the same way as GPU-based MI200 devices rather than
CPU-based systems.

Use the existing gpu_ops for heterogeneous systems to support enumeration
of nodes and memory topology with a few fixups.

  [ bp: Massage comments. ]

Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20231102114225.2006878-5-muralimk@amd.com
drivers/edac/amd64_edac.c
drivers/edac/amd64_edac.h

index 9b6642d0087130ca7450c8838878b525686e68c0..537b9987a431c23c263fd59631316cf3780b3e72 100644 (file)
@@ -996,15 +996,23 @@ static struct local_node_map {
 #define LNTM_NODE_COUNT                                GENMASK(27, 16)
 #define LNTM_BASE_NODE_ID                      GENMASK(11, 0)
 
-static int gpu_get_node_map(void)
+static int gpu_get_node_map(struct amd64_pvt *pvt)
 {
        struct pci_dev *pdev;
        int ret;
        u32 tmp;
 
        /*
-        * Node ID 0 is reserved for CPUs.
-        * Therefore, a non-zero Node ID means we've already cached the values.
+        * Mapping of nodes from hardware-provided AMD Node ID to a
+        * Linux logical one is applicable for MI200 models. Therefore,
+        * return early for other heterogeneous systems.
+        */
+       if (pvt->F3->device != PCI_DEVICE_ID_AMD_MI200_DF_F3)
+               return 0;
+
+       /*
+        * Node ID 0 is reserved for CPUs. Therefore, a non-zero Node ID
+        * means the values have been already cached.
         */
        if (gpu_node_map.base_node_id)
                return 0;
@@ -3851,7 +3859,7 @@ static void gpu_init_csrows(struct mem_ctl_info *mci)
 
                        dimm->nr_pages = gpu_get_csrow_nr_pages(pvt, umc, cs);
                        dimm->edac_mode = EDAC_SECDED;
-                       dimm->mtype = MEM_HBM2;
+                       dimm->mtype = pvt->dram_type;
                        dimm->dtype = DEV_X16;
                        dimm->grain = 64;
                }
@@ -3880,7 +3888,7 @@ static bool gpu_ecc_enabled(struct amd64_pvt *pvt)
        return true;
 }
 
-static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
+static inline u32 gpu_get_umc_base(struct amd64_pvt *pvt, u8 umc, u8 channel)
 {
        /*
         * On CPUs, there is one channel per UMC, so UMC numbering equals
@@ -3893,13 +3901,16 @@ static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
         * On GPU nodes channels are selected in 3rd nibble
         * HBM chX[3:0]= [Y  ]5X[3:0]000;
         * HBM chX[7:4]= [Y+1]5X[3:0]000
+        *
+        * On MI300 APU nodes, same as GPU nodes but channels are selected
+        * in the base address of 0x90000
         */
        umc *= 2;
 
        if (channel >= 4)
                umc++;
 
-       return 0x50000 + (umc << 20) + ((channel % 4) << 12);
+       return pvt->gpu_umc_base + (umc << 20) + ((channel % 4) << 12);
 }
 
 static void gpu_read_mc_regs(struct amd64_pvt *pvt)
@@ -3910,7 +3921,7 @@ static void gpu_read_mc_regs(struct amd64_pvt *pvt)
 
        /* Read registers from each UMC */
        for_each_umc(i) {
-               umc_base = gpu_get_umc_base(i, 0);
+               umc_base = gpu_get_umc_base(pvt, i, 0);
                umc = &pvt->umc[i];
 
                amd_smn_read(nid, umc_base + UMCCH_UMC_CFG, &umc->umc_cfg);
@@ -3927,7 +3938,7 @@ static void gpu_read_base_mask(struct amd64_pvt *pvt)
 
        for_each_umc(umc) {
                for_each_chip_select(cs, umc, pvt) {
-                       base_reg = gpu_get_umc_base(umc, cs) + UMCCH_BASE_ADDR;
+                       base_reg = gpu_get_umc_base(pvt, umc, cs) + UMCCH_BASE_ADDR;
                        base = &pvt->csels[umc].csbases[cs];
 
                        if (!amd_smn_read(pvt->mc_node_id, base_reg, base)) {
@@ -3935,7 +3946,7 @@ static void gpu_read_base_mask(struct amd64_pvt *pvt)
                                         umc, cs, *base, base_reg);
                        }
 
-                       mask_reg = gpu_get_umc_base(umc, cs) + UMCCH_ADDR_MASK;
+                       mask_reg = gpu_get_umc_base(pvt, umc, cs) + UMCCH_ADDR_MASK;
                        mask = &pvt->csels[umc].csmasks[cs];
 
                        if (!amd_smn_read(pvt->mc_node_id, mask_reg, mask)) {
@@ -3960,7 +3971,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt)
 {
        int ret;
 
-       ret = gpu_get_node_map();
+       ret = gpu_get_node_map(pvt);
        if (ret)
                return ret;
 
@@ -4125,6 +4136,8 @@ static int per_family_init(struct amd64_pvt *pvt)
                        if (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) {
                                pvt->ctl_name           = "MI200";
                                pvt->max_mcs            = 4;
+                               pvt->dram_type          = MEM_HBM2;
+                               pvt->gpu_umc_base       = 0x50000;
                                pvt->ops                = &gpu_ops;
                        } else {
                                pvt->ctl_name           = "F19h_M30h";
@@ -4142,6 +4155,13 @@ static int per_family_init(struct amd64_pvt *pvt)
                        pvt->ctl_name                   = "F19h_M70h";
                        pvt->flags.zn_regs_v2           = 1;
                        break;
+               case 0x90 ... 0x9f:
+                       pvt->ctl_name                   = "F19h_M90h";
+                       pvt->max_mcs                    = 4;
+                       pvt->dram_type                  = MEM_HBM3;
+                       pvt->gpu_umc_base               = 0x90000;
+                       pvt->ops                        = &gpu_ops;
+                       break;
                case 0xa0 ... 0xaf:
                        pvt->ctl_name                   = "F19h_MA0h";
                        pvt->max_mcs                    = 12;
@@ -4180,23 +4200,33 @@ static const struct attribute_group *amd64_edac_attr_groups[] = {
        NULL
 };
 
+/*
+ * Return the size of EDAC layer 0 (CHIP_SELECT) or 1 (CHANNEL). With gpu_ops,
+ * layer 0 is sized by max_mcs and layer 1 by csels[0].b_cnt; else the reverse.
+ */
+static unsigned int get_layer_size(struct amd64_pvt *pvt, u8 layer)
+{
+       bool is_gpu = (pvt->ops == &gpu_ops);
+
+       if (!layer)
+               return is_gpu ? pvt->max_mcs
+                             : pvt->csels[0].b_cnt;
+       else
+               return is_gpu ? pvt->csels[0].b_cnt
+                             : pvt->max_mcs;
+}
+
 static int init_one_instance(struct amd64_pvt *pvt)
 {
        struct mem_ctl_info *mci = NULL;
        struct edac_mc_layer layers[2];
        int ret = -ENOMEM;
 
-       /*
-        * For Heterogeneous family EDAC CHIP_SELECT and CHANNEL layers should
-        * be swapped to fit into the layers.
-        */
        layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
-       layers[0].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ?
-                        pvt->max_mcs : pvt->csels[0].b_cnt;
+       layers[0].size = get_layer_size(pvt, 0);
        layers[0].is_virt_csrow = true;
        layers[1].type = EDAC_MC_LAYER_CHANNEL;
-       layers[1].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ?
-                        pvt->csels[0].b_cnt : pvt->max_mcs;
+       layers[1].size = get_layer_size(pvt, 1);
        layers[1].is_virt_csrow = false;
 
        mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0);
index 5a4e4a59682b062b1c396ad1a75f9de3a206279d..1665f7932bacbe1f0920a77d8bcbc69cf529ae69 100644 (file)
@@ -362,6 +362,7 @@ struct amd64_pvt {
        u32 dct_sel_lo;         /* DRAM Controller Select Low */
        u32 dct_sel_hi;         /* DRAM Controller Select High */
        u32 online_spare;       /* On-Line spare Reg */
+       u32 gpu_umc_base;       /* Base address used for channel selection on GPUs */
 
        /* x4, x8, or x16 syndromes in use */
        u8 ecc_sym_sz;