RAS/AMD/ATL: Add MI300 row retirement support
author Yazen Ghannam <yazen.ghannam@amd.com>
Wed, 14 Feb 2024 03:35:15 +0000 (21:35 -0600)
committer Borislav Petkov (AMD) <bp@alien8.de>
Wed, 14 Feb 2024 16:10:06 +0000 (17:10 +0100)
DRAM row retirement depends on model-specific information, so it is best
done within the AMD Address Translation Library.

Export a generic wrapper function for other modules to use. Add any
model-specific helpers here.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20240214033516.1344948-2-yazen.ghannam@amd.com
drivers/ras/amd/atl/Kconfig
drivers/ras/amd/atl/umc.c
include/linux/ras.h

index a43513a700f1dd23508669b1d16a1fabdff56f1f..df49c23e7f6234b07a94390e326315c8ae24bc6d 100644 (file)
@@ -10,6 +10,7 @@
 config AMD_ATL
        tristate "AMD Address Translation Library"
        depends on AMD_NB && X86_64 && RAS
+       depends on MEMORY_FAILURE
        default N
        help
          This library includes support for implementation-specific
index 7e310d1dfcfc76467d372b64e9c5afc692bdc2e9..08c6dbd44c6226dd430342cbcbd8b3bef8497384 100644 (file)
@@ -239,6 +239,57 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
        return addr;
 }
 
+/*
+ * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire
+ * all memory within that DRAM row. This applies to the memory within a DRAM
+ * bank.
+ *
+ * To find the memory addresses, loop through permutations of the DRAM column
+ * bits and find the System Physical address of each. The column bits are used
+ * to calculate the intermediate Normalized address, so all permutations should
+ * be checked.
+ *
+ * The column bits of a_err->addr are rewritten for every permutation. The
+ * original address is put back before returning so that the caller's error
+ * record is not left pointing at the last column that was tried.
+ *
+ * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
+ */
+#define MI300_NUM_COL          BIT(HWEIGHT(MI300_UMC_MCA_COL))
+static void retire_row_mi300(struct atl_err *a_err)
+{
+       const u64 orig_addr = a_err->addr;
+       unsigned long addr, pfn;
+       struct page *p;
+       /* Wider than u8 so the loop still terminates if the mask ever grows. */
+       unsigned int col;
+
+       for (col = 0; col < MI300_NUM_COL; col++) {
+               a_err->addr &= ~MI300_UMC_MCA_COL;
+               a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
+
+               addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);
+               if (IS_ERR_VALUE(addr))
+                       continue;
+
+               pfn = PHYS_PFN(addr);
+
+               /*
+                * Skip invalid or already poisoned pages to avoid unnecessary
+                * error messages from memory_failure().
+                */
+               p = pfn_to_online_page(pfn);
+               if (!p || PageHWPoison(p))
+                       continue;
+
+               memory_failure(pfn, 0);
+       }
+
+       a_err->addr = orig_addr;
+}
+
+/**
+ * amd_retire_dram_row() - Retire the DRAM row associated with a memory error
+ * @a_err: error record handed to the model-specific retirement helper
+ *
+ * Calls the MI300 helper when df_cfg reports revision DF4p5 with the
+ * heterogeneous flag set. Does nothing for any other configuration.
+ */
+void amd_retire_dram_row(struct atl_err *a_err)
+{
+       if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
+               retire_row_mi300(a_err);
+}
+EXPORT_SYMBOL_GPL(amd_retire_dram_row);
+
 static unsigned long get_addr(unsigned long addr)
 {
        if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
index 09c632832bf1b87f6a447f6f67fd5d745935ed1f..a64182bc72ad3f2b430c53c7a9e23e798a1c1fbe 100644 (file)
@@ -45,8 +45,10 @@ struct atl_err {
 #if IS_ENABLED(CONFIG_AMD_ATL)
 void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
 void amd_atl_unregister_decoder(void);
+void amd_retire_dram_row(struct atl_err *err);
 unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
 #else
+static inline void amd_retire_dram_row(struct atl_err *err) { } /* no-op without AMD_ATL */
 static inline unsigned long
 amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
 #endif /* CONFIG_AMD_ATL */