drm/amdgpu: umc v12_0 logs ecc errors
author: YiPeng Chai <YiPeng.Chai@amd.com>
Mon, 18 Mar 2024 03:48:07 +0000 (11:48 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Fri, 26 Apr 2024 21:22:41 +0000 (17:22 -0400)
1. umc v12_0 logs ecc errors.
2. Reserve newly detected ecc error pages.
3. Add tag for bad pages, so that they can
   be retired later.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c

index 7006a57277ef6a56393fda4e3a227269b0ea1f76..0f2d765c4e2d582bdeeb2bf2e006c69121971d93 100644 (file)
  *
  */
 
+#include <linux/sort.h>
 #include "amdgpu.h"
 #include "umc_v6_7.h"
 #define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms
 
+/* Size in bytes of the stack buffer in which pfns are formatted as text before hashing */
+#define MAX_UMC_HASH_STRING_SIZE  256
+
 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
                                    struct ras_err_data *err_data, uint64_t err_addr,
                                    uint32_t ch_inst, uint32_t umc_inst)
@@ -446,3 +449,67 @@ int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
                                        status, ipid, addr);
        return 0;
 }
+
+/*
+ * Comparison callback for sort(): orders two uint64_t values ascending.
+ * Returns 1 if *a > *b, -1 if *a < *b and 0 if they are equal.
+ */
+static int amdgpu_umc_uint64_cmp(const void *a, const void *b)
+{
+       const uint64_t lhs = *(const uint64_t *)a;
+       const uint64_t rhs = *(const uint64_t *)b;
+
+       if (lhs == rhs)
+               return 0;
+
+       return (lhs > rhs) ? 1 : -1;
+}
+
+/**
+ * amdgpu_umc_build_pages_hash - derive a hash value from a set of page pfns
+ * @adev: amdgpu device; its RAS context supplies the siphash key
+ * @pfns: array of page frame numbers, sorted in place (ascending)
+ * @len: number of entries in @pfns
+ * @val: output, receives the computed hash
+ *
+ * Use string hash to avoid logging the same bad pages repeatedly: the pfns
+ * are sorted so the result does not depend on their input order, formatted
+ * back to back as hex text into a bounded stack buffer, and that text is
+ * hashed with siphash().
+ *
+ * Return: 0 on success, -EINVAL if @pfns is NULL or @len is not positive.
+ */
+int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
+               uint64_t *pfns, int len, uint64_t *val)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       char buf[MAX_UMC_HASH_STRING_SIZE] = {0};
+       int offset = 0, i;
+
+       /* A negative len would reach sort() as a huge unsigned count */
+       if (!pfns || len <= 0)
+               return -EINVAL;
+
+       sort(pfns, len, sizeof(uint64_t), amdgpu_umc_uint64_cmp, NULL);
+
+       /*
+        * scnprintf() returns the number of characters actually stored, so
+        * offset can never move past the end of buf. snprintf() returns the
+        * untruncated length instead; on truncation that would make
+        * sizeof(buf) - offset wrap around and let siphash() read beyond buf.
+        */
+       for (i = 0; i < len; i++)
+               offset += scnprintf(&buf[offset], sizeof(buf) - offset,
+                               "%llx", pfns[i]);
+
+       *val = siphash(buf, offset, &con->umc_ecc_log.ecc_key);
+
+       return 0;
+}
+
+/*
+ * Record one ECC error entry in @ecc_tree, keyed by ecc_err->hash_index.
+ *
+ * All work is done under the umc_ecc_log lock of the device's RAS context.
+ * When the insert succeeds, every pfn listed in ecc_err->err_pages is passed
+ * to amdgpu_ras_reserve_page() and the entry is tagged with
+ * UMC_ECC_NEW_DETECTED_TAG. When the insert fails, nothing else is done.
+ *
+ * Returns the result of radix_tree_insert().
+ */
+int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
+               struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct ras_ecc_log_info *ecc_log = &con->umc_ecc_log;
+       struct ras_err_pages *pages = &ecc_err->err_pages;
+       int idx, rc;
+
+       mutex_lock(&ecc_log->lock);
+
+       rc = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err);
+       if (rc)
+               goto unlock;
+
+       /* Reserve memory */
+       for (idx = 0; idx < pages->count; idx++)
+               amdgpu_ras_reserve_page(adev, pages->pfn[idx]);
+
+       radix_tree_tag_set(ecc_tree,
+               ecc_err->hash_index, UMC_ECC_NEW_DETECTED_TAG);
+
+unlock:
+       mutex_unlock(&ecc_log->lock);
+
+       return rc;
+}
index 4f3834fa10a8f9c8a6cec244d4e7cf337afb9ef1..c83d24097c5cc3980051ae5f5c32472b617725c6 100644 (file)
@@ -52,6 +52,8 @@
 #define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) \
                LOOP_UMC_NODE_INST((node_inst)) LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst))
 
+/* Page retirement tag: radix tree tag set on an ECC error entry when it is first logged */
+#define UMC_ECC_NEW_DETECTED_TAG       0x1
 
 typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst,
                        uint32_t umc_inst, uint32_t ch_inst, void *data);
@@ -127,5 +129,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
 
 int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
                                uint64_t status, uint64_t ipid, uint64_t addr);
-
+/*
+ * Sort @pfns in place and store a siphash of their hex text form in @val.
+ * Returns 0 on success, -EINVAL when @pfns is NULL or @len is 0.
+ */
+int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
+               uint64_t *pfns, int len, uint64_t *val);
+/*
+ * Insert @ecc_err into @ecc_tree at ecc_err->hash_index; on success reserve
+ * its pages and tag the entry with UMC_ECC_NEW_DETECTED_TAG.
+ * Returns the radix_tree_insert() result.
+ */
+int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
+               struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
 #endif
index 085dcfe16b5ec4b93582926c2e7af8a859245930..6c2b61ef5b5716b1546cda0116723a7220eeaafb 100644 (file)
@@ -546,8 +546,10 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
        uint16_t hwid, mcatype;
        struct ta_ras_query_address_input addr_in;
        uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
-       uint64_t err_addr;
+       uint64_t err_addr, hash_val = 0;
+       struct ras_ecc_err *ecc_err;
        int count;
+       int ret;
 
        hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
        mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
@@ -589,6 +591,43 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
                return 0;
        }
 
+       ret = amdgpu_umc_build_pages_hash(adev,
+                       page_pfn, count, &hash_val);
+       if (ret) {
+               dev_err(adev->dev, "Fail to build error pages hash\n");
+               return ret;
+       }
+
+       ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL);
+       if (!ecc_err)
+               return -ENOMEM;
+
+       ecc_err->err_pages.pfn = kcalloc(count, sizeof(*ecc_err->err_pages.pfn), GFP_KERNEL);
+       if (!ecc_err->err_pages.pfn) {
+               kfree(ecc_err);
+               return -ENOMEM;
+       }
+
+       memcpy(ecc_err->err_pages.pfn, page_pfn, count * sizeof(*ecc_err->err_pages.pfn));
+       ecc_err->err_pages.count = count;
+
+       ecc_err->hash_index = hash_val;
+       ecc_err->status = status;
+       ecc_err->ipid = ipid;
+       ecc_err->addr = addr;
+
+       ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
+       if (ret) {
+               if (ret == -EEXIST)
+                       con->umc_ecc_log.de_updated = true;
+               else
+                       dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
+
+               kfree(ecc_err->err_pages.pfn);
+               kfree(ecc_err);
+               return ret;
+       }
+
        con->umc_ecc_log.de_updated = true;
 
        return 0;