drm/amdgpu: Add delay work to retire bad pages
author: YiPeng Chai <YiPeng.Chai@amd.com>
Mon, 22 Apr 2024 09:38:54 +0000 (17:38 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Fri, 26 Apr 2024 21:22:41 +0000 (17:22 -0400)
Add delay work to retire bad pages.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h

index 63b5723e26ea5ad3ad55f3e43d419b904d48a936..72daa51f8beb51848ff79cfd03ae894ef99dc31e 100644 (file)
@@ -122,6 +122,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 
 #define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
 
+#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
+
 enum amdgpu_ras_retire_page_reservation {
        AMDGPU_RAS_RETIRE_PAGE_RESERVED,
        AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2776,6 +2778,30 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
        ecc_log->de_updated = false;
 }
 
+static void amdgpu_ras_do_page_retirement(struct work_struct *work)
+{ /* Handler for con->page_retirement_dwork: runs amdgpu_umc_handle_bad_pages() and re-queues itself while newly tagged pages remain. */
+       struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
+                                             page_retirement_dwork.work); /* map the embedded delayed work back to its owning amdgpu_ras */
+       struct amdgpu_device *adev = con->adev;
+       struct ras_err_data err_data;
+
+       if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery)) /* skip this pass; note that nothing is re-queued on this path */
+               return;
+
+       amdgpu_ras_error_data_init(&err_data);
+
+       amdgpu_umc_handle_bad_pages(adev, &err_data); /* err_data is received as the void *ras_error_status argument */
+
+       amdgpu_ras_error_data_fini(&err_data);
+
+       mutex_lock(&con->umc_ecc_log.lock); /* the tag check and the re-queue below are done under the ECC log lock */
+       if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
+                               UMC_ECC_NEW_DETECTED_TAG)) /* true while any de_page_tree entry still carries this tag */
+               schedule_delayed_work(&con->page_retirement_dwork,
+                       msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL)); /* run again after the retire interval (100 ms) */
+       mutex_unlock(&con->umc_ecc_log.lock);
+}
+
 static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
                        enum amdgpu_ras_block ras_block, uint32_t timeout_ms)
 {
@@ -2814,7 +2840,12 @@ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
 static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
                                        uint32_t timeout)
 {
-       amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       int ret;
+
+       ret = amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout); /* query the UMC block, forwarding the caller's timeout */
+       if (!ret) /* only when the query returned 0 */
+               schedule_delayed_work(&con->page_retirement_dwork, 0); /* zero delay: queue the page-retirement work right away */
 }
 
 static int amdgpu_ras_page_retirement_thread(void *param)
@@ -2929,6 +2960,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
                dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
        }
 
+       INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement);
        amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
 #ifdef CONFIG_X86_MCE_AMD
        if ((adev->asic_type == CHIP_ALDEBARAN) &&
@@ -2974,6 +3006,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 
        cancel_work_sync(&con->recovery_work);
 
+       cancel_delayed_work_sync(&con->page_retirement_dwork);
+
        amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
 
        mutex_lock(&con->recovery_lock);
index 634654cf26347881f1a59423822423611dfe6fe7..cb5a0f31d201d5fce623123a1489eb060b4b3bab 100644 (file)
@@ -537,6 +537,7 @@ struct amdgpu_ras {
        struct mutex page_rsv_lock;
        DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
        struct ras_ecc_log_info  umc_ecc_log;
+       struct delayed_work page_retirement_dwork;
 
        /* Fatal error detected flag */
        atomic_t fed;
index 0f2d765c4e2d582bdeeb2bf2e006c69121971d93..2bd88218c20e555b8977ffb0b87e7f3423ce3722 100644 (file)
@@ -89,7 +89,7 @@ out_fini_err_data:
        return ret;
 }
 
-static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
                        void *ras_error_status)
 {
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
index c83d24097c5cc3980051ae5f5c32472b617725c6..2d08d076f7c9b36345b39cc05ba1c98f9a2e45e4 100644 (file)
@@ -133,4 +133,7 @@ int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
                uint64_t *pfns, int len, uint64_t *val);
 int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
                struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
+
+void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+                       void *ras_error_status);
 #endif