drm/amdgpu: message smu to update hbm bad page number
authorStanley.Yang <Stanley.Yang@amd.com>
Fri, 11 Jun 2021 07:38:50 +0000 (15:38 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Fri, 18 Jun 2021 21:11:56 +0000 (17:11 -0400)
Use SMU to update the bad pages rather than directly
accessing the EEPROM from the driver.

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c

index 6d1e6005bd87686ae78c5aff8b48cff32d4cf6e4..c13b02caf8c32a97a27446ec035a102692d49c81 100644 (file)
@@ -1984,6 +1984,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
                ret = amdgpu_ras_load_bad_pages(adev);
                if (ret)
                        goto free;
+
+               if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num)
+                       adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.num_recs);
        }
 
        return 0;
index ea6f99be070bd87501f5abeb9aa88323f888e7a8..f4489773715e654edbed66ba19ec3e9b35783764 100644 (file)
@@ -94,6 +94,7 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
                struct amdgpu_iv_entry *entry)
 {
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
        if (adev->umc.ras_funcs &&
@@ -131,6 +132,9 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
                        amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
                                                err_data->err_addr_cnt);
                        amdgpu_ras_save_bad_pages(adev);
+
+                       if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num)
+                               adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.num_recs);
                }
 
                amdgpu_ras_reset_gpu(adev);
index 6559912ba2293eb1aa29016d10fe30d4e4bc5525..3e89852e48202db212c09d44b7ecf33a5da2d9bf 100644 (file)
@@ -1232,6 +1232,12 @@ struct pptable_funcs {
         */
        int (*wait_for_event)(struct smu_context *smu,
                              enum smu_event_type event, uint64_t event_arg);
+
+       /**
+        * @send_hbm_bad_pages_num: message SMU to update bad page number
+        *                          of SMUBUS table.
+        */
+       int (*send_hbm_bad_pages_num)(struct smu_context *smu, uint32_t size);
 };
 
 typedef enum {
index 82099c528ccbae860b2f6011c545178cf14d5f84..9316a726195cc00f949eba5915f5e18339d341f6 100644 (file)
@@ -1923,6 +1923,20 @@ static int aldebaran_set_mp1_state(struct smu_context *smu,
        }
 }
 
+static int aldebaran_smu_send_hbm_bad_page_num(struct smu_context *smu,
+               uint32_t size)
+{
+       int ret = 0;
+
+       /*
+        * Message SMU to update the bad page number on SMUBUS.
+        *
+        * `size` is forwarded unchanged as the parameter of the
+        * SMU_MSG_SetNumBadHbmPagesRetired message; the callers visible in
+        * this change pass con->eeprom_control.num_recs for it. NULL is
+        * passed as the last argument (presumably no response value is read
+        * back -- confirm against smu_cmn_send_smc_msg_with_param).
+        *
+        * A non-zero result is logged with dev_err() and then returned to
+        * the caller as-is.
+        */
+       ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetNumBadHbmPagesRetired, size, NULL);
+       if (ret)
+               dev_err(smu->adev->dev, "[%s] failed to message SMU to update HBM bad pages number\n",
+                               __func__);
+
+       return ret;
+}
+
 static const struct pptable_funcs aldebaran_ppt_funcs = {
        /* init dpm */
        .get_allowed_feature_mask = aldebaran_get_allowed_feature_mask,
@@ -1985,6 +1999,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
        .wait_for_event = smu_v13_0_wait_for_event,
        .i2c_init = aldebaran_i2c_control_init,
        .i2c_fini = aldebaran_i2c_control_fini,
+       .send_hbm_bad_pages_num = aldebaran_smu_send_hbm_bad_page_num,
 };
 
 void aldebaran_set_ppt_funcs(struct smu_context *smu)