From: Tao Zhou Date: Mon, 17 Oct 2022 10:31:20 +0000 (+0800) Subject: drm/amdgpu: add RAS poison handling for MCA X-Git-Url: http://git.maquefel.me/?a=commitdiff_plain;h=ae45a18b80d9d0d29f0ecfc52fb4e7831671b299;p=linux.git drm/amdgpu: add RAS poison handling for MCA For MCA poison, if unmap queue fails, only gpu reset should be triggered without page retirement handling, MCA notifier will do it. v2: handle MCA poison consumption in umc_poison_handler directly. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 3c83129f40900..758942150c097 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -169,19 +169,28 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, void *ras_error_status, bool reset) { - int ret; - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - struct ras_common_if head = { - .block = AMDGPU_RAS_BLOCK__UMC, - }; - struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); + int ret = AMDGPU_RAS_SUCCESS; - ret = - amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset); + if (!adev->gmc.xgmi.connected_to_cpu) { + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + struct ras_common_if head = { + .block = AMDGPU_RAS_BLOCK__UMC, + }; + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); - if (ret == AMDGPU_RAS_SUCCESS && obj) { - obj->err_data.ue_count += err_data->ue_count; - obj->err_data.ce_count += err_data->ce_count; + ret = + amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset); + + if (ret == AMDGPU_RAS_SUCCESS && obj) { + obj->err_data.ue_count += err_data->ue_count; + obj->err_data.ce_count += err_data->ce_count; + } + } else if (reset) { + /* MCA poison handler is only responsible for GPU reset, + * let MCA notifier do page retirement. + */ + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + amdgpu_ras_reset_gpu(adev); } return ret;