drm/amdgpu: add RAS poison handling for MCA
author Tao Zhou <tao.zhou1@amd.com>
Mon, 17 Oct 2022 10:31:20 +0000 (18:31 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 27 Oct 2022 19:12:08 +0000 (15:12 -0400)
For MCA poison, if the queue unmap fails, only a GPU reset should be
triggered, without page retirement handling; the MCA notifier will do that.

v2: handle MCA poison consumption in umc_poison_handler directly.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

index 3c83129f40900082b0357e56930544bac4e606ff..758942150c0972a7bb5245df7736500a33c2b24e 100644 (file)
@@ -169,19 +169,28 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
                void *ras_error_status,
                bool reset)
 {
-       int ret;
-       struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
-       struct ras_common_if head = {
-               .block = AMDGPU_RAS_BLOCK__UMC,
-       };
-       struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+       int ret = AMDGPU_RAS_SUCCESS;
 
-       ret =
-               amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+       if (!adev->gmc.xgmi.connected_to_cpu) {
+               struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+               struct ras_common_if head = {
+                       .block = AMDGPU_RAS_BLOCK__UMC,
+               };
+               struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 
-       if (ret == AMDGPU_RAS_SUCCESS && obj) {
-               obj->err_data.ue_count += err_data->ue_count;
-               obj->err_data.ce_count += err_data->ce_count;
+               ret =
+                       amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+
+               if (ret == AMDGPU_RAS_SUCCESS && obj) {
+                       obj->err_data.ue_count += err_data->ue_count;
+                       obj->err_data.ce_count += err_data->ce_count;
+               }
+       } else if (reset) {
+               /* MCA poison handler is only responsible for GPU reset,
+                * let MCA notifier do page retirement.
+                */
+               kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+               amdgpu_ras_reset_gpu(adev);
        }
 
        return ret;