drm/amdgpu: check recovery status of xgmi hive in ras_reset_error_count
author: Tao Zhou <tao.zhou1@amd.com>
Mon, 30 Oct 2023 12:44:37 +0000 (20:44 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Fri, 3 Nov 2023 16:18:32 +0000 (12:18 -0400)
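Handle the xgmi hive case: when the device belongs to an xgmi hive, also
read the hive's ras_recovery flag and treat it like the existing
amdgpu_in_reset()/ras->in_recovery checks when deciding whether to skip
the RAS error count reset.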

Suggested-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 3af50754800d5ab1db35fa88fa812dcb4d174f53..b7fe5951b166ebffd72b5a25c4057fb2e195e59e 100644 (file)
@@ -1222,6 +1222,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
        struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
        const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+       struct amdgpu_hive_info *hive;
+       int hive_ras_recovery = 0;
 
        if (!block_obj || !block_obj->hw_ops) {
                dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
@@ -1233,8 +1235,15 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
            !amdgpu_ras_get_mca_debug_mode(adev))
                return -EOPNOTSUPP;
 
+       hive = amdgpu_get_xgmi_hive(adev);
+       if (hive) {
+               hive_ras_recovery = atomic_read(&hive->ras_recovery);
+               amdgpu_put_xgmi_hive(hive);
+       }
+
        /* skip ras error reset in gpu reset */
-       if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) &&
+       if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) ||
+           hive_ras_recovery) &&
            mca_funcs && mca_funcs->mca_set_debug_mode)
                return -EOPNOTSUPP;