drm/amdgpu: add bad_page_threshold check in ras_eeprom_check_err
author: Tao Zhou <tao.zhou1@amd.com>
Tue, 21 Feb 2023 08:03:49 +0000 (16:03 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Thu, 23 Feb 2023 22:35:59 +0000 (17:35 -0500)
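bad_page_threshold controls page retirement behavior, so it should also
be checked in amdgpu_ras_eeprom_check_err_threshold(): when it is 0 the
threshold check is skipped, and when it is -1 an exceeded threshold is
only reported with a warning while the GPU remains usable.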

v2: simplify the condition of bad page handling path.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

index 9d370465b08d28f700ce0fa422dcb8e1e262dca5..2e08fce875217996f70c4cf0b9a7d581d759b007 100644 (file)
@@ -417,7 +417,8 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-       if (!__is_ras_eeprom_supported(adev))
+       if (!__is_ras_eeprom_supported(adev) ||
+           !amdgpu_bad_page_threshold)
                return false;
 
        /* skip check eeprom table for VEGA20 Gaming */
@@ -428,10 +429,18 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
                        return false;
 
        if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
-               dev_warn(adev->dev, "This GPU is in BAD status.");
-               dev_warn(adev->dev, "Please retire it or set a larger "
-                        "threshold value when reloading driver.\n");
-               return true;
+               if (amdgpu_bad_page_threshold == -1) {
+                       dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
+                               con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
+                       dev_warn(adev->dev,
+                               "But GPU can be operated due to bad_page_threshold = -1.\n");
+                       return false;
+               } else {
+                       dev_warn(adev->dev, "This GPU is in BAD status.");
+                       dev_warn(adev->dev, "Please retire it or set a larger "
+                                "threshold value when reloading driver.\n");
+                       return true;
+               }
        }
 
        return false;