drm/amdgpu: only harvest gcea/mmea error status in arcturus
author: Hawking Zhang <Hawking.Zhang@amd.com>
Fri, 16 Apr 2021 09:34:13 +0000 (17:34 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Wed, 21 Apr 2021 01:35:45 +0000 (21:35 -0400)
An SDP RdRspStatus/WrRspStatus error, or the first parity error on
RdRsp data, can cause a system fatal error in Arcturus.
The GPU will be frozen in such a case.

The driver needs to harvest this error information before
resetting the GPU. Check the error type to avoid harvesting normal
gcea/mmea information.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Stanley Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_sh_mask.h

index 830080ff90d85829df22807cc8cf876d2b5f98dd..b4789dfc2bb959b7c60ec61bfcae6513873aacfb 100644 (file)
@@ -994,7 +994,7 @@ static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
        return ret;
 }
 
-static const struct soc15_reg_entry gfx_v9_4_rdrsp_status_regs =
+static const struct soc15_reg_entry gfx_v9_4_ea_err_status_regs =
        { SOC15_REG_ENTRY(GC, 0, mmGCEA_ERR_STATUS), 0, 1, 32 };
 
 static void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev)
@@ -1007,15 +1007,21 @@ static void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev)
 
        mutex_lock(&adev->grbm_idx_mutex);
 
-       for (i = 0; i < gfx_v9_4_rdrsp_status_regs.se_num; i++) {
-               for (j = 0; j < gfx_v9_4_rdrsp_status_regs.instance;
+       for (i = 0; i < gfx_v9_4_ea_err_status_regs.se_num; i++) {
+               for (j = 0; j < gfx_v9_4_ea_err_status_regs.instance;
                     j++) {
                        gfx_v9_4_select_se_sh(adev, i, 0, j);
                        reg_value = RREG32(SOC15_REG_ENTRY_OFFSET(
-                               gfx_v9_4_rdrsp_status_regs));
-                       if (reg_value)
+                               gfx_v9_4_ea_err_status_regs));
+                       if (REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_STATUS) ||
+                           REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_WRRSP_STATUS) ||
+                           REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) {
+                               /* SDP read/write error/parity error in FUE_IS_FATAL mode
+                                * can cause system fatal error in Arcturus. Harvest the error
+                                * status before GPU reset */
                                dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n",
                                                j, reg_value);
+                       }
                }
        }
 
index 1a92177c522f4783a044bb2f9a6aa75b8c880c6d..47c8dd9d1c78e9aa347ece4cf50f5ca47ef39dc1 100644 (file)
@@ -1645,9 +1645,15 @@ static void mmhub_v9_4_query_ras_error_status(struct amdgpu_device *adev)
        for (i = 0; i < ARRAY_SIZE(mmhub_v9_4_err_status_regs); i++) {
                reg_value =
                        RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v9_4_err_status_regs[i]));
-               if (reg_value)
+               if (REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_STATUS) ||
+                   REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_WRRSP_STATUS) ||
+                   REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) {
+                       /* SDP read/write error/parity error in FUE_IS_FATAL mode
+                        * can cause system fatal error in Arcturus. Harvest the error
+                        * status before GPU reset */
                        dev_warn(adev->dev, "MMHUB EA err detected at instance: %d, status: 0x%x!\n",
                                        i, reg_value);
+               }
        }
 }
 
index 4089cfa081f55dbf9e9cc84d814540139275c0b6..849450caca1567d8fe4551d43608187dd88c94d7 100644 (file)
 #define GCEA_EDC_CNT3__MAM_A3MEM_SEC_COUNT_MASK                                                               0x30000000L
 #define GCEA_EDC_CNT3__MAM_A3MEM_DED_COUNT_MASK                                                               0xC0000000L
 
+//GCEA_ERR_STATUS
+#define GCEA_ERR_STATUS__SDP_RDRSP_STATUS__SHIFT                                                              0x0
+#define GCEA_ERR_STATUS__SDP_WRRSP_STATUS__SHIFT                                                              0x4
+#define GCEA_ERR_STATUS__SDP_RDRSP_DATASTATUS__SHIFT                                                          0x8
+#define GCEA_ERR_STATUS__SDP_RDRSP_DATAPARITY_ERROR__SHIFT                                                    0xa
+#define GCEA_ERR_STATUS__CLEAR_ERROR_STATUS__SHIFT                                                            0xb
+#define GCEA_ERR_STATUS__BUSY_ON_ERROR__SHIFT                                                                 0xc
+#define GCEA_ERR_STATUS__FUE_FLAG__SHIFT                                                                      0xd
+#define GCEA_ERR_STATUS__SDP_RDRSP_STATUS_MASK                                                                0x0000000FL
+#define GCEA_ERR_STATUS__SDP_WRRSP_STATUS_MASK                                                                0x000000F0L
+#define GCEA_ERR_STATUS__SDP_RDRSP_DATASTATUS_MASK                                                            0x00000300L
+#define GCEA_ERR_STATUS__SDP_RDRSP_DATAPARITY_ERROR_MASK                                                      0x00000400L
+#define GCEA_ERR_STATUS__CLEAR_ERROR_STATUS_MASK                                                              0x00000800L
+#define GCEA_ERR_STATUS__BUSY_ON_ERROR_MASK                                                                   0x00001000L
+#define GCEA_ERR_STATUS__FUE_FLAG_MASK                                                                        0x00002000L
+
 // addressBlock: gc_gfxudec
 //GRBM_GFX_INDEX
 #define GRBM_GFX_INDEX__INSTANCE_INDEX__SHIFT                                                                 0x0