drm/amdgpu: only harvest gcea/mmea error status in arcturus

An SDP RdRspStatus/WrRspStatus error or a first parity error on
RdRsp data can cause a system fatal error on arcturus.
The GPU will be frozen in such a case.

The driver needs to harvest this error information before
resetting the GPU. Check the error type to avoid harvesting
normal gcea/mmea information.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Stanley Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Hawking Zhang 2021-04-16 17:34:13 +08:00 committed by Alex Deucher
parent 9406d39bb6
commit 53ee6609b4
3 changed files with 34 additions and 6 deletions

View File

@ -994,7 +994,7 @@ static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
return ret;
}
static const struct soc15_reg_entry gfx_v9_4_rdrsp_status_regs =
static const struct soc15_reg_entry gfx_v9_4_ea_err_status_regs =
{ SOC15_REG_ENTRY(GC, 0, mmGCEA_ERR_STATUS), 0, 1, 32 };
static void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev)
@ -1007,15 +1007,21 @@ static void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev)
mutex_lock(&adev->grbm_idx_mutex);
for (i = 0; i < gfx_v9_4_rdrsp_status_regs.se_num; i++) {
for (j = 0; j < gfx_v9_4_rdrsp_status_regs.instance;
for (i = 0; i < gfx_v9_4_ea_err_status_regs.se_num; i++) {
for (j = 0; j < gfx_v9_4_ea_err_status_regs.instance;
j++) {
gfx_v9_4_select_se_sh(adev, i, 0, j);
reg_value = RREG32(SOC15_REG_ENTRY_OFFSET(
gfx_v9_4_rdrsp_status_regs));
if (reg_value)
gfx_v9_4_ea_err_status_regs));
if (REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_STATUS) ||
REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_WRRSP_STATUS) ||
REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) {
/* SDP read/write error/parity error in FUE_IS_FATAL mode
* can cause system fatal error in arcturus. Harvest the error
* status before GPU reset */
dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n",
j, reg_value);
}
}
}

View File

@ -1645,9 +1645,15 @@ static void mmhub_v9_4_query_ras_error_status(struct amdgpu_device *adev)
for (i = 0; i < ARRAY_SIZE(mmhub_v9_4_err_status_regs); i++) {
reg_value =
RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v9_4_err_status_regs[i]));
if (reg_value)
if (REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_STATUS) ||
REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_WRRSP_STATUS) ||
REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) {
/* SDP read/write error/parity error in FUE_IS_FATAL mode
* can cause system fatal error in arcturus. Harvest the error
* status before GPU reset */
dev_warn(adev->dev, "MMHUB EA err detected at instance: %d, status: 0x%x!\n",
i, reg_value);
}
}
}

View File

@ -617,6 +617,22 @@
#define GCEA_EDC_CNT3__MAM_A3MEM_SEC_COUNT_MASK 0x30000000L
#define GCEA_EDC_CNT3__MAM_A3MEM_DED_COUNT_MASK 0xC0000000L
//GCEA_ERR_STATUS
// Bit layout implied by the shift/mask pairs below:
//   SDP_RDRSP_STATUS            bits  3:0
//   SDP_WRRSP_STATUS            bits  7:4
//   SDP_RDRSP_DATASTATUS        bits  9:8
//   SDP_RDRSP_DATAPARITY_ERROR  bit  10
//   CLEAR_ERROR_STATUS          bit  11
//   BUSY_ON_ERROR               bit  12
//   FUE_FLAG                    bit  13
// Each <FIELD>__SHIFT is the position of the field's lowest bit.
#define GCEA_ERR_STATUS__SDP_RDRSP_STATUS__SHIFT 0x0
#define GCEA_ERR_STATUS__SDP_WRRSP_STATUS__SHIFT 0x4
#define GCEA_ERR_STATUS__SDP_RDRSP_DATASTATUS__SHIFT 0x8
#define GCEA_ERR_STATUS__SDP_RDRSP_DATAPARITY_ERROR__SHIFT 0xa
#define GCEA_ERR_STATUS__CLEAR_ERROR_STATUS__SHIFT 0xb
#define GCEA_ERR_STATUS__BUSY_ON_ERROR__SHIFT 0xc
#define GCEA_ERR_STATUS__FUE_FLAG__SHIFT 0xd
// Each <FIELD>_MASK selects the field's bits in place (not shifted down).
#define GCEA_ERR_STATUS__SDP_RDRSP_STATUS_MASK 0x0000000FL
#define GCEA_ERR_STATUS__SDP_WRRSP_STATUS_MASK 0x000000F0L
#define GCEA_ERR_STATUS__SDP_RDRSP_DATASTATUS_MASK 0x00000300L
#define GCEA_ERR_STATUS__SDP_RDRSP_DATAPARITY_ERROR_MASK 0x00000400L
#define GCEA_ERR_STATUS__CLEAR_ERROR_STATUS_MASK 0x00000800L
#define GCEA_ERR_STATUS__BUSY_ON_ERROR_MASK 0x00001000L
#define GCEA_ERR_STATUS__FUE_FLAG_MASK 0x00002000L
// addressBlock: gc_gfxudec
//GRBM_GFX_INDEX
#define GRBM_GFX_INDEX__INSTANCE_INDEX__SHIFT 0x0