drm/amdgpu: Support multiple error query modes

Direct error query mode and firmware error query mode
are supported for now.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Hawking Zhang 2023-11-08 16:07:45 +08:00 committed by Alex Deucher
parent 07c1db7036
commit 8cc0f5669e
2 changed files with 78 additions and 23 deletions

View File

@ -1165,13 +1165,53 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
}
}
/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info)
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
struct ras_query_if *info,
struct ras_err_data *err_data,
unsigned int error_query_mode)
{
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
struct amdgpu_ras_block_object *block_obj = NULL;
if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
return -EINVAL;
if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
amdgpu_ras_get_ecc_info(adev, err_data);
} else {
block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
get_ras_block_str(&info->head));
return -EINVAL;
}
if (block_obj->hw_ops->query_ras_error_count)
block_obj->hw_ops->query_ras_error_count(adev, &err_data);
if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
(info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
(info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
if (block_obj->hw_ops->query_ras_error_status)
block_obj->hw_ops->query_ras_error_status(adev);
}
}
} else {
/* FIXME: add code to check return value later */
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
}
return 0;
}
/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
struct ras_err_data err_data;
unsigned int error_query_mode;
int ret;
if (!obj)
@ -1181,27 +1221,14 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
if (ret)
return ret;
if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
amdgpu_ras_get_ecc_info(adev, &err_data);
} else {
block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
get_ras_block_str(&info->head));
ret = -EINVAL;
goto out_fini_err_data;
}
if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
return -EINVAL;
if (block_obj->hw_ops->query_ras_error_count)
block_obj->hw_ops->query_ras_error_count(adev, &err_data);
if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
(info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
(info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
if (block_obj->hw_ops->query_ras_error_status)
block_obj->hw_ops->query_ras_error_status(adev);
}
}
ret = amdgpu_ras_query_error_status_helper(adev, info,
&err_data,
error_query_mode);
if (ret)
goto out_fini_err_data;
amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
@ -3397,6 +3424,26 @@ bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
return true;
}
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
unsigned int *error_query_mode)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
if (!con) {
*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
return false;
}
if (mca_funcs && mca_funcs->mca_set_debug_mode)
*error_query_mode =
(con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
else
*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
return true;
}
/* Register each ip ras block into amdgpu ras */
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *ras_block_obj)

View File

@ -320,6 +320,12 @@ enum amdgpu_ras_ret {
AMDGPU_RAS_PT,
};
enum amdgpu_ras_error_query_mode {
AMDGPU_RAS_INVALID_ERROR_QUERY = 0,
AMDGPU_RAS_DIRECT_ERROR_QUERY = 1,
AMDGPU_RAS_FIRMWARE_ERROR_QUERY = 2,
};
/* ras error status reisger fields */
#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT 0x0
#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK 0x00000001L
@ -769,6 +775,8 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co
void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
unsigned int *mode);
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *ras_block_obj);