drm/amdgpu: Support multiple error query modes

Direct error query mode and firmware error query mode
are supported for now.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Author: Hawking Zhang <Hawking.Zhang@amd.com>, 2023-11-08 16:07:45 +08:00 (committed by Alex Deucher)
parent 07c1db7036
commit 8cc0f5669e
2 changed files with 78 additions and 23 deletions
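
For orientation before the diff: the patch splits error querying into two paths behind a single dispatch. The caller first resolves the active query mode, then either reads the IP error counters directly from registers or fetches MCA error logs from the SMU firmware. Below is a minimal stand-alone C model of that dispatch; the types and stub functions are illustrative stand-ins, not the kernel API.

#include <stdio.h>

/* Illustrative stand-ins for enum amdgpu_ras_error_query_mode in the patch. */
enum query_mode {
	INVALID_ERROR_QUERY  = 0,
	DIRECT_ERROR_QUERY   = 1,  /* driver reads IP error registers itself */
	FIRMWARE_ERROR_QUERY = 2,  /* driver asks SMU firmware for MCA logs  */
};

struct err_data {
	unsigned long ue_count;    /* uncorrectable error count */
	unsigned long ce_count;    /* correctable error count   */
};

/* Stub query paths standing in for the two real back ends. */
static void direct_query(struct err_data *e)   { e->ue_count = 0; e->ce_count = 2; }
static void firmware_query(struct err_data *e) { e->ue_count = 1; e->ce_count = 5; }

/* Mirrors the shape of amdgpu_ras_query_error_status_helper(): reject an
 * invalid mode, then dispatch to exactly one of the two query paths. */
static int query_error_status(enum query_mode mode, struct err_data *e)
{
	if (mode == INVALID_ERROR_QUERY)
		return -1;              /* the kernel helper returns -EINVAL here */

	if (mode == DIRECT_ERROR_QUERY)
		direct_query(e);
	else
		firmware_query(e);      /* both UE and CE logs are fetched */

	return 0;
}

int main(void)
{
	struct err_data e = { 0, 0 };

	if (query_error_status(FIRMWARE_ERROR_QUERY, &e) == 0)
		printf("ue=%lu ce=%lu\n", e.ue_count, e.ce_count);
	return 0;
}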

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

@@ -1165,31 +1165,26 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
 	}
 }
 
-/* query/inject/cure begin */
-int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
-				  struct ras_query_if *info)
+static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
+						struct ras_query_if *info,
+						struct ras_err_data *err_data,
+						unsigned int error_query_mode)
 {
+	enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
 	struct amdgpu_ras_block_object *block_obj = NULL;
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
-	struct ras_err_data err_data;
-	int ret;
 
-	if (!obj)
+	if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
 		return -EINVAL;
 
-	ret = amdgpu_ras_error_data_init(&err_data);
-	if (ret)
-		return ret;
-
-	if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
-		amdgpu_ras_get_ecc_info(adev, &err_data);
-	} else {
-		block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
-		if (!block_obj || !block_obj->hw_ops) {
-			dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
-				     get_ras_block_str(&info->head));
-			ret = -EINVAL;
-			goto out_fini_err_data;
-		}
+	if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
+		if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
+			amdgpu_ras_get_ecc_info(adev, err_data);
+		} else {
+			block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
+			if (!block_obj || !block_obj->hw_ops) {
+				dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
+					     get_ras_block_str(&info->head));
+				return -EINVAL;
+			}
 
-		if (block_obj->hw_ops->query_ras_error_count)
+			if (block_obj->hw_ops->query_ras_error_count)
@@ -1202,6 +1197,38 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 				block_obj->hw_ops->query_ras_error_status(adev);
 		}
 	}
+	} else {
+		/* FIXME: add code to check return value later */
+		amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
+		amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+	}
+
+	return 0;
+}
+
+/* query/inject/cure begin */
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
+{
+	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
+	struct ras_err_data err_data;
+	unsigned int error_query_mode;
+	int ret;
+
+	if (!obj)
+		return -EINVAL;
+
+	ret = amdgpu_ras_error_data_init(&err_data);
+	if (ret)
+		return ret;
+
+	if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
+		return -EINVAL;
+
+	ret = amdgpu_ras_query_error_status_helper(adev, info,
+						   &err_data,
+						   error_query_mode);
+	if (ret)
+		goto out_fini_err_data;
+
 	amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
@@ -3397,6 +3424,26 @@ bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
 		return true;
 }
 
+bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
+				     unsigned int *error_query_mode)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+
+	if (!con) {
+		*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
+		return false;
+	}
+
+	if (mca_funcs && mca_funcs->mca_set_debug_mode)
+		*error_query_mode =
+			(con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
+	else
+		*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
+
+	return true;
+}
+
 /* Register each ip ras block into amdgpu ras */
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
 				  struct amdgpu_ras_block_object *ras_block_obj)
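
The selection logic in amdgpu_ras_get_error_query_mode() above reduces to a small rule: firmware query is chosen only when the SMU exposes mca_set_debug_mode and MCA debug mode is currently disabled; in every other supported case the driver queries the hardware directly. A compact sketch of that rule follows (a stand-in function with illustrative return values, not the kernel API):

/* Stand-in for the selection rule in amdgpu_ras_get_error_query_mode():
 * returns 1 for direct query and 2 for firmware query, mirroring the values
 * of AMDGPU_RAS_DIRECT_ERROR_QUERY and AMDGPU_RAS_FIRMWARE_ERROR_QUERY. */
static unsigned int pick_query_mode(int has_mca_debug_ctl, int is_mca_debug_mode)
{
	if (has_mca_debug_ctl)
		return is_mca_debug_mode ? 1 : 2;
	return 1; /* no SMU debug-mode control: always query registers directly */
}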

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

@@ -320,6 +320,12 @@ enum amdgpu_ras_ret {
 	AMDGPU_RAS_PT,
 };
 
+enum amdgpu_ras_error_query_mode {
+	AMDGPU_RAS_INVALID_ERROR_QUERY = 0,
+	AMDGPU_RAS_DIRECT_ERROR_QUERY = 1,
+	AMDGPU_RAS_FIRMWARE_ERROR_QUERY = 2,
+};
+
 /* ras error status reisger fields */
 #define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT	0x0
 #define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK	0x00000001L
@@ -769,6 +775,8 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con);
 void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
+bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
+				     unsigned int *mode);
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
 				  struct amdgpu_ras_block_object *ras_block_obj);