drm/amdgpu: save error count in RAS poison handler

Without this, errors consumed through the poison handler are never added to the UMC error counters, so the RAS error count cannot be queried from sysfs.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Tao Zhou 2021-12-20 16:36:54 +08:00 committed by Alex Deucher
parent 45e3d1db7d
commit fec8c5244f
3 changed files with 97 additions and 76 deletions

View file

@ -727,7 +727,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
/* CPU MCA will handle page retirement if connected_to_cpu is 1 */
if (!adev->gmc.xgmi.connected_to_cpu)
amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
amdgpu_umc_poison_handler(adev, &err_data, reset);
else if (reset)
amdgpu_amdkfd_gpu_reset(adev);
}

View file

@ -23,79 +23,7 @@
#include "amdgpu_ras.h"
/*
 * Callback installed as the .cb member of struct ras_ih_if by the UMC RAS
 * init/fini code in this file. It forwards its arguments to
 * amdgpu_umc_do_page_retirement() with the reset argument fixed to true and
 * returns that function's result.
 */
static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry)
{
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
}
/*
 * Late RAS initialization for the UMC block.
 *
 * Allocates adev->umc.ras_if if it is not set yet, passes it together with
 * the "umc_err_count" sysfs description and the interrupt callback to
 * amdgpu_ras_late_init(), and, when amdgpu_ras_is_supported() reports the
 * block as supported, calls amdgpu_irq_get() on adev->gmc.ecc_irq and the
 * optional umc.ras_funcs->err_cnt_init hook.
 *
 * Returns 0 on success and also when the block is not supported (ras_if is
 * freed and cleared in that case), -ENOMEM if the allocation fails, or the
 * error returned by amdgpu_ras_late_init() / amdgpu_irq_get().
 */
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
{
int r;
struct ras_fs_if fs_info = {
.sysfs_name = "umc_err_count",
};
struct ras_ih_if ih_info = {
.cb = amdgpu_umc_process_ras_data_cb,
};
/* Allocate the common RAS descriptor only once; an existing one is reused. */
if (!adev->umc.ras_if) {
/* NOTE(review): kmalloc() does not zero the memory; only the three
 * members assigned below are initialized before the struct is copied. */
adev->umc.ras_if =
kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
if (!adev->umc.ras_if)
return -ENOMEM;
adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
adev->umc.ras_if->sub_block_index = 0;
}
/* Both the sysfs and the interrupt descriptions get a copy of the header. */
ih_info.head = fs_info.head = *adev->umc.ras_if;
r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
&fs_info, &ih_info);
if (r)
goto free;
if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
if (r)
goto late_fini;
} else {
/* Not supported is not an error: drop the descriptor and report success. */
r = 0;
goto free;
}
/* ras init of specific umc version */
if (adev->umc.ras_funcs &&
adev->umc.ras_funcs->err_cnt_init)
adev->umc.ras_funcs->err_cnt_init(adev);
return 0;
/* Error unwinding: late_fini falls through into free. */
late_fini:
amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
free:
kfree(adev->umc.ras_if);
adev->umc.ras_if = NULL;
return r;
}
/*
 * Tear down the UMC RAS state set up by amdgpu_umc_ras_late_init().
 *
 * Does nothing unless amdgpu_ras_is_supported() reports the UMC block as
 * supported and adev->umc.ras_if is non-NULL. Otherwise it calls
 * amdgpu_ras_late_fini() with a header copy and the same callback used at
 * init time, then frees the descriptor.
 */
void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
{
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
adev->umc.ras_if) {
struct ras_common_if *ras_if = adev->umc.ras_if;
struct ras_ih_if ih_info = {
.head = *ras_if,
.cb = amdgpu_umc_process_ras_data_cb,
};
amdgpu_ras_late_fini(adev, ras_if, &ih_info);
/* NOTE(review): adev->umc.ras_if still holds the freed address after this. */
kfree(ras_if);
}
}
int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry,
bool reset)
@ -180,6 +108,100 @@ int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
return AMDGPU_RAS_SUCCESS;
}
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
void *ras_error_status,
bool reset)
{
int ret;
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct ras_common_if head = {
.block = AMDGPU_RAS_BLOCK__UMC,
};
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
ret =
amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
if (ret == AMDGPU_RAS_SUCCESS && obj) {
obj->err_data.ue_count += err_data->ue_count;
obj->err_data.ce_count += err_data->ce_count;
}
return ret;
}
/*
 * Callback installed as the .cb member of struct ras_ih_if by the UMC RAS
 * init/fini code in this file. It hands the reported error data and the
 * interrupt vector entry to amdgpu_umc_do_page_retirement(), always with
 * the reset argument set to true, and returns that function's result.
 */
static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	const bool reset = true;

	return amdgpu_umc_do_page_retirement(adev, ras_error_status,
					     entry, reset);
}
/**
 * amdgpu_umc_ras_late_init - late RAS initialization for the UMC block
 * @adev: amdgpu device being initialized
 *
 * Allocates adev->umc.ras_if if it is not set yet, passes it together with
 * the "umc_err_count" sysfs description and the interrupt callback to
 * amdgpu_ras_late_init(), and, when amdgpu_ras_is_supported() reports the
 * block as supported, calls amdgpu_irq_get() on adev->gmc.ecc_irq and the
 * optional umc.ras_funcs->err_cnt_init hook.
 *
 * Return: 0 on success and also when the block is not supported (ras_if is
 * freed and cleared in that case), -ENOMEM if the allocation fails, or the
 * error returned by amdgpu_ras_late_init() / amdgpu_irq_get().
 */
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_fs_if fs_info = {
		.sysfs_name = "umc_err_count",
	};
	struct ras_ih_if ih_info = {
		.cb = amdgpu_umc_process_ras_data_cb,
	};

	if (!adev->umc.ras_if) {
		/*
		 * Zero the descriptor: it is copied as a whole just below and
		 * passed on to the RAS core, so members not assigned here
		 * must not contain leftover heap contents.
		 */
		adev->umc.ras_if = kzalloc(sizeof(*adev->umc.ras_if),
					   GFP_KERNEL);
		if (!adev->umc.ras_if)
			return -ENOMEM;
		adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
		adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->umc.ras_if->sub_block_index = 0;
	}

	/* Both the sysfs and the interrupt descriptions get a copy of the header. */
	ih_info.head = fs_info.head = *adev->umc.ras_if;

	r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
				 &fs_info, &ih_info);
	if (r)
		goto free;

	if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
		if (r)
			goto late_fini;
	} else {
		/* Not supported is not an error: drop the descriptor, return 0. */
		r = 0;
		goto free;
	}

	/* ras init of specific umc version */
	if (adev->umc.ras_funcs &&
	    adev->umc.ras_funcs->err_cnt_init)
		adev->umc.ras_funcs->err_cnt_init(adev);

	return 0;

	/* Error unwinding: late_fini falls through into free. */
late_fini:
	amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
free:
	kfree(adev->umc.ras_if);
	adev->umc.ras_if = NULL;
	return r;
}
/**
 * amdgpu_umc_ras_fini - tear down the UMC RAS state
 * @adev: amdgpu device being torn down
 *
 * Does nothing unless amdgpu_ras_is_supported() reports the UMC block as
 * supported and adev->umc.ras_if is non-NULL. Otherwise it calls
 * amdgpu_ras_late_fini() with a header copy and the same callback used at
 * init time, frees the descriptor and clears adev->umc.ras_if.
 */
void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
{
	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
	    adev->umc.ras_if) {
		struct ras_common_if *ras_if = adev->umc.ras_if;
		struct ras_ih_if ih_info = {
			.head = *ras_if,
			.cb = amdgpu_umc_process_ras_data_cb,
		};

		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
		kfree(ras_if);
		/*
		 * amdgpu_umc_ras_late_init() reuses a non-NULL ras_if, so do
		 * not leave the freed address behind.
		 */
		adev->umc.ras_if = NULL;
	}
}
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry)

View file

@ -78,9 +78,8 @@ struct amdgpu_umc {
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);
void amdgpu_umc_ras_fini(struct amdgpu_device *adev);
int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry,
bool reset);
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,