mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-10-01 06:33:07 +00:00
drm/amdgpu: save error count in RAS poison handler
Otherwise the RAS error count couldn't be queried from sysfs. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
45e3d1db7d
commit
fec8c5244f
3 changed files with 97 additions and 76 deletions
|
@ -727,7 +727,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
|
|||
|
||||
/* CPU MCA will handle page retirement if connected_to_cpu is 1 */
|
||||
if (!adev->gmc.xgmi.connected_to_cpu)
|
||||
amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
|
||||
amdgpu_umc_poison_handler(adev, &err_data, reset);
|
||||
else if (reset)
|
||||
amdgpu_amdkfd_gpu_reset(adev);
|
||||
}
|
||||
|
|
|
@ -23,79 +23,7 @@
|
|||
|
||||
#include "amdgpu_ras.h"
|
||||
|
||||
static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
|
||||
void *ras_error_status,
|
||||
struct amdgpu_iv_entry *entry)
|
||||
{
|
||||
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
|
||||
}
|
||||
|
||||
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
|
||||
{
|
||||
int r;
|
||||
struct ras_fs_if fs_info = {
|
||||
.sysfs_name = "umc_err_count",
|
||||
};
|
||||
struct ras_ih_if ih_info = {
|
||||
.cb = amdgpu_umc_process_ras_data_cb,
|
||||
};
|
||||
|
||||
if (!adev->umc.ras_if) {
|
||||
adev->umc.ras_if =
|
||||
kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
|
||||
if (!adev->umc.ras_if)
|
||||
return -ENOMEM;
|
||||
adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
|
||||
adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
|
||||
adev->umc.ras_if->sub_block_index = 0;
|
||||
}
|
||||
ih_info.head = fs_info.head = *adev->umc.ras_if;
|
||||
|
||||
r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
|
||||
&fs_info, &ih_info);
|
||||
if (r)
|
||||
goto free;
|
||||
|
||||
if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
|
||||
r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
|
||||
if (r)
|
||||
goto late_fini;
|
||||
} else {
|
||||
r = 0;
|
||||
goto free;
|
||||
}
|
||||
|
||||
/* ras init of specific umc version */
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->err_cnt_init)
|
||||
adev->umc.ras_funcs->err_cnt_init(adev);
|
||||
|
||||
return 0;
|
||||
|
||||
late_fini:
|
||||
amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
|
||||
free:
|
||||
kfree(adev->umc.ras_if);
|
||||
adev->umc.ras_if = NULL;
|
||||
return r;
|
||||
}
|
||||
|
||||
void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
|
||||
{
|
||||
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
|
||||
adev->umc.ras_if) {
|
||||
struct ras_common_if *ras_if = adev->umc.ras_if;
|
||||
struct ras_ih_if ih_info = {
|
||||
.head = *ras_if,
|
||||
.cb = amdgpu_umc_process_ras_data_cb,
|
||||
};
|
||||
|
||||
amdgpu_ras_late_fini(adev, ras_if, &ih_info);
|
||||
kfree(ras_if);
|
||||
}
|
||||
}
|
||||
|
||||
int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
||||
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
||||
void *ras_error_status,
|
||||
struct amdgpu_iv_entry *entry,
|
||||
bool reset)
|
||||
|
@ -180,6 +108,100 @@ int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
|||
return AMDGPU_RAS_SUCCESS;
|
||||
}
|
||||
|
||||
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
|
||||
void *ras_error_status,
|
||||
bool reset)
|
||||
{
|
||||
int ret;
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
struct ras_common_if head = {
|
||||
.block = AMDGPU_RAS_BLOCK__UMC,
|
||||
};
|
||||
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
|
||||
|
||||
ret =
|
||||
amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
|
||||
|
||||
if (ret == AMDGPU_RAS_SUCCESS && obj) {
|
||||
obj->err_data.ue_count += err_data->ue_count;
|
||||
obj->err_data.ce_count += err_data->ce_count;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
|
||||
void *ras_error_status,
|
||||
struct amdgpu_iv_entry *entry)
|
||||
{
|
||||
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
|
||||
}
|
||||
|
||||
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
|
||||
{
|
||||
int r;
|
||||
struct ras_fs_if fs_info = {
|
||||
.sysfs_name = "umc_err_count",
|
||||
};
|
||||
struct ras_ih_if ih_info = {
|
||||
.cb = amdgpu_umc_process_ras_data_cb,
|
||||
};
|
||||
|
||||
if (!adev->umc.ras_if) {
|
||||
adev->umc.ras_if =
|
||||
kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
|
||||
if (!adev->umc.ras_if)
|
||||
return -ENOMEM;
|
||||
adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
|
||||
adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
|
||||
adev->umc.ras_if->sub_block_index = 0;
|
||||
}
|
||||
ih_info.head = fs_info.head = *adev->umc.ras_if;
|
||||
|
||||
r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
|
||||
&fs_info, &ih_info);
|
||||
if (r)
|
||||
goto free;
|
||||
|
||||
if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
|
||||
r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
|
||||
if (r)
|
||||
goto late_fini;
|
||||
} else {
|
||||
r = 0;
|
||||
goto free;
|
||||
}
|
||||
|
||||
/* ras init of specific umc version */
|
||||
if (adev->umc.ras_funcs &&
|
||||
adev->umc.ras_funcs->err_cnt_init)
|
||||
adev->umc.ras_funcs->err_cnt_init(adev);
|
||||
|
||||
return 0;
|
||||
|
||||
late_fini:
|
||||
amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
|
||||
free:
|
||||
kfree(adev->umc.ras_if);
|
||||
adev->umc.ras_if = NULL;
|
||||
return r;
|
||||
}
|
||||
|
||||
void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
|
||||
{
|
||||
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
|
||||
adev->umc.ras_if) {
|
||||
struct ras_common_if *ras_if = adev->umc.ras_if;
|
||||
struct ras_ih_if ih_info = {
|
||||
.head = *ras_if,
|
||||
.cb = amdgpu_umc_process_ras_data_cb,
|
||||
};
|
||||
|
||||
amdgpu_ras_late_fini(adev, ras_if, &ih_info);
|
||||
kfree(ras_if);
|
||||
}
|
||||
}
|
||||
|
||||
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
|
||||
struct amdgpu_irq_src *source,
|
||||
struct amdgpu_iv_entry *entry)
|
||||
|
|
|
@ -78,9 +78,8 @@ struct amdgpu_umc {
|
|||
|
||||
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);
|
||||
void amdgpu_umc_ras_fini(struct amdgpu_device *adev);
|
||||
int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
||||
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
|
||||
void *ras_error_status,
|
||||
struct amdgpu_iv_entry *entry,
|
||||
bool reset);
|
||||
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
|
||||
struct amdgpu_irq_src *source,
|
||||
|
|
Loading…
Reference in a new issue