drm/amdgpu: RAS harvest on driver load

In event of RAS UE + warm reset, error counters shall be harvested and cleared on driver load

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
John Clements 2021-03-25 17:10:10 +08:00 committed by Alex Deucher
parent 340c571beb
commit 134d16d50f

View file

@ -2090,6 +2090,32 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
return r;
}
static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
{
if (adev->gmc.xgmi.connected_to_cpu)
return 1;
return 0;
}
static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
struct ras_common_if *ras_block)
{
struct ras_query_if info = {
.head = *ras_block,
};
if (!amdgpu_persistent_edc_harvesting_supported(adev))
return 0;
if (amdgpu_ras_query_error_status(adev, &info) != 0)
DRM_WARN("RAS init harvest failure");
if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
DRM_WARN("RAS init harvest reset failure");
return 0;
}
/* helper function to handle common stuff in ip late init phase */
int amdgpu_ras_late_init(struct amdgpu_device *adev,
struct ras_common_if *ras_block,
@ -2119,6 +2145,9 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
return r;
}
/* check for errors on warm reset edc persisant supported ASIC */
amdgpu_persistent_edc_harvesting(adev, ras_block);
/* in resume phase, no need to create ras fs node */
if (adev->in_suspend || amdgpu_in_reset(adev))
return 0;