drm/amdgpu: Fix RAS function interface

The correctable and uncorrectable errors
are calculated at each invocation of this
function. Therefore, it is highly inefficient to
return just one of them based on a Boolean
input. If the caller wants both, twice the work
would be done. (And this work is O(n^3) on
Vega20.)

Fix this "interface" to simply return what it had
calculated--both values. Let the caller choose
what it wants to record, inspect, use.

Cc: Alexander Deucher <Alexander.Deucher@amd.com>
Cc: John Clements <john.clements@amd.com>
Cc: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
Reviewed-by: Alexander Deucher <Alexander.Deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Luben Tuikov 2021-05-18 21:07:17 -04:00 committed by Alex Deucher
parent 2871e10199
commit a46751fbcd
2 changed files with 18 additions and 10 deletions

View File

@ -1043,29 +1043,36 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
}
/* get the total error counts on all IPs */
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
bool is_ce)
void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
unsigned long *ce_count,
unsigned long *ue_count)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
struct ras_err_data data = {0, 0};
unsigned long ce, ue;
if (!adev->ras_enabled || !con)
return 0;
return;
ce = 0;
ue = 0;
list_for_each_entry(obj, &con->head, node) {
struct ras_query_if info = {
.head = obj->head,
};
if (amdgpu_ras_query_error_status(adev, &info))
return 0;
return;
data.ce_count += info.ce_count;
data.ue_count += info.ue_count;
ce += info.ce_count;
ue += info.ue_count;
}
return is_ce ? data.ce_count : data.ue_count;
if (ce_count)
*ce_count = ce;
if (ue_count)
*ue_count = ue;
}
/* query/inject/cure end */

View File

@ -485,8 +485,9 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
void amdgpu_ras_resume(struct amdgpu_device *adev);
void amdgpu_ras_suspend(struct amdgpu_device *adev);
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
bool is_ce);
void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
unsigned long *ce_count,
unsigned long *ue_count);
/* error handling functions */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,