drm/amdgpu: add amdgpu smu mca dump feature support

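Add SMU-based MCA dump support to amdgpu: introduce the amdgpu_mca_smu_funcs
callback table, the mca_bank_info/mca_bank_entry data types, and wrapper
helpers for setting debug mode and for querying MCA counts and bank entries.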

Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Yang Wang 2023-09-05 11:39:10 +08:00 committed by Alex Deucher
parent 615585d09b
commit 7ff607e272
2 changed files with 129 additions and 0 deletions

View file

@ -142,3 +142,73 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev)
return 0;
}
/**
 * amdgpu_mca_smu_init_funcs - register the SMU MCA callback table
 * @adev: amdgpu device whose mca state is updated
 * @mca_funcs: callback table to store in adev->mca.mca_funcs
 *
 * Only the pointer is stored; the table itself is not copied.
 */
void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs)
{
	adev->mca.mca_funcs = mca_funcs;
}
/**
 * amdgpu_mca_smu_set_debug_mode - forward a debug-mode request to the backend
 * @adev: amdgpu device whose mca.mca_funcs table is consulted
 * @enable: value passed through to the mca_set_debug_mode callback
 *
 * Return: the callback's return value, or -EOPNOTSUPP when no callback table
 * or no mca_set_debug_mode callback is registered.
 */
int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
{
	const struct amdgpu_mca_smu_funcs *funcs = adev->mca.mca_funcs;

	/* Nothing registered to handle the request. */
	if (!funcs || !funcs->mca_set_debug_mode)
		return -EOPNOTSUPP;

	return funcs->mca_set_debug_mode(adev, enable);
}
/**
 * amdgpu_mca_smu_get_valid_mca_count - query the valid MCA count for a type
 * @adev: amdgpu device whose mca.mca_funcs table is consulted
 * @type: error type passed through to the callback
 * @count: output pointer passed through to the callback; must not be NULL
 *
 * Return: -EINVAL when @count is NULL, -EOPNOTSUPP when no callback table or
 * no mca_get_valid_mca_count callback is registered, otherwise the callback's
 * return value.
 */
int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count)
{
	const struct amdgpu_mca_smu_funcs *funcs = adev->mca.mca_funcs;

	if (count == NULL)
		return -EINVAL;

	if (funcs == NULL || funcs->mca_get_valid_mca_count == NULL)
		return -EOPNOTSUPP;

	return funcs->mca_get_valid_mca_count(adev, type, count);
}
/**
 * amdgpu_mca_smu_get_error_count - query the error count for a RAS block
 * @adev: amdgpu device whose mca.mca_funcs table is consulted
 * @blk: RAS block passed through to the callback
 * @type: error type passed through to the callback
 * @count: output pointer passed through to the callback; must not be NULL
 *
 * Return: -EINVAL when @count is NULL, -EOPNOTSUPP when no callback table or
 * no mca_get_error_count callback is registered, otherwise the callback's
 * return value.
 */
int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
				   enum amdgpu_mca_error_type type, uint32_t *count)
{
	const struct amdgpu_mca_smu_funcs *funcs = adev->mca.mca_funcs;

	if (count == NULL)
		return -EINVAL;

	if (funcs == NULL || funcs->mca_get_error_count == NULL)
		return -EOPNOTSUPP;

	return funcs->mca_get_error_count(adev, blk, type, count);
}
/**
 * amdgpu_mca_smu_get_mca_entry - fetch one MCA bank entry through the backend
 * @adev: amdgpu device whose mca.mca_funcs table is consulted
 * @type: error type; selects which limit (max_ue_count or max_ce_count)
 *        bounds @idx
 * @idx: index of the entry to fetch; must satisfy 0 <= @idx < limit
 * @entry: output buffer passed through to the callback; must not be NULL
 *
 * Return: -EINVAL when @entry is NULL, @type is not a known error type, or
 * @idx is out of range; -EOPNOTSUPP when no callback table or no
 * mca_get_mca_entry callback is registered; otherwise the callback's return
 * value.
 */
int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
				 int idx, struct mca_bank_entry *entry)
{
	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
	int count;

	/* Reject a NULL output pointer, as the other getters in this file do. */
	if (!entry)
		return -EINVAL;

	/*
	 * The table must be validated before the switch below reads
	 * max_ue_count/max_ce_count from it; a device without a registered
	 * table would otherwise dereference NULL.
	 */
	if (!mca_funcs || !mca_funcs->mca_get_mca_entry)
		return -EOPNOTSUPP;

	switch (type) {
	case AMDGPU_MCA_ERROR_TYPE_UE:
		count = mca_funcs->max_ue_count;
		break;
	case AMDGPU_MCA_ERROR_TYPE_CE:
		count = mca_funcs->max_ce_count;
		break;
	default:
		return -EINVAL;
	}

	/* idx is signed, so the lower bound has to be checked explicitly. */
	if (idx < 0 || idx >= count)
		return -EINVAL;

	return mca_funcs->mca_get_mca_entry(adev, type, idx, entry);
}

View file

@ -21,6 +21,26 @@
#ifndef __AMDGPU_MCA_H__
#define __AMDGPU_MCA_H__
#include "amdgpu_ras.h"
/* Number of 64-bit register slots in mca_bank_entry.regs[]. */
#define MCA_MAX_REGS_COUNT (16)
enum amdgpu_mca_ip {
AMDGPU_MCA_IP_UNKNOW = -1,
AMDGPU_MCA_IP_PSP = 0,
AMDGPU_MCA_IP_SDMA,
AMDGPU_MCA_IP_GC,
AMDGPU_MCA_IP_SMU,
AMDGPU_MCA_IP_MP5,
AMDGPU_MCA_IP_UMC,
AMDGPU_MCA_IP_COUNT,
};
/*
 * Error class selector passed to the amdgpu_mca_smu_* helpers and callbacks.
 * NOTE(review): UE/CE presumably stand for uncorrectable/correctable error -
 * confirm against the RAS documentation.
 */
enum amdgpu_mca_error_type {
AMDGPU_MCA_ERROR_TYPE_UE = 0,
AMDGPU_MCA_ERROR_TYPE_CE,
};
/* MCA RAS block: currently just a wrapper around the generic RAS block object. */
struct amdgpu_mca_ras_block {
struct amdgpu_ras_block_object ras_block;
};
@ -34,6 +54,36 @@ struct amdgpu_mca {
struct amdgpu_mca_ras mp0;
struct amdgpu_mca_ras mp1;
struct amdgpu_mca_ras mpio;
const struct amdgpu_mca_smu_funcs *mca_funcs;
};
/*
 * Identification attributes of one MCA bank; embedded in mca_bank_entry.info.
 * NOTE(review): field meanings below are inferred from their names only -
 * confirm against the code that fills this structure.
 */
struct mca_bank_info {
int socket_id; /* presumably the socket the bank belongs to */
int aid; /* presumably an AID (die) index */
int hwid; /* presumably the hardware ID of the reporting IP */
int mcatype; /* presumably the MCA type of the bank */
};
/*
 * One MCA bank record, filled in by the mca_get_mca_entry callback of
 * struct amdgpu_mca_smu_funcs.
 */
struct mca_bank_entry {
int idx; /* NOTE(review): presumably the index this entry was fetched with - confirm */
enum amdgpu_mca_error_type type;
enum amdgpu_mca_ip ip;
struct mca_bank_info info;
/* Raw register values; MCA_MAX_REGS_COUNT 64-bit slots. */
uint64_t regs[MCA_MAX_REGS_COUNT];
};
/*
 * Backend callback table for the amdgpu_mca_smu_* helpers, registered with
 * amdgpu_mca_smu_init_funcs() and stored in amdgpu_mca.mca_funcs.
 * The helpers treat a NULL callback as "not supported" and return
 * -EOPNOTSUPP instead of calling it.
 */
struct amdgpu_mca_smu_funcs {
/*
 * Exclusive upper bounds on the idx accepted by
 * amdgpu_mca_smu_get_mca_entry() for the UE and CE error types respectively.
 */
int max_ue_count;
int max_ce_count;
/* Called by amdgpu_mca_smu_set_debug_mode(). */
int (*mca_set_debug_mode)(struct amdgpu_device *adev, bool enable);
/* Called by amdgpu_mca_smu_get_error_count(); count is never NULL on that path. */
int (*mca_get_error_count)(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, uint32_t *count);
/* Called by amdgpu_mca_smu_get_valid_mca_count(); count is never NULL on that path. */
int (*mca_get_valid_mca_count)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
uint32_t *count);
/* Called by amdgpu_mca_smu_get_mca_entry() after idx has been range-checked. */
int (*mca_get_mca_entry)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
int idx, struct mca_bank_entry *entry);
/*
 * NOTE(review): no wrapper for this callback is visible in this change;
 * presumably idx_array_size is an in/out element count - confirm with the
 * backend implementation.
 */
int (*mca_get_ras_mca_idx_array)(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, int *idx_array, int *idx_array_size);
};
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
@ -53,4 +103,13 @@ void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
/* Per-block RAS software init entry points. */
int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev);
/* Store the backend callback table in adev->mca.mca_funcs. */
void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs);
/*
 * The helpers below forward to the matching callback in adev->mca.mca_funcs.
 * They return -EOPNOTSUPP when the callback is not provided; the count
 * getters return -EINVAL when count is NULL.
 */
int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable);
int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count);
int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum amdgpu_mca_error_type type, uint32_t *count);
/* Returns -EINVAL for an unknown type or an idx at or above the per-type maximum. */
int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
int idx, struct mca_bank_entry *entry);
#endif