misc: smpro-errmon: Add dimm training failure syndrome

Adds event_dimm[0-15]_syndrome sysfs to report the failure syndrome
to BMC when DIMM training failed.

Signed-off-by: Quan Nguyen <quan@os.amperecomputing.com>
Link: https://lore.kernel.org/r/20230310083416.3670980-3-quan@os.amperecomputing.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
Quan Nguyen 2023-03-10 15:34:16 +07:00 committed by Greg Kroah-Hartman
parent b0f64c80cb
commit c2c99326cc
2 changed files with 87 additions and 0 deletions

View File

@ -265,6 +265,16 @@ Description:
For more details, see section `5.7 GPI Status Registers and 5.9 Memory Error Register Definitions,
Altra Family Soc BMC Interface Specification`.
What: /sys/bus/platform/devices/smpro-errmon.*/event_dimm[0-15]_syndrome
KernelVersion: 6.4
Contact: Quan Nguyen <quan@os.amperecomputing.com>
Description:
(RO) The sysfs returns the 2-byte DIMM failure syndrome data for slot
0-15 if it failed to initialize.
For more details, see section `5.11 Boot Stage Register Definitions,
Altra Family Soc BMC Interface Specification`.
What: /sys/bus/platform/devices/smpro-misc.*/boot_progress
KernelVersion: 6.1
Contact: Quan Nguyen <quan@os.amperecomputing.com>

View File

@ -47,6 +47,12 @@
#define WARN_PMPRO_INFO_LO 0xAC
#define WARN_PMPRO_INFO_HI 0xAD
/* Boot Stage Register */
#define BOOTSTAGE 0xB0
#define DIMM_SYNDROME_SEL 0xB4
#define DIMM_SYNDROME_ERR 0xB5
#define DIMM_SYNDROME_STAGE 4
/* PCIE Error Registers */
#define PCIE_CE_ERR_CNT 0xC0
#define PCIE_CE_ERR_LEN 0xC1
@ -468,6 +474,61 @@ EVENT_RO(vrd_hot, VRD_HOT_EVENT);
EVENT_RO(dimm_hot, DIMM_HOT_EVENT);
EVENT_RO(dimm_2x_refresh, DIMM_2X_REFRESH_EVENT);
static ssize_t smpro_dimm_syndrome_read(struct device *dev, struct device_attribute *da,
char *buf, unsigned int slot)
{
struct smpro_errmon *errmon = dev_get_drvdata(dev);
unsigned int data;
int ret;
ret = regmap_read(errmon->regmap, BOOTSTAGE, &data);
if (ret)
return ret;
/* check for valid stage */
data = (data >> 8) & 0xff;
if (data != DIMM_SYNDROME_STAGE)
return ret;
/* Write the slot ID to retrieve Error Syndrome */
ret = regmap_write(errmon->regmap, DIMM_SYNDROME_SEL, slot);
if (ret)
return ret;
/* Read the Syndrome error */
ret = regmap_read(errmon->regmap, DIMM_SYNDROME_ERR, &data);
if (ret || !data)
return ret;
return sysfs_emit(buf, "%04x\n", data);
}
#define EVENT_DIMM_SYNDROME(_slot) \
static ssize_t event_dimm##_slot##_syndrome_show(struct device *dev, \
struct device_attribute *da, \
char *buf) \
{ \
return smpro_dimm_syndrome_read(dev, da, buf, _slot); \
} \
static DEVICE_ATTR_RO(event_dimm##_slot##_syndrome)
EVENT_DIMM_SYNDROME(0);
EVENT_DIMM_SYNDROME(1);
EVENT_DIMM_SYNDROME(2);
EVENT_DIMM_SYNDROME(3);
EVENT_DIMM_SYNDROME(4);
EVENT_DIMM_SYNDROME(5);
EVENT_DIMM_SYNDROME(6);
EVENT_DIMM_SYNDROME(7);
EVENT_DIMM_SYNDROME(8);
EVENT_DIMM_SYNDROME(9);
EVENT_DIMM_SYNDROME(10);
EVENT_DIMM_SYNDROME(11);
EVENT_DIMM_SYNDROME(12);
EVENT_DIMM_SYNDROME(13);
EVENT_DIMM_SYNDROME(14);
EVENT_DIMM_SYNDROME(15);
static struct attribute *smpro_errmon_attrs[] = {
&dev_attr_overflow_core_ce.attr,
&dev_attr_overflow_core_ue.attr,
@ -493,6 +554,22 @@ static struct attribute *smpro_errmon_attrs[] = {
&dev_attr_event_vrd_hot.attr,
&dev_attr_event_dimm_hot.attr,
&dev_attr_event_dimm_2x_refresh.attr,
&dev_attr_event_dimm0_syndrome.attr,
&dev_attr_event_dimm1_syndrome.attr,
&dev_attr_event_dimm2_syndrome.attr,
&dev_attr_event_dimm3_syndrome.attr,
&dev_attr_event_dimm4_syndrome.attr,
&dev_attr_event_dimm5_syndrome.attr,
&dev_attr_event_dimm6_syndrome.attr,
&dev_attr_event_dimm7_syndrome.attr,
&dev_attr_event_dimm8_syndrome.attr,
&dev_attr_event_dimm9_syndrome.attr,
&dev_attr_event_dimm10_syndrome.attr,
&dev_attr_event_dimm11_syndrome.attr,
&dev_attr_event_dimm12_syndrome.attr,
&dev_attr_event_dimm13_syndrome.attr,
&dev_attr_event_dimm14_syndrome.attr,
&dev_attr_event_dimm15_syndrome.attr,
NULL
};