diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index cc73061e7255..6e986088817d 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -42,6 +42,7 @@ #define MCI_STATUS_CEC_SHIFT 38 /* Corrected Error Count */ #define MCI_STATUS_CEC_MASK GENMASK_ULL(52,38) #define MCI_STATUS_CEC(c) (((c) & MCI_STATUS_CEC_MASK) >> MCI_STATUS_CEC_SHIFT) +#define MCI_STATUS_MSCOD(m) (((m) >> 16) & 0xffff) /* AMD-specific bits */ #define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */ diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index eb58644bb019..6faeb2ab3960 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -103,7 +103,6 @@ static void edac_mc_dump_dimm(struct dimm_info *dimm) edac_dbg(4, " dimm->label = '%s'\n", dimm->label); edac_dbg(4, " dimm->nr_pages = 0x%x\n", dimm->nr_pages); edac_dbg(4, " dimm->grain = %d\n", dimm->grain); - edac_dbg(4, " dimm->nr_pages = 0x%x\n", dimm->nr_pages); } static void edac_mc_dump_csrow(struct csrow_info *csrow) diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h index 96f6de0c8ff6..50ed9f2425bb 100644 --- a/drivers/edac/edac_module.h +++ b/drivers/edac/edac_module.h @@ -28,13 +28,9 @@ void edac_mc_sysfs_exit(void); extern int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, const struct attribute_group **groups); extern void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci); -extern int edac_get_log_ue(void); -extern int edac_get_log_ce(void); -extern int edac_get_panic_on_ue(void); extern int edac_mc_get_log_ue(void); extern int edac_mc_get_log_ce(void); extern int edac_mc_get_panic_on_ue(void); -extern int edac_get_poll_msec(void); extern unsigned int edac_mc_get_poll_msec(void); unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf, diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 6cf50ee0b77c..a22ea053f8e1 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -74,31 +74,47 @@ static struct list_head *i10nm_edac_list; static struct res_config *res_cfg; static int retry_rd_err_log; +static int decoding_via_mca; +static bool mem_cfg_2lm; static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}; static u32 offsets_scrub_spr[] = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_scrub_spr_hbm0[] = {0x2860, 0x2854, 0x2b08, 0x2858, 0x2828, 0x0ed8}; +static u32 offsets_scrub_spr_hbm1[] = {0x2c60, 0x2c54, 0x2f08, 0x2c58, 0x2c28, 0x0fa8}; static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}; static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}; +static u32 offsets_demand2_spr[] = {0x22c70, 0x22d80, 0x22f18, 0x22d58, 0x22c64, 0x20f10}; +static u32 offsets_demand_spr_hbm0[] = {0x2a54, 0x2a60, 0x2b10, 0x2a58, 0x2a5c, 0x0ee0}; +static u32 offsets_demand_spr_hbm1[] = {0x2e54, 0x2e60, 0x2f10, 0x2e58, 0x2e5c, 0x0fb0}; -static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable) +static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable, + u32 *offsets_scrub, u32 *offsets_demand, + u32 *offsets_demand2) { - u32 s, d; + u32 s, d, d2; - if (!imc->mbase) - return; - - s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]); - d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]); + s = I10NM_GET_REG32(imc, chan, offsets_scrub[0]); + d = I10NM_GET_REG32(imc, chan, offsets_demand[0]); + if (offsets_demand2) + d2 = I10NM_GET_REG32(imc, chan, offsets_demand2[0]); if (enable) { /* Save default configurations */ imc->chan[chan].retry_rd_err_log_s = s; imc->chan[chan].retry_rd_err_log_d = d; + if (offsets_demand2) + imc->chan[chan].retry_rd_err_log_d2 = d2; s &= ~RETRY_RD_ERR_LOG_NOOVER_UC; s |= RETRY_RD_ERR_LOG_EN; d &= ~RETRY_RD_ERR_LOG_NOOVER_UC; d |= RETRY_RD_ERR_LOG_EN; + + if (offsets_demand2) { + d2 &= ~RETRY_RD_ERR_LOG_UC; + d2 |= RETRY_RD_ERR_LOG_NOOVER; + d2 |= RETRY_RD_ERR_LOG_EN; + } } else { /* Restore default configurations */ if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC) @@ -113,23 +129,55 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable d |= RETRY_RD_ERR_LOG_NOOVER; if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN)) d &= ~RETRY_RD_ERR_LOG_EN; + + if (offsets_demand2) { + if (imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_UC) + d2 |= RETRY_RD_ERR_LOG_UC; + if (!(imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_NOOVER)) + d2 &= ~RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_EN)) + d2 &= ~RETRY_RD_ERR_LOG_EN; + } } - I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s); - I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d); + I10NM_SET_REG32(imc, chan, offsets_scrub[0], s); + I10NM_SET_REG32(imc, chan, offsets_demand[0], d); + if (offsets_demand2) + I10NM_SET_REG32(imc, chan, offsets_demand2[0], d2); } static void enable_retry_rd_err_log(bool enable) { + struct skx_imc *imc; struct skx_dev *d; int i, j; edac_dbg(2, "\n"); list_for_each_entry(d, i10nm_edac_list, list) - for (i = 0; i < I10NM_NUM_IMC; i++) - for (j = 0; j < I10NM_NUM_CHANNELS; j++) - __enable_retry_rd_err_log(&d->imc[i], j, enable); + for (i = 0; i < I10NM_NUM_IMC; i++) { + imc = &d->imc[i]; + if (!imc->mbase) + continue; + + for (j = 0; j < I10NM_NUM_CHANNELS; j++) { + if (imc->hbm_mc) { + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub_hbm0, + res_cfg->offsets_demand_hbm0, + NULL); + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub_hbm1, + res_cfg->offsets_demand_hbm1, + NULL); + } else { + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub, + res_cfg->offsets_demand, + res_cfg->offsets_demand2); + } + } + } } static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, @@ -138,14 +186,33 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, struct skx_imc *imc = &res->dev->imc[res->imc]; u32 log0, log1, log2, log3, log4; u32 corr0, corr1, corr2, corr3; + u32 lxg0, lxg1, lxg3, lxg4; + u32 *xffsets = NULL; u64 log2a, log5; + u64 lxg2a, lxg5; u32 *offsets; - int n; + int n, pch; if (!imc->mbase) return; - offsets = scrub_err ? res_cfg->offsets_scrub : res_cfg->offsets_demand; + if (imc->hbm_mc) { + pch = res->cs & 1; + + if (pch) + offsets = scrub_err ? res_cfg->offsets_scrub_hbm1 : + res_cfg->offsets_demand_hbm1; + else + offsets = scrub_err ? res_cfg->offsets_scrub_hbm0 : + res_cfg->offsets_demand_hbm0; + } else { + if (scrub_err) { + offsets = res_cfg->offsets_scrub; + } else { + offsets = res_cfg->offsets_demand; + xffsets = res_cfg->offsets_demand2; + } + } log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]); log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]); @@ -153,20 +220,52 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]); log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]); + if (xffsets) { + lxg0 = I10NM_GET_REG32(imc, res->channel, xffsets[0]); + lxg1 = I10NM_GET_REG32(imc, res->channel, xffsets[1]); + lxg3 = I10NM_GET_REG32(imc, res->channel, xffsets[3]); + lxg4 = I10NM_GET_REG32(imc, res->channel, xffsets[4]); + lxg5 = I10NM_GET_REG64(imc, res->channel, xffsets[5]); + } + if (res_cfg->type == SPR) { log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]); - n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx]", + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx", log0, log1, log2a, log3, log4, log5); + + if (len - n > 0) { + if (xffsets) { + lxg2a = I10NM_GET_REG64(imc, res->channel, xffsets[2]); + n += snprintf(msg + n, len - n, " %.8x %.8x %.16llx %.8x %.8x %.16llx]", + lxg0, lxg1, lxg2a, lxg3, lxg4, lxg5); + } else { + n += snprintf(msg + n, len - n, "]"); + } + } } else { log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]); n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]", log0, log1, log2, log3, log4, log5); } - corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18); - corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c); - corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20); - corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24); + if (imc->hbm_mc) { + if (pch) { + corr0 = I10NM_GET_REG32(imc, res->channel, 0x2c18); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x2c1c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x2c20); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x2c24); + } else { + corr0 = I10NM_GET_REG32(imc, res->channel, 0x2818); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x281c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x2820); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x2824); + } + } else { + corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24); + } if (len - n > 0) snprintf(msg + n, len - n, @@ -177,9 +276,16 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, corr3 & 0xffff, corr3 >> 16); /* Clear status bits */ - if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) { - log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; - I10NM_SET_REG32(imc, res->channel, offsets[0], log0); + if (retry_rd_err_log == 2) { + if (log0 & RETRY_RD_ERR_LOG_OVER_UC_V) { + log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; + I10NM_SET_REG32(imc, res->channel, offsets[0], log0); + } + + if (xffsets && (lxg0 & RETRY_RD_ERR_LOG_OVER_UC_V)) { + lxg0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; + I10NM_SET_REG32(imc, res->channel, xffsets[0], lxg0); + } } } @@ -231,6 +337,103 @@ static bool i10nm_check_2lm(struct res_config *cfg) return false; } +/* + * Check whether the error comes from DDRT by ICX/Tremont model specific error code. + * Refer to SDM vol3B 16.11.3 Intel IMC MC error codes for IA32_MCi_STATUS. + */ +static bool i10nm_mscod_is_ddrt(u32 mscod) +{ + switch (mscod) { + case 0x0106: case 0x0107: + case 0x0800: case 0x0804: + case 0x0806 ... 0x0808: + case 0x080a ... 0x080e: + case 0x0810: case 0x0811: + case 0x0816: case 0x081e: + case 0x081f: + return true; + } + + return false; +} + +static bool i10nm_mc_decode_available(struct mce *mce) +{ + u8 bank; + + if (!decoding_via_mca || mem_cfg_2lm) + return false; + + if ((mce->status & (MCI_STATUS_MISCV | MCI_STATUS_ADDRV)) + != (MCI_STATUS_MISCV | MCI_STATUS_ADDRV)) + return false; + + bank = mce->bank; + + switch (res_cfg->type) { + case I10NM: + if (bank < 13 || bank > 26) + return false; + + /* DDRT errors can't be decoded from MCA bank registers */ + if (MCI_MISC_ECC_MODE(mce->misc) == MCI_MISC_ECC_DDRT) + return false; + + if (i10nm_mscod_is_ddrt(MCI_STATUS_MSCOD(mce->status))) + return false; + + /* Check whether one of {13,14,17,18,21,22,25,26} */ + return ((bank - 13) & BIT(1)) == 0; + default: + return false; + } +} + +static bool i10nm_mc_decode(struct decoded_addr *res) +{ + struct mce *m = res->mce; + struct skx_dev *d; + u8 bank; + + if (!i10nm_mc_decode_available(m)) + return false; + + list_for_each_entry(d, i10nm_edac_list, list) { + if (d->imc[0].src_id == m->socketid) { + res->socket = m->socketid; + res->dev = d; + break; + } + } + + switch (res_cfg->type) { + case I10NM: + bank = m->bank - 13; + res->imc = bank / 4; + res->channel = bank % 2; + break; + default: + return false; + } + + if (!res->dev) { + skx_printk(KERN_ERR, "No device for src_id %d imc %d\n", + m->socketid, res->imc); + return false; + } + + res->column = GET_BITFIELD(m->misc, 9, 18) << 2; + res->row = GET_BITFIELD(m->misc, 19, 39); + res->bank_group = GET_BITFIELD(m->misc, 40, 41); + res->bank_address = GET_BITFIELD(m->misc, 42, 43); + res->bank_group |= GET_BITFIELD(m->misc, 44, 44) << 2; + res->rank = GET_BITFIELD(m->misc, 56, 58); + res->dimm = res->rank >> 2; + res->rank = res->rank % 4; + + return true; +} + static int i10nm_get_ddr_munits(void) { struct pci_dev *mdev; @@ -420,7 +623,12 @@ static struct res_config spr_cfg = { .sad_all_devfn = PCI_DEVFN(10, 0), .sad_all_offset = 0x300, .offsets_scrub = offsets_scrub_spr, + .offsets_scrub_hbm0 = offsets_scrub_spr_hbm0, + .offsets_scrub_hbm1 = offsets_scrub_spr_hbm1, .offsets_demand = offsets_demand_spr, + .offsets_demand2 = offsets_demand2_spr, + .offsets_demand_hbm0 = offsets_demand_spr_hbm0, + .offsets_demand_hbm1 = offsets_demand_spr_hbm1, }; static const struct x86_cpu_id i10nm_cpuids[] = { @@ -574,7 +782,8 @@ static int __init i10nm_init(void) return -ENODEV; } - skx_set_mem_cfg(i10nm_check_2lm(cfg)); + mem_cfg_2lm = i10nm_check_2lm(cfg); + skx_set_mem_cfg(mem_cfg_2lm); rc = i10nm_get_ddr_munits(); @@ -626,9 +835,11 @@ static int __init i10nm_init(void) setup_i10nm_debug(); if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { - skx_set_decode(NULL, show_retry_rd_err_log); + skx_set_decode(i10nm_mc_decode, show_retry_rd_err_log); if (retry_rd_err_log == 2) enable_retry_rd_err_log(true); + } else { + skx_set_decode(i10nm_mc_decode, NULL); } i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION); @@ -658,6 +869,34 @@ static void __exit i10nm_exit(void) module_init(i10nm_init); module_exit(i10nm_exit); +static int set_decoding_via_mca(const char *buf, const struct kernel_param *kp) +{ + unsigned long val; + int ret; + + ret = kstrtoul(buf, 0, &val); + + if (ret || val > 1) + return -EINVAL; + + if (val && mem_cfg_2lm) { + i10nm_printk(KERN_NOTICE, "Decoding errors via MCA banks for 2LM isn't supported yet\n"); + return -EIO; + } + + ret = param_set_int(buf, kp); + + return ret; +} + +static const struct kernel_param_ops decoding_via_mca_param_ops = { + .set = set_decoding_via_mca, + .get = param_get_int, +}; + +module_param_cb(decoding_via_mca, &decoding_via_mca_param_ops, &decoding_via_mca, 0644); +MODULE_PARM_DESC(decoding_via_mca, "decoding_via_mca: 0=off(default), 1=enable"); + module_param(retry_rd_err_log, int, 0444); MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)"); diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c index 4f28b8c8d378..61adaa872ba7 100644 --- a/drivers/edac/i7300_edac.c +++ b/drivers/edac/i7300_edac.c @@ -1193,7 +1193,7 @@ static int __init i7300_init(void) } /** - * i7300_init() - Unregisters the driver + * i7300_exit() - Unregisters the driver */ static void __exit i7300_exit(void) { diff --git a/drivers/edac/ppc4xx_edac.c b/drivers/edac/ppc4xx_edac.c index 0bc670778c99..046969b4e82e 100644 --- a/drivers/edac/ppc4xx_edac.c +++ b/drivers/edac/ppc4xx_edac.c @@ -178,11 +178,6 @@ struct ppc4xx_ecc_status { u32 wmirq; }; -/* Function Prototypes */ - -static int ppc4xx_edac_probe(struct platform_device *device); -static int ppc4xx_edac_remove(struct platform_device *device); - /* Global Variables */ /* @@ -197,15 +192,6 @@ static const struct of_device_id ppc4xx_edac_match[] = { }; MODULE_DEVICE_TABLE(of, ppc4xx_edac_match); -static struct platform_driver ppc4xx_edac_driver = { - .probe = ppc4xx_edac_probe, - .remove = ppc4xx_edac_remove, - .driver = { - .name = PPC4XX_EDAC_MODULE_NAME, - .of_match_table = ppc4xx_edac_match, - }, -}; - /* * TODO: The row and channel parameters likely need to be dynamically * set based on the aforementioned variant controller realizations. @@ -1391,6 +1377,15 @@ ppc4xx_edac_opstate_init(void) EDAC_OPSTATE_UNKNOWN_STR))); } +static struct platform_driver ppc4xx_edac_driver = { + .probe = ppc4xx_edac_probe, + .remove = ppc4xx_edac_remove, + .driver = { + .name = PPC4XX_EDAC_MODULE_NAME, + .of_match_table = ppc4xx_edac_match, + }, +}; + /** * ppc4xx_edac_init - driver/module insertion entry point * diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 9678ab97c7ac..8e39370fdb5c 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -335,6 +335,12 @@ struct sbridge_info { struct sbridge_channel { u32 ranks; u32 dimms; + struct dimm { + u32 rowbits; + u32 colbits; + u32 bank_xor_enable; + u32 amap_fine; + } dimm[MAX_DIMMS]; }; struct pci_id_descr { @@ -1603,7 +1609,7 @@ static int __populate_dimms(struct mem_ctl_info *mci, banks = 8; for (i = 0; i < channels; i++) { - u32 mtr; + u32 mtr, amap = 0; int max_dimms_per_channel; @@ -1615,6 +1621,7 @@ static int __populate_dimms(struct mem_ctl_info *mci, max_dimms_per_channel = ARRAY_SIZE(mtr_regs); if (!pvt->pci_tad[i]) continue; + pci_read_config_dword(pvt->pci_tad[i], 0x8c, &amap); } for (j = 0; j < max_dimms_per_channel; j++) { @@ -1627,6 +1634,7 @@ static int __populate_dimms(struct mem_ctl_info *mci, mtr_regs[j], &mtr); } edac_dbg(4, "Channel #%d MTR%d = %x\n", i, j, mtr); + if (IS_DIMM_PRESENT(mtr)) { if (!IS_ECC_ENABLED(pvt->info.mcmtr)) { sbridge_printk(KERN_ERR, "CPU SrcID #%d, Ha #%d, Channel #%d has DIMMs, but ECC is disabled\n", @@ -1661,6 +1669,11 @@ static int __populate_dimms(struct mem_ctl_info *mci, dimm->dtype = pvt->info.get_width(pvt, mtr); dimm->mtype = mtype; dimm->edac_mode = mode; + pvt->channel[i].dimm[j].rowbits = order_base_2(rows); + pvt->channel[i].dimm[j].colbits = order_base_2(cols); + pvt->channel[i].dimm[j].bank_xor_enable = + GET_BITFIELD(pvt->info.mcmtr, 9, 9); + pvt->channel[i].dimm[j].amap_fine = GET_BITFIELD(amap, 0, 0); snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_Ha#%u_Chan#%u_DIMM#%u", pvt->sbridge_dev->source_id, pvt->sbridge_dev->dom, i, j); @@ -1922,6 +1935,99 @@ static struct mem_ctl_info *get_mci_for_node_id(u8 node_id, u8 ha) return NULL; } +static u8 sb_close_row[] = { + 15, 16, 17, 18, 20, 21, 22, 28, 10, 11, 12, 13, 29, 30, 31, 32, 33 +}; + +static u8 sb_close_column[] = { + 3, 4, 5, 14, 19, 23, 24, 25, 26, 27 +}; + +static u8 sb_open_row[] = { + 14, 15, 16, 20, 28, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33 +}; + +static u8 sb_open_column[] = { + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +}; + +static u8 sb_open_fine_column[] = { + 3, 4, 5, 7, 8, 9, 10, 11, 12, 13 +}; + +static int sb_bits(u64 addr, int nbits, u8 *bits) +{ + int i, res = 0; + + for (i = 0; i < nbits; i++) + res |= ((addr >> bits[i]) & 1) << i; + return res; +} + +static int sb_bank_bits(u64 addr, int b0, int b1, int do_xor, int x0, int x1) +{ + int ret = GET_BITFIELD(addr, b0, b0) | (GET_BITFIELD(addr, b1, b1) << 1); + + if (do_xor) + ret ^= GET_BITFIELD(addr, x0, x0) | (GET_BITFIELD(addr, x1, x1) << 1); + + return ret; +} + +static bool sb_decode_ddr4(struct mem_ctl_info *mci, int ch, u8 rank, + u64 rank_addr, char *msg) +{ + int dimmno = 0; + int row, col, bank_address, bank_group; + struct sbridge_pvt *pvt; + u32 bg0 = 0, rowbits = 0, colbits = 0; + u32 amap_fine = 0, bank_xor_enable = 0; + + dimmno = (rank < 12) ? rank / 4 : 2; + pvt = mci->pvt_info; + amap_fine = pvt->channel[ch].dimm[dimmno].amap_fine; + bg0 = amap_fine ? 6 : 13; + rowbits = pvt->channel[ch].dimm[dimmno].rowbits; + colbits = pvt->channel[ch].dimm[dimmno].colbits; + bank_xor_enable = pvt->channel[ch].dimm[dimmno].bank_xor_enable; + + if (pvt->is_lockstep) { + pr_warn_once("LockStep row/column decode is not supported yet!\n"); + msg[0] = '\0'; + return false; + } + + if (pvt->is_close_pg) { + row = sb_bits(rank_addr, rowbits, sb_close_row); + col = sb_bits(rank_addr, colbits, sb_close_column); + col |= 0x400; /* C10 is autoprecharge, always set */ + bank_address = sb_bank_bits(rank_addr, 8, 9, bank_xor_enable, 22, 28); + bank_group = sb_bank_bits(rank_addr, 6, 7, bank_xor_enable, 20, 21); + } else { + row = sb_bits(rank_addr, rowbits, sb_open_row); + if (amap_fine) + col = sb_bits(rank_addr, colbits, sb_open_fine_column); + else + col = sb_bits(rank_addr, colbits, sb_open_column); + bank_address = sb_bank_bits(rank_addr, 18, 19, bank_xor_enable, 22, 23); + bank_group = sb_bank_bits(rank_addr, bg0, 17, bank_xor_enable, 20, 21); + } + + row &= (1u << rowbits) - 1; + + sprintf(msg, "row:0x%x col:0x%x bank_addr:%d bank_group:%d", + row, col, bank_address, bank_group); + return true; +} + +static bool sb_decode_ddr3(struct mem_ctl_info *mci, int ch, u8 rank, + u64 rank_addr, char *msg) +{ + pr_warn_once("DDR3 row/column decode not support yet!\n"); + msg[0] = '\0'; + return false; +} + static int get_memory_error_data(struct mem_ctl_info *mci, u64 addr, u8 *socket, u8 *ha, @@ -1937,12 +2043,13 @@ static int get_memory_error_data(struct mem_ctl_info *mci, int interleave_mode, shiftup = 0; unsigned int sad_interleave[MAX_INTERLEAVE]; u32 reg, dram_rule; - u8 ch_way, sck_way, pkg, sad_ha = 0; + u8 ch_way, sck_way, pkg, sad_ha = 0, rankid = 0; u32 tad_offset; u32 rir_way; u32 mb, gb; u64 ch_addr, offset, limit = 0, prv = 0; - + u64 rank_addr; + enum mem_type mtype; /* * Step 0) Check if the address is at special memory ranges @@ -2226,6 +2333,28 @@ static int get_memory_error_data(struct mem_ctl_info *mci, pci_read_config_dword(pvt->pci_tad[base_ch], rir_offset[n_rir][idx], ®); *rank = RIR_RNK_TGT(pvt->info.type, reg); + if (pvt->info.type == BROADWELL) { + if (pvt->is_close_pg) + shiftup = 6; + else + shiftup = 13; + + rank_addr = ch_addr >> shiftup; + rank_addr /= (1 << rir_way); + rank_addr <<= shiftup; + rank_addr |= ch_addr & GENMASK_ULL(shiftup - 1, 0); + rank_addr -= RIR_OFFSET(pvt->info.type, reg); + + mtype = pvt->info.get_memory_type(pvt); + rankid = *rank; + if (mtype == MEM_DDR4 || mtype == MEM_RDDR4) + sb_decode_ddr4(mci, base_ch, rankid, rank_addr, msg); + else + sb_decode_ddr3(mci, base_ch, rankid, rank_addr, msg); + } else { + msg[0] = '\0'; + } + edac_dbg(0, "RIR#%d: channel address 0x%08Lx < 0x%08Lx, RIR interleave %d, index %d\n", n_rir, ch_addr, @@ -2950,7 +3079,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, struct mem_ctl_info *new_mci; struct sbridge_pvt *pvt = mci->pvt_info; enum hw_event_mc_err_type tp_event; - char *optype, msg[256]; + char *optype, msg[256], msg_full[512]; bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); @@ -3089,18 +3218,17 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, */ if (!pvt->is_lockstep && !pvt->is_cur_addr_mirrored && !pvt->is_close_pg) channel = first_channel; - - snprintf(msg, sizeof(msg), - "%s%s area:%s err_code:%04x:%04x socket:%d ha:%d channel_mask:%ld rank:%d", + snprintf(msg_full, sizeof(msg_full), + "%s%s area:%s err_code:%04x:%04x socket:%d ha:%d channel_mask:%ld rank:%d %s", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "", area_type, mscod, errcode, socket, ha, channel_mask, - rank); + rank, msg); - edac_dbg(0, "%s\n", msg); + edac_dbg(0, "%s\n", msg_full); /* FIXME: need support for channel mask */ @@ -3111,7 +3239,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, edac_mc_handle_error(tp_event, mci, core_err_cnt, m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0, channel, dimm, -1, - optype, msg); + optype, msg_full); return; err_parsing: edac_mc_handle_error(tp_event, mci, core_err_cnt, 0, 0, 0, diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 1abc020d49ab..7e2762f62eec 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -714,8 +714,13 @@ static int __init skx_init(void) skx_set_decode(skx_decode, skx_show_retry_rd_err_log); - if (nvdimm_count && skx_adxl_get() == -ENODEV) - skx_printk(KERN_NOTICE, "Only decoding DDR4 address!\n"); + if (nvdimm_count && skx_adxl_get() != -ENODEV) { + skx_set_decode(NULL, skx_show_retry_rd_err_log); + } else { + if (nvdimm_count) + skx_printk(KERN_NOTICE, "Only decoding DDR4 address!\n"); + skx_set_decode(skx_decode, skx_show_retry_rd_err_log); + } /* Ensure that the OPSTATE is set correctly for POLL or NMI */ opstate_init(); diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 19c17c5198c5..f0f8e98f6efb 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -27,9 +27,11 @@ static const char * const component_names[] = { [INDEX_MEMCTRL] = "MemoryControllerId", [INDEX_CHANNEL] = "ChannelId", [INDEX_DIMM] = "DimmSlotId", + [INDEX_CS] = "ChipSelect", [INDEX_NM_MEMCTRL] = "NmMemoryControllerId", [INDEX_NM_CHANNEL] = "NmChannelId", [INDEX_NM_DIMM] = "NmDimmSlotId", + [INDEX_NM_CS] = "NmChipSelect", }; static int component_indices[ARRAY_SIZE(component_names)]; @@ -40,7 +42,7 @@ static char *adxl_msg; static unsigned long adxl_nm_bitmap; static char skx_msg[MSG_SIZE]; -static skx_decode_f skx_decode; +static skx_decode_f driver_decode; static skx_show_retry_log_f skx_show_retry_rd_err_log; static u64 skx_tolm, skx_tohm; static LIST_HEAD(dev_edac_list); @@ -139,10 +141,13 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1; res->dimm = (adxl_nm_bitmap & BIT_NM_DIMM) ? (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1; + res->cs = (adxl_nm_bitmap & BIT_NM_CS) ? + (int)adxl_values[component_indices[INDEX_NM_CS]] : -1; } else { res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + res->cs = (int)adxl_values[component_indices[INDEX_CS]]; } if (res->imc > NUM_IMC - 1 || res->imc < 0) { @@ -173,6 +178,8 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me break; } + res->decoded_by_adxl = true; + return true; } @@ -183,7 +190,7 @@ void skx_set_mem_cfg(bool mem_cfg_2lm) void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) { - skx_decode = decode; + driver_decode = decode; skx_show_retry_rd_err_log = show_retry_log; } @@ -591,19 +598,19 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, break; } } - if (adxl_component_count) { + if (res->decoded_by_adxl) { len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "", mscod, errcode, adxl_msg); } else { len = snprintf(skx_msg, MSG_SIZE, - "%s%s err_code:0x%04x:0x%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:0x%x col:0x%x", + "%s%s err_code:0x%04x:0x%04x ProcessorSocketId:0x%x MemoryControllerId:0x%x PhysicalRankId:0x%x Row:0x%x Column:0x%x Bank:0x%x BankGroup:0x%x", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "", mscod, errcode, res->socket, res->imc, res->rank, - res->bank_group, res->bank_address, res->row, res->column); + res->row, res->column, res->bank_address, res->bank_group); } if (skx_show_retry_rd_err_log) @@ -649,13 +656,14 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, return NOTIFY_DONE; memset(&res, 0, sizeof(res)); + res.mce = mce; res.addr = mce->addr; - if (adxl_component_count) { - if (!skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce))) + /* Try driver decoder first */ + if (!(driver_decode && driver_decode(&res))) { + /* Then try firmware decoder (ACPI DSM methods) */ + if (!(adxl_component_count && skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce)))) return NOTIFY_DONE; - } else if (!skx_decode || !skx_decode(&res)) { - return NOTIFY_DONE; } mci = res.dev->imc[res.imc].mci; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 03ac067a80b9..0cbadd3d2cd3 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -10,6 +10,7 @@ #define _SKX_COMM_EDAC_H #include +#include #define MSG_SIZE 1024 @@ -52,6 +53,9 @@ #define IS_DIMM_PRESENT(r) GET_BITFIELD(r, 15, 15) #define IS_NVDIMM_PRESENT(r, i) GET_BITFIELD(r, i, i) +#define MCI_MISC_ECC_MODE(m) (((m) >> 59) & 15) +#define MCI_MISC_ECC_DDRT 8 /* read from DDRT */ + /* * Each cpu socket contains some pci devices that provide global * information, and also some that are local to each of the two @@ -82,6 +86,7 @@ struct skx_dev { struct pci_dev *edev; u32 retry_rd_err_log_s; u32 retry_rd_err_log_d; + u32 retry_rd_err_log_d2; struct skx_dimm { u8 close_pg; u8 bank_xor_enable; @@ -108,18 +113,22 @@ enum { INDEX_MEMCTRL, INDEX_CHANNEL, INDEX_DIMM, + INDEX_CS, INDEX_NM_FIRST, INDEX_NM_MEMCTRL = INDEX_NM_FIRST, INDEX_NM_CHANNEL, INDEX_NM_DIMM, + INDEX_NM_CS, INDEX_MAX }; #define BIT_NM_MEMCTRL BIT_ULL(INDEX_NM_MEMCTRL) #define BIT_NM_CHANNEL BIT_ULL(INDEX_NM_CHANNEL) #define BIT_NM_DIMM BIT_ULL(INDEX_NM_DIMM) +#define BIT_NM_CS BIT_ULL(INDEX_NM_CS) struct decoded_addr { + struct mce *mce; struct skx_dev *dev; u64 addr; int socket; @@ -129,6 +138,7 @@ struct decoded_addr { int sktways; int chanways; int dimm; + int cs; int rank; int channel_rank; u64 rank_address; @@ -136,6 +146,7 @@ struct decoded_addr { int column; int bank_address; int bank_group; + bool decoded_by_adxl; }; struct res_config { @@ -154,7 +165,12 @@ struct res_config { int sad_all_offset; /* Offsets of retry_rd_err_log registers */ u32 *offsets_scrub; + u32 *offsets_scrub_hbm0; + u32 *offsets_scrub_hbm1; u32 *offsets_demand; + u32 *offsets_demand2; + u32 *offsets_demand_hbm0; + u32 *offsets_demand_hbm1; }; typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, diff --git a/drivers/edac/wq.c b/drivers/edac/wq.c index d021d287eaec..ad3f516627c5 100644 --- a/drivers/edac/wq.c +++ b/drivers/edac/wq.c @@ -37,7 +37,6 @@ int edac_workqueue_setup(void) void edac_workqueue_teardown(void) { - flush_workqueue(wq); destroy_workqueue(wq); wq = NULL; } diff --git a/include/linux/edac.h b/include/linux/edac.h index e730b3468719..fa4bda2a70f6 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -231,21 +231,21 @@ enum mem_type { #define MEM_FLAG_DDR BIT(MEM_DDR) #define MEM_FLAG_RDDR BIT(MEM_RDDR) #define MEM_FLAG_RMBS BIT(MEM_RMBS) -#define MEM_FLAG_DDR2 BIT(MEM_DDR2) -#define MEM_FLAG_FB_DDR2 BIT(MEM_FB_DDR2) -#define MEM_FLAG_RDDR2 BIT(MEM_RDDR2) -#define MEM_FLAG_XDR BIT(MEM_XDR) -#define MEM_FLAG_DDR3 BIT(MEM_DDR3) -#define MEM_FLAG_RDDR3 BIT(MEM_RDDR3) -#define MEM_FLAG_LPDDR3 BIT(MEM_LPDDR3) -#define MEM_FLAG_DDR4 BIT(MEM_DDR4) -#define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) -#define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) -#define MEM_FLAG_LPDDR4 BIT(MEM_LPDDR4) -#define MEM_FLAG_DDR5 BIT(MEM_DDR5) -#define MEM_FLAG_RDDR5 BIT(MEM_RDDR5) -#define MEM_FLAG_LRDDR5 BIT(MEM_LRDDR5) -#define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) +#define MEM_FLAG_DDR2 BIT(MEM_DDR2) +#define MEM_FLAG_FB_DDR2 BIT(MEM_FB_DDR2) +#define MEM_FLAG_RDDR2 BIT(MEM_RDDR2) +#define MEM_FLAG_XDR BIT(MEM_XDR) +#define MEM_FLAG_DDR3 BIT(MEM_DDR3) +#define MEM_FLAG_RDDR3 BIT(MEM_RDDR3) +#define MEM_FLAG_LPDDR3 BIT(MEM_LPDDR3) +#define MEM_FLAG_DDR4 BIT(MEM_DDR4) +#define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) +#define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) +#define MEM_FLAG_LPDDR4 BIT(MEM_LPDDR4) +#define MEM_FLAG_DDR5 BIT(MEM_DDR5) +#define MEM_FLAG_RDDR5 BIT(MEM_RDDR5) +#define MEM_FLAG_LRDDR5 BIT(MEM_LRDDR5) +#define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) #define MEM_FLAG_WIO2 BIT(MEM_WIO2) #define MEM_FLAG_HBM2 BIT(MEM_HBM2)