From c43fa3b11e9ff7e6811ac678f196ea6bd97f3e64 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 13 Sep 2019 10:59:39 -0300 Subject: [PATCH 01/35] EDAC: i5100_edac: get rid of an unused var MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As reported by GCC with W=1: drivers/edac/i5100_edac.c:714:16: warning: variable ‘et’ set but not used [-Wunused-but-set-variable] 714 | unsigned long et; | ^~ It sounds some left over from some code before the addition of an udelay(). Acked-by: Borislav Petkov Acked-by: Tony Luck Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/i5100_edac.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c index 251f2b692785..12bebecb203b 100644 --- a/drivers/edac/i5100_edac.c +++ b/drivers/edac/i5100_edac.c @@ -713,7 +713,6 @@ static int i5100_read_spd_byte(const struct mem_ctl_info *mci, { struct i5100_priv *priv = mci->pvt_info; u16 w; - unsigned long et; pci_read_config_word(priv->mc, I5100_SPDDATA, &w); if (i5100_spddata_busy(w)) @@ -724,7 +723,6 @@ static int i5100_read_spd_byte(const struct mem_ctl_info *mci, 0, 0)); /* wait up to 100ms */ - et = jiffies + HZ / 10; udelay(100); while (1) { pci_read_config_word(priv->mc, I5100_SPDDATA, &w); From 9f95c8d5f84ae3e56f8fcfde2f2a2fd4326d7db6 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 13 Sep 2019 11:01:01 -0300 Subject: [PATCH 02/35] EDAC: i7300_edac: rename a kernel-doc var description One var was renamed, but the associated kernel-doc markup still points to the old name. Acked-by: Borislav Petkov Acked-by: Tony Luck Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/i7300_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c index 7bf910d54d11..3d4bd3bf317c 100644 --- a/drivers/edac/i7300_edac.c +++ b/drivers/edac/i7300_edac.c @@ -580,7 +580,7 @@ static void i7300_enable_error_reporting(struct mem_ctl_info *mci) * @ch: Channel number within the branch (0 or 1) * @branch: Branch number (0 or 1) * @dinfo: Pointer to DIMM info where dimm size is stored - * @p_csrow: Pointer to the struct csrow_info that corresponds to that element + * @dimm: Pointer to the struct dimm_info that corresponds to that element */ static int decode_mtr(struct i7300_pvt *pvt, int slot, int ch, int branch, From 48356e0d57783eec766bef60c88553ee53740b3c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 13 Sep 2019 11:03:23 -0300 Subject: [PATCH 03/35] EDAC: i7300_edac: fix a kernel-doc syntax The declaration of the kerneldoc entry is wrong, causing this warning: drivers/edac/i7300_edac.c:824: warning: Function parameter or member 'mir_no' not described in 'decode_mir' Acked-by: Borislav Petkov Acked-by: Tony Luck Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/i7300_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c index 3d4bd3bf317c..447d357c7a67 100644 --- a/drivers/edac/i7300_edac.c +++ b/drivers/edac/i7300_edac.c @@ -817,7 +817,7 @@ static int i7300_init_csrows(struct mem_ctl_info *mci) /** * decode_mir() - Decodes Memory Interleave Register (MIR) info - * @int mir_no: number of the MIR register to decode + * @mir_no: number of the MIR register to decode * @mir: array with the MIR data cached on the driver */ static void decode_mir(int mir_no, u16 mir[MAX_MIR]) From 1acd05e40cb08cd9cb6d11657df696ed582a3691 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 13 Sep 2019 11:07:14 -0300 Subject: [PATCH 04/35] EDAC: i5400_edac: print type at debug message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are 3 types of non-recoverable errors that the MC reports: - Fatal; - Non-fatal uncorrected - Non-fatal correctable While we don't add it to the log itself, it could be useful to have this at least for debug messages. This shuts up this warning: drivers/edac/i5400_edac.c: In function ‘i5400_proccess_non_recoverable_info’: drivers/edac/i5400_edac.c:524:8: warning: variable ‘type’ set but not used [-Wunused-but-set-variable] 524 | char *type = NULL; | ^~~~ Acked-by: Borislav Petkov Acked-by: Tony Luck Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/i5400_edac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c index 6f8bcdb9256a..52783b9bd0df 100644 --- a/drivers/edac/i5400_edac.c +++ b/drivers/edac/i5400_edac.c @@ -548,8 +548,8 @@ static void i5400_proccess_non_recoverable_info(struct mem_ctl_info *mci, ras = nrec_ras(info); cas = nrec_cas(info); - edac_dbg(0, "\t\tDIMM= %d Channels= %d,%d (Branch= %d DRAM Bank= %d Buffer ID = %d rdwr= %s ras= %d cas= %d)\n", - rank, channel, channel + 1, branch >> 1, bank, + edac_dbg(0, "\t\t%s DIMM= %d Channels= %d,%d (Branch= %d DRAM Bank= %d Buffer ID = %d rdwr= %s ras= %d cas= %d)\n", + type, rank, channel, channel + 1, branch >> 1, bank, buf_id, rdwr_str(rdwr), ras, cas); /* Only 1 bit will be on */ From bb66f867812d29a57a9a7d04c43d7f6fb48127ae Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 13 Sep 2019 11:13:41 -0300 Subject: [PATCH 05/35] EDAC: i5400_edac: get rid of some unused vars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are several temporary unused vars: drivers/edac/i5400_edac.c: In function ‘i5400_get_mc_regs’: drivers/edac/i5400_edac.c:1058:6: warning: variable ‘maxdimmperch’ set but not used [-Wunused-but-set-variable] 1058 | int maxdimmperch; | ^~~~~~~~~~~~ drivers/edac/i5400_edac.c:1057:6: warning: variable ‘maxch’ set but not used [-Wunused-but-set-variable] 1057 | int maxch; | ^~~~~ drivers/edac/i5400_edac.c: In function ‘i5400_init_dimms’: drivers/edac/i5400_edac.c:1174:6: warning: variable ‘max_dimms’ set but not used [-Wunused-but-set-variable] 1174 | int max_dimms; | ^~~~~~~~~ drivers/edac/i5400_edac.c:1173:14: warning: variable ‘channel_count’ set but not used [-Wunused-but-set-variable] 1173 | int ndimms, channel_count; | ^~~~~~~~~~~~~ Get rid of them. Acked-by: Borislav Petkov Acked-by: Tony Luck Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/i5400_edac.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c index 52783b9bd0df..8c86c6fd7da7 100644 --- a/drivers/edac/i5400_edac.c +++ b/drivers/edac/i5400_edac.c @@ -1054,8 +1054,6 @@ static void i5400_get_mc_regs(struct mem_ctl_info *mci) u32 actual_tolm; u16 limit; int slot_row; - int maxch; - int maxdimmperch; int way0, way1; pvt = mci->pvt_info; @@ -1065,9 +1063,6 @@ static void i5400_get_mc_regs(struct mem_ctl_info *mci) pci_read_config_dword(pvt->system_address, AMBASE + sizeof(u32), &pvt->u.ambase_top); - maxdimmperch = pvt->maxdimmperch; - maxch = pvt->maxch; - edac_dbg(2, "AMBASE= 0x%lx MAXCH= %d MAX-DIMM-Per-CH= %d\n", (long unsigned int)pvt->ambase, pvt->maxch, pvt->maxdimmperch); @@ -1170,17 +1165,13 @@ static int i5400_init_dimms(struct mem_ctl_info *mci) { struct i5400_pvt *pvt; struct dimm_info *dimm; - int ndimms, channel_count; - int max_dimms; + int ndimms; int mtr; int size_mb; int channel, slot; pvt = mci->pvt_info; - channel_count = pvt->maxch; - max_dimms = pvt->maxdimmperch; - ndimms = 0; /* From 323014d85d2699b2879ecb15cd06a15408e3e801 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 13 Sep 2019 11:17:16 -0300 Subject: [PATCH 06/35] EDAC: sb_edac: get rid of unused vars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are several vars unused on this driver, probably because it was a modified copy of another driver. Get rid of them. drivers/edac/sb_edac.c: In function ‘knl_get_dimm_capacity’: drivers/edac/sb_edac.c:1343:16: warning: variable ‘sad_size’ set but not used [-Wunused-but-set-variable] 1343 | u64 sad_base, sad_size, sad_limit = 0; | ^~~~~~~~ drivers/edac/sb_edac.c: In function ‘sbridge_mce_output_error’: drivers/edac/sb_edac.c:2955:8: warning: variable ‘type’ set but not used [-Wunused-but-set-variable] 2955 | char *type, *optype, msg[256]; | ^~~~ drivers/edac/sb_edac.c: In function ‘sbridge_unregister_mci’: drivers/edac/sb_edac.c:3203:22: warning: variable ‘pvt’ set but not used [-Wunused-but-set-variable] 3203 | struct sbridge_pvt *pvt; | ^~~ At top level: drivers/edac/sb_edac.c:266:18: warning: ‘correrrthrsld’ defined but not used [-Wunused-const-variable=] 266 | static const u32 correrrthrsld[] = { | ^~~~~~~~~~~~~ drivers/edac/sb_edac.c:257:18: warning: ‘correrrcnt’ defined but not used [-Wunused-const-variable=] 257 | static const u32 correrrcnt[] = { | ^~~~~~~~~~ Acked-by: Borislav Petkov Acked-by: Tony Luck Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/sb_edac.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index f743502ca9b7..a2fd39d330d6 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -254,18 +254,20 @@ static const u32 rir_offset[MAX_RIR_RANGES][MAX_RIR_WAY] = { * FIXME: Implement the error count reads directly */ -static const u32 correrrcnt[] = { - 0x104, 0x108, 0x10c, 0x110, -}; - #define RANK_ODD_OV(reg) GET_BITFIELD(reg, 31, 31) #define RANK_ODD_ERR_CNT(reg) GET_BITFIELD(reg, 16, 30) #define RANK_EVEN_OV(reg) GET_BITFIELD(reg, 15, 15) #define RANK_EVEN_ERR_CNT(reg) GET_BITFIELD(reg, 0, 14) +#if 0 /* Currently unused*/ +static const u32 correrrcnt[] = { + 0x104, 0x108, 0x10c, 0x110, +}; + static const u32 correrrthrsld[] = { 0x11c, 0x120, 0x124, 0x128, }; +#endif #define RANK_ODD_ERR_THRSLD(reg) GET_BITFIELD(reg, 16, 30) #define RANK_EVEN_ERR_THRSLD(reg) GET_BITFIELD(reg, 0, 14) @@ -1340,7 +1342,7 @@ static void knl_show_mc_route(u32 reg, char *s) */ static int knl_get_dimm_capacity(struct sbridge_pvt *pvt, u64 *mc_sizes) { - u64 sad_base, sad_size, sad_limit = 0; + u64 sad_base, sad_limit = 0; u64 tad_base, tad_size, tad_limit, tad_deadspace, tad_livespace; int sad_rule = 0; int tad_rule = 0; @@ -1427,7 +1429,6 @@ static int knl_get_dimm_capacity(struct sbridge_pvt *pvt, u64 *mc_sizes) edram_only = KNL_EDRAM_ONLY(dram_rule); sad_limit = pvt->info.sad_limit(dram_rule)+1; - sad_size = sad_limit - sad_base; pci_read_config_dword(pvt->pci_sad0, pvt->info.interleave_list[sad_rule], &interleave_reg); @@ -2952,7 +2953,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, struct mem_ctl_info *new_mci; struct sbridge_pvt *pvt = mci->pvt_info; enum hw_event_mc_err_type tp_event; - char *type, *optype, msg[256]; + char *optype, msg[256]; bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); @@ -2981,14 +2982,11 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, if (uncorrected_error) { core_err_cnt = 1; if (ripv) { - type = "FATAL"; tp_event = HW_EVENT_ERR_FATAL; } else { - type = "NON_FATAL"; tp_event = HW_EVENT_ERR_UNCORRECTED; } } else { - type = "CORRECTED"; tp_event = HW_EVENT_ERR_CORRECTED; } @@ -3200,7 +3198,6 @@ static struct notifier_block sbridge_mce_dec = { static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev) { struct mem_ctl_info *mci = sbridge_dev->mci; - struct sbridge_pvt *pvt; if (unlikely(!mci || !mci->pvt_info)) { edac_dbg(0, "MC: dev = %p\n", &sbridge_dev->pdev[0]->dev); @@ -3209,8 +3206,6 @@ static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev) return; } - pvt = mci->pvt_info; - edac_dbg(0, "MC: mci = %p, dev = %p\n", mci, &sbridge_dev->pdev[0]->dev); From f05390d30e20cccd8f8de981dee42bcdd8d2d137 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 13 Sep 2019 11:23:08 -0300 Subject: [PATCH 07/35] EDAC: skx_common: get rid of unused type var MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/edac/skx_common.c: In function ‘skx_mce_output_error’: drivers/edac/skx_common.c:478:8: warning: variable ‘type’ set but not used [-Wunused-but-set-variable] 478 | char *type, *optype; | ^~~~ Acked-by: Borislav Petkov Acked-by: Tony Luck Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/skx_common.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index d8ff63d91b86..83dd5da67a28 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -475,7 +475,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, struct decoded_addr *res) { enum hw_event_mc_err_type tp_event; - char *type, *optype; + char *optype; bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); @@ -490,14 +490,11 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, if (uncorrected_error) { core_err_cnt = 1; if (ripv) { - type = "FATAL"; tp_event = HW_EVENT_ERR_FATAL; } else { - type = "NON_FATAL"; tp_event = HW_EVENT_ERR_UNCORRECTED; } } else { - type = "CORRECTED"; tp_event = HW_EVENT_ERR_CORRECTED; } From 9816b4af4351bf9d33c2e635b6f14327823a9c2a Mon Sep 17 00:00:00 2001 From: Hanna Hawa Date: Mon, 23 Sep 2019 20:17:40 +0100 Subject: [PATCH 08/35] EDAC/device: Rework error logging API Make the main workhorse the "count" functions which can log a @count of errors. Have the current APIs edac_device_handle_{ce,ue}() call the _count() variants and this way keep the exported symbols number unchanged. [ bp: Rewrite. ] Signed-off-by: Hanna Hawa Signed-off-by: Borislav Petkov Cc: benh@amazon.com Cc: dwmw@amazon.co.uk Cc: hanochu@amazon.com Cc: James Morse Cc: jonnyc@amazon.com Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: ronenk@amazon.com Cc: talel@amazon.com Cc: Tony Luck Link: https://lkml.kernel.org/r/20190923191741.29322-2-hhhawa@amazon.com --- drivers/edac/edac_device.c | 50 ++++++++++++++++++++--------------- drivers/edac/edac_device.h | 54 ++++++++++++++++++++++++++++++-------- 2 files changed, 72 insertions(+), 32 deletions(-) diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index 65cf2b9355c4..8c4d947fb848 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -555,12 +555,16 @@ static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info return edac_dev->panic_on_ue; } -void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev, - int inst_nr, int block_nr, const char *msg) +void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev, + unsigned int count, int inst_nr, int block_nr, + const char *msg) { struct edac_device_instance *instance; struct edac_device_block *block = NULL; + if (!count) + return; + if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) { edac_device_printk(edac_dev, KERN_ERR, "INTERNAL ERROR: 'instance' out of range " @@ -582,27 +586,31 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev, if (instance->nr_blocks > 0) { block = instance->blocks + block_nr; - block->counters.ce_count++; + block->counters.ce_count += count; } /* Propagate the count up the 'totals' tree */ - instance->counters.ce_count++; - edac_dev->counters.ce_count++; + instance->counters.ce_count += count; + edac_dev->counters.ce_count += count; if (edac_device_get_log_ce(edac_dev)) edac_device_printk(edac_dev, KERN_WARNING, - "CE: %s instance: %s block: %s '%s'\n", - edac_dev->ctl_name, instance->name, - block ? block->name : "N/A", msg); + "CE: %s instance: %s block: %s count: %d '%s'\n", + edac_dev->ctl_name, instance->name, + block ? block->name : "N/A", count, msg); } -EXPORT_SYMBOL_GPL(edac_device_handle_ce); +EXPORT_SYMBOL_GPL(edac_device_handle_ce_count); -void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev, - int inst_nr, int block_nr, const char *msg) +void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev, + unsigned int count, int inst_nr, int block_nr, + const char *msg) { struct edac_device_instance *instance; struct edac_device_block *block = NULL; + if (!count) + return; + if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) { edac_device_printk(edac_dev, KERN_ERR, "INTERNAL ERROR: 'instance' out of range " @@ -624,22 +632,22 @@ void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev, if (instance->nr_blocks > 0) { block = instance->blocks + block_nr; - block->counters.ue_count++; + block->counters.ue_count += count; } /* Propagate the count up the 'totals' tree */ - instance->counters.ue_count++; - edac_dev->counters.ue_count++; + instance->counters.ue_count += count; + edac_dev->counters.ue_count += count; if (edac_device_get_log_ue(edac_dev)) edac_device_printk(edac_dev, KERN_EMERG, - "UE: %s instance: %s block: %s '%s'\n", - edac_dev->ctl_name, instance->name, - block ? block->name : "N/A", msg); + "UE: %s instance: %s block: %s count: %d '%s'\n", + edac_dev->ctl_name, instance->name, + block ? block->name : "N/A", count, msg); if (edac_device_get_panic_on_ue(edac_dev)) - panic("EDAC %s: UE instance: %s block %s '%s'\n", - edac_dev->ctl_name, instance->name, - block ? block->name : "N/A", msg); + panic("EDAC %s: UE instance: %s block %s count: %d '%s'\n", + edac_dev->ctl_name, instance->name, + block ? block->name : "N/A", count, msg); } -EXPORT_SYMBOL_GPL(edac_device_handle_ue); +EXPORT_SYMBOL_GPL(edac_device_handle_ue_count); diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h index 1aaba74ae411..c4c0e0bdce14 100644 --- a/drivers/edac/edac_device.h +++ b/drivers/edac/edac_device.h @@ -286,27 +286,60 @@ extern int edac_device_add_device(struct edac_device_ctl_info *edac_dev); extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev); /** - * edac_device_handle_ue(): - * perform a common output and handling of an 'edac_dev' UE event + * Log correctable errors. * * @edac_dev: pointer to struct &edac_device_ctl_info - * @inst_nr: number of the instance where the UE error happened - * @block_nr: number of the block where the UE error happened + * @inst_nr: number of the instance where the CE error happened + * @count: Number of errors to log. + * @block_nr: number of the block where the CE error happened * @msg: message to be printed */ -extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev, - int inst_nr, int block_nr, const char *msg); +void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev, + unsigned int count, int inst_nr, int block_nr, + const char *msg); + /** - * edac_device_handle_ce(): - * perform a common output and handling of an 'edac_dev' CE event + * Log uncorrectable errors. + * + * @edac_dev: pointer to struct &edac_device_ctl_info + * @inst_nr: number of the instance where the CE error happened + * @count: Number of errors to log. + * @block_nr: number of the block where the CE error happened + * @msg: message to be printed + */ +void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev, + unsigned int count, int inst_nr, int block_nr, + const char *msg); + +/** + * edac_device_handle_ce(): Log a single correctable error * * @edac_dev: pointer to struct &edac_device_ctl_info * @inst_nr: number of the instance where the CE error happened * @block_nr: number of the block where the CE error happened * @msg: message to be printed */ -extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev, - int inst_nr, int block_nr, const char *msg); +static inline void +edac_device_handle_ce(struct edac_device_ctl_info *edac_dev, int inst_nr, + int block_nr, const char *msg) +{ + edac_device_handle_ce_count(edac_dev, 1, inst_nr, block_nr, msg); +} + +/** + * edac_device_handle_ue(): Log a single uncorrectable error + * + * @edac_dev: pointer to struct &edac_device_ctl_info + * @inst_nr: number of the instance where the UE error happened + * @block_nr: number of the block where the UE error happened + * @msg: message to be printed + */ +static inline void +edac_device_handle_ue(struct edac_device_ctl_info *edac_dev, int inst_nr, + int block_nr, const char *msg) +{ + edac_device_handle_ue_count(edac_dev, 1, inst_nr, block_nr, msg); +} /** * edac_device_alloc_index: Allocate a unique device index number @@ -316,5 +349,4 @@ extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev, */ extern int edac_device_alloc_index(void); extern const char *edac_layer_name[]; - #endif From 29b8e84fbc23cb2b70317b745641ea0569426872 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 15 Aug 2019 14:18:59 -0700 Subject: [PATCH 09/35] EDAC, skx_common: Refactor so that we initialize "dev" in result of adxl decode. Simplifies the code a little. Acked-by: Aristeu Rozanski Signed-off-by: Tony Luck --- drivers/edac/skx_common.c | 48 +++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 83dd5da67a28..de06d58d7d2a 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -100,6 +100,7 @@ void __exit skx_adxl_put(void) static bool skx_adxl_decode(struct decoded_addr *res) { + struct skx_dev *d; int i, len = 0; if (res->addr >= skx_tohm || (res->addr >= skx_tolm && @@ -118,6 +119,24 @@ static bool skx_adxl_decode(struct decoded_addr *res) res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + if (res->imc > NUM_IMC - 1) { + skx_printk(KERN_ERR, "Bad imc %d\n", res->imc); + return false; + } + + list_for_each_entry(d, &dev_edac_list, list) { + if (d->imc[0].src_id == res->socket) { + res->dev = d; + break; + } + } + + if (!res->dev) { + skx_printk(KERN_ERR, "No device for src_id %d imc %d\n", + res->socket, res->imc); + return false; + } + for (i = 0; i < adxl_component_count; i++) { if (adxl_values[i] == ~0x0ull) continue; @@ -452,24 +471,6 @@ static void skx_unregister_mci(struct skx_imc *imc) edac_mc_free(mci); } -static struct mem_ctl_info *get_mci(int src_id, int lmc) -{ - struct skx_dev *d; - - if (lmc > NUM_IMC - 1) { - skx_printk(KERN_ERR, "Bad lmc %d\n", lmc); - return NULL; - } - - list_for_each_entry(d, &dev_edac_list, list) { - if (d->imc[0].src_id == src_id) - return d->imc[lmc].mci; - } - - skx_printk(KERN_ERR, "No mci for src_id %d lmc %d\n", src_id, lmc); - return NULL; -} - static void skx_mce_output_error(struct mem_ctl_info *mci, const struct mce *m, struct decoded_addr *res) @@ -580,15 +581,12 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, if (adxl_component_count) { if (!skx_adxl_decode(&res)) return NOTIFY_DONE; - - mci = get_mci(res.socket, res.imc); - } else { - if (!skx_decode || !skx_decode(&res)) - return NOTIFY_DONE; - - mci = res.dev->imc[res.imc].mci; + } else if (!skx_decode || !skx_decode(&res)) { + return NOTIFY_DONE; } + mci = res.dev->imc[res.imc].mci; + if (!mci) return NOTIFY_DONE; From e80634a75aba90e7485cd1fdb463fcac5d45f14d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 15 Aug 2019 14:53:28 -0700 Subject: [PATCH 10/35] EDAC, skx: Retrieve and print retry_rd_err_log registers Skylake logs some additional useful information in per-channel registers in addition the the architectural status/addr/misc logged in the machine check bank. Pick up this information and add it to the EDAC log: retry_rd_err_[five 32-bit register values] Sorry, no definitions for these registers. OEMs and DIMM vendors will be able to use them to isolate which cells in the DIMM are causing problems. correrrcnt[per rank corrected error counts] Note that if additional errors are logged while these registers are being read, you may see a jumble of values some from earlier errors, others from later errors (since the registers report the most recent logged error). The correrrcnt registers provide error counts per possible rank. If these counts only change by one since the previous error logged for this channel, then it is safe to assume that the registers logged provide a coherent view of one error. With this change EDAC logs look like this: EDAC MC4: 1 CE memory read error on CPU_SrcID#2_MC#0_Chan#1_DIMM#0 (channel:1 slot:0 page:0x8f26018 offset:0x0 grain:32 syndrome:0x0 - err_code:0x0101:0x0091 socket:2 imc:0 rank:0 bg:0 ba:0 row:0x1f880 col:0x200 retry_rd_err_log[0001a209 00000000 00000001 04800001 0001f880] correrrcnt[0001 0000 0000 0000 0000 0000 0000 0000]) Acked-by: Aristeu Rozanski Signed-off-by: Tony Luck --- drivers/edac/skx_base.c | 51 ++++++++++++++++++++++++++++++++++++--- drivers/edac/skx_common.c | 12 ++++++--- drivers/edac/skx_common.h | 4 ++- 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 0fcf3785e8f3..a8853e724d1f 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -46,7 +46,8 @@ static struct skx_dev *get_skx_dev(struct pci_bus *bus, u8 idx) } enum munittype { - CHAN0, CHAN1, CHAN2, SAD_ALL, UTIL_ALL, SAD + CHAN0, CHAN1, CHAN2, SAD_ALL, UTIL_ALL, SAD, + ERRCHAN0, ERRCHAN1, ERRCHAN2, }; struct munit { @@ -68,6 +69,9 @@ static const struct munit skx_all_munits[] = { { 0x2040, { PCI_DEVFN(10, 0), PCI_DEVFN(12, 0) }, 2, 2, CHAN0 }, { 0x2044, { PCI_DEVFN(10, 4), PCI_DEVFN(12, 4) }, 2, 2, CHAN1 }, { 0x2048, { PCI_DEVFN(11, 0), PCI_DEVFN(13, 0) }, 2, 2, CHAN2 }, + { 0x2043, { PCI_DEVFN(10, 3), PCI_DEVFN(12, 3) }, 2, 2, ERRCHAN0 }, + { 0x2047, { PCI_DEVFN(10, 7), PCI_DEVFN(12, 7) }, 2, 2, ERRCHAN1 }, + { 0x204b, { PCI_DEVFN(11, 3), PCI_DEVFN(13, 3) }, 2, 2, ERRCHAN2 }, { 0x208e, { }, 1, 0, SAD }, { } }; @@ -104,10 +108,18 @@ static int get_all_munits(const struct munit *m) } switch (m->mtype) { - case CHAN0: case CHAN1: case CHAN2: + case CHAN0: + case CHAN1: + case CHAN2: pci_dev_get(pdev); d->imc[i].chan[m->mtype].cdev = pdev; break; + case ERRCHAN0: + case ERRCHAN1: + case ERRCHAN2: + pci_dev_get(pdev); + d->imc[i].chan[m->mtype - ERRCHAN0].edev = pdev; + break; case SAD_ALL: pci_dev_get(pdev); d->sad_all = pdev; @@ -216,6 +228,39 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci) #define SKX_ILV_REMOTE(tgt) (((tgt) & 8) == 0) #define SKX_ILV_TARGET(tgt) ((tgt) & 7) +static void skx_show_retry_rd_err_log(struct decoded_addr *res, + char *msg, int len) +{ + u32 log0, log1, log2, log3, log4; + u32 corr0, corr1, corr2, corr3; + struct pci_dev *edev; + int n; + + edev = res->dev->imc[res->imc].chan[res->channel].edev; + + pci_read_config_dword(edev, 0x154, &log0); + pci_read_config_dword(edev, 0x148, &log1); + pci_read_config_dword(edev, 0x150, &log2); + pci_read_config_dword(edev, 0x15c, &log3); + pci_read_config_dword(edev, 0x114, &log4); + + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x]", + log0, log1, log2, log3, log4); + + pci_read_config_dword(edev, 0x104, &corr0); + pci_read_config_dword(edev, 0x108, &corr1); + pci_read_config_dword(edev, 0x10c, &corr2); + pci_read_config_dword(edev, 0x110, &corr3); + + if (len - n > 0) + snprintf(msg + n, len - n, + " correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]", + corr0 & 0xffff, corr0 >> 16, + corr1 & 0xffff, corr1 >> 16, + corr2 & 0xffff, corr2 >> 16, + corr3 & 0xffff, corr3 >> 16); +} + static bool skx_sad_decode(struct decoded_addr *res) { struct skx_dev *d = list_first_entry(skx_edac_list, typeof(*d), list); @@ -659,7 +704,7 @@ static int __init skx_init(void) } } - skx_set_decode(skx_decode); + skx_set_decode(skx_decode, skx_show_retry_rd_err_log); if (nvdimm_count && skx_adxl_get() == -ENODEV) skx_printk(KERN_NOTICE, "Only decoding DDR4 address!\n"); diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index de06d58d7d2a..95662a4ff4c4 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -37,6 +37,7 @@ static char *adxl_msg; static char skx_msg[MSG_SIZE]; static skx_decode_f skx_decode; +static skx_show_retry_log_f skx_show_retry_rd_err_log; static u64 skx_tolm, skx_tohm; static LIST_HEAD(dev_edac_list); @@ -150,9 +151,10 @@ static bool skx_adxl_decode(struct decoded_addr *res) return true; } -void skx_set_decode(skx_decode_f decode) +void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) { skx_decode = decode; + skx_show_retry_rd_err_log = show_retry_log; } int skx_get_src_id(struct skx_dev *d, int off, u8 *id) @@ -481,6 +483,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); bool recoverable; + int len; u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52); u32 mscod = GET_BITFIELD(m->status, 16, 31); u32 errcode = GET_BITFIELD(m->status, 0, 15); @@ -537,12 +540,12 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, } } if (adxl_component_count) { - snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s", + len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "", mscod, errcode, adxl_msg); } else { - snprintf(skx_msg, MSG_SIZE, + len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:0x%x col:0x%x", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "", @@ -551,6 +554,9 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, res->bank_group, res->bank_address, res->row, res->column); } + if (skx_show_retry_rd_err_log) + skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len); + edac_dbg(0, "%s\n", skx_msg); /* Call the helper to output message */ diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 08cc971a50ea..60d1ea669afd 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -64,6 +64,7 @@ struct skx_dev { u8 src_id, node_id; struct skx_channel { struct pci_dev *cdev; + struct pci_dev *edev; struct skx_dimm { u8 close_pg; u8 bank_xor_enable; @@ -113,10 +114,11 @@ struct decoded_addr { typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci); typedef bool (*skx_decode_f)(struct decoded_addr *res); +typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len); int __init skx_adxl_get(void); void __exit skx_adxl_put(void); -void skx_set_decode(skx_decode_f decode); +void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log); int skx_get_src_id(struct skx_dev *d, int off, u8 *id); int skx_get_node_id(struct skx_dev *d, u8 *id); From 5bbab3cf211b4b70415de05f428fa91fb454aa41 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 21 Sep 2019 18:32:46 +0200 Subject: [PATCH 11/35] EDAC/aspeed: Use devm_platform_ioremap_resource() in aspeed_probe() Simplify this function implementation by using a known wrapper function. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Borislav Petkov Acked-by: Joel Stanley Cc: Andrew Jeffery Cc: James Morse Cc: kernel-janitors@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-aspeed@lists.ozlabs.org Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Robert Richter Cc: Stefan Schaeckeler Cc: Tony Luck Link: https://lkml.kernel.org/r/baabb9e9-a1b2-3a04-9fb6-aa632de5f722@web.de --- drivers/edac/aspeed_edac.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/edac/aspeed_edac.c b/drivers/edac/aspeed_edac.c index 5634437bb39d..09a9e3de9595 100644 --- a/drivers/edac/aspeed_edac.c +++ b/drivers/edac/aspeed_edac.c @@ -281,16 +281,11 @@ static int aspeed_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct edac_mc_layer layers[2]; struct mem_ctl_info *mci; - struct resource *res; void __iomem *regs; u32 reg04; int rc; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENOENT; - - regs = devm_ioremap_resource(dev, res); + regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(regs)) return PTR_ERR(regs); From 466503d6b1b33be46ab87c6090f0ade6c6011cbc Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 22 Oct 2019 20:35:14 +0000 Subject: [PATCH 12/35] EDAC/amd64: Set grain per DIMM The following commit introduced a warning on error reports without a non-zero grain value. 3724ace582d9 ("EDAC/mc: Fix grain_bits calculation") The amd64_edac_mod module does not provide a value, so the warning will be given on the first reported memory error. Set the grain per DIMM to cacheline size (64 bytes). This is the current recommendation. Fixes: 3724ace582d9 ("EDAC/mc: Fix grain_bits calculation") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Robert Richter Cc: Tony Luck Link: https://lkml.kernel.org/r/20191022203448.13962-7-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index c1d4536ae466..cc5e56d752c8 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2936,6 +2936,7 @@ static int init_csrows_df(struct mem_ctl_info *mci) dimm->mtype = pvt->dram_type; dimm->edac_mode = edac_mode; dimm->dtype = dev_type; + dimm->grain = 64; } } @@ -3012,6 +3013,7 @@ static int init_csrows(struct mem_ctl_info *mci) dimm = csrow->channels[j]->dimm; dimm->mtype = pvt->dram_type; dimm->edac_mode = edac_mode; + dimm->grain = 64; } } From 38ddd4d1574530e1447b6ad91d27225d0f7662fb Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 22 Oct 2019 20:35:09 +0000 Subject: [PATCH 13/35] EDAC/amd64: Make struct amd64_family_type global The struct amd64_family_type doesn't change between multiple nodes and instances of the module, so make it global. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Robert Richter Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106012448.243970-2-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index cc5e56d752c8..83c659e38084 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -16,6 +16,8 @@ module_param(ecc_enable_override, int, 0644); static struct msr __percpu *msrs; +static struct amd64_family_type *fam_type; + /* Per-node stuff */ static struct ecc_settings **ecc_stngs; @@ -3280,8 +3282,7 @@ f17h_determine_edac_ctl_cap(struct mem_ctl_info *mci, struct amd64_pvt *pvt) } } -static void setup_mci_misc_attrs(struct mem_ctl_info *mci, - struct amd64_family_type *fam) +static void setup_mci_misc_attrs(struct mem_ctl_info *mci) { struct amd64_pvt *pvt = mci->pvt_info; @@ -3300,7 +3301,7 @@ static void setup_mci_misc_attrs(struct mem_ctl_info *mci, mci->edac_cap = determine_edac_cap(pvt); mci->mod_name = EDAC_MOD_STR; - mci->ctl_name = fam->ctl_name; + mci->ctl_name = fam_type->ctl_name; mci->dev_name = pci_name(pvt->F3); mci->ctl_page_to_phys = NULL; @@ -3314,8 +3315,6 @@ static void setup_mci_misc_attrs(struct mem_ctl_info *mci, */ static struct amd64_family_type *per_family_init(struct amd64_pvt *pvt) { - struct amd64_family_type *fam_type = NULL; - pvt->ext_model = boot_cpu_data.x86_model >> 4; pvt->stepping = boot_cpu_data.x86_stepping; pvt->model = boot_cpu_data.x86_model; @@ -3422,7 +3421,6 @@ static void compute_num_umcs(void) static int init_one_instance(unsigned int nid) { struct pci_dev *F3 = node_to_amd_nb(nid)->misc; - struct amd64_family_type *fam_type = NULL; struct mem_ctl_info *mci = NULL; struct edac_mc_layer layers[2]; struct amd64_pvt *pvt = NULL; @@ -3499,7 +3497,7 @@ static int init_one_instance(unsigned int nid) mci->pvt_info = pvt; mci->pdev = &pvt->F3->dev; - setup_mci_misc_attrs(mci, fam_type); + setup_mci_misc_attrs(mci); if (init_csrows(mci)) mci->edac_cap = EDAC_FLAG_NONE; From 80355a3b2db9d0b713af5169e2cdd7f8fbfdad82 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 22 Oct 2019 20:35:10 +0000 Subject: [PATCH 14/35] EDAC/amd64: Gather hardware information early Split out gathering hardware information from init_one_instance() into a separate function hw_info_get(). This is necessary so that the information can be cached earlier and used to check if memory is populated and if ECC is enabled on a node. Also, define a function hw_info_put() to back out changes made in hw_info_get(). Check for an allocated PCI device (Function 0 for Family 17h or Function 1 for pre-Family 17h) before freeing, since hw_info_put() may be called before PCI siblings are reserved. Drop the family check when freeing pvt->umc. This will be NULL on pre-Family 17h systems. However, kfree() is safe and will check for a NULL pointer before freeing. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Robert Richter Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106012448.243970-3-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 101 +++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 50 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 83c659e38084..6e1c739b7fad 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3418,34 +3418,15 @@ static void compute_num_umcs(void) edac_dbg(1, "Number of UMCs: %x", num_umcs); } -static int init_one_instance(unsigned int nid) +static int hw_info_get(struct amd64_pvt *pvt) { - struct pci_dev *F3 = node_to_amd_nb(nid)->misc; - struct mem_ctl_info *mci = NULL; - struct edac_mc_layer layers[2]; - struct amd64_pvt *pvt = NULL; u16 pci_id1, pci_id2; - int err = 0, ret; - - ret = -ENOMEM; - pvt = kzalloc(sizeof(struct amd64_pvt), GFP_KERNEL); - if (!pvt) - goto err_ret; - - pvt->mc_node_id = nid; - pvt->F3 = F3; - - ret = -EINVAL; - fam_type = per_family_init(pvt); - if (!fam_type) - goto err_free; + int ret = -EINVAL; if (pvt->fam >= 0x17) { pvt->umc = kcalloc(num_umcs, sizeof(struct amd64_umc), GFP_KERNEL); - if (!pvt->umc) { - ret = -ENOMEM; - goto err_free; - } + if (!pvt->umc) + return -ENOMEM; pci_id1 = fam_type->f0_id; pci_id2 = fam_type->f6_id; @@ -3454,21 +3435,37 @@ static int init_one_instance(unsigned int nid) pci_id2 = fam_type->f2_id; } - err = reserve_mc_sibling_devs(pvt, pci_id1, pci_id2); - if (err) - goto err_post_init; + ret = reserve_mc_sibling_devs(pvt, pci_id1, pci_id2); + if (ret) + return ret; read_mc_regs(pvt); + return 0; +} + +static void hw_info_put(struct amd64_pvt *pvt) +{ + if (pvt->F0 || pvt->F1) + free_mc_sibling_devs(pvt); + + kfree(pvt->umc); +} + +static int init_one_instance(struct amd64_pvt *pvt) +{ + struct mem_ctl_info *mci = NULL; + struct edac_mc_layer layers[2]; + int ret = -EINVAL; + /* * We need to determine how many memory channels there are. Then use * that information for calculating the size of the dynamic instance * tables in the 'mci' structure. */ - ret = -EINVAL; pvt->channel_count = pvt->ops->early_channel_count(pvt); if (pvt->channel_count < 0) - goto err_siblings; + return ret; ret = -ENOMEM; layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; @@ -3490,9 +3487,9 @@ static int init_one_instance(unsigned int nid) layers[1].size = 2; layers[1].is_virt_csrow = false; - mci = edac_mc_alloc(nid, ARRAY_SIZE(layers), layers, 0); + mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0); if (!mci) - goto err_siblings; + return ret; mci->pvt_info = pvt; mci->pdev = &pvt->F3->dev; @@ -3505,31 +3502,17 @@ static int init_one_instance(unsigned int nid) ret = -ENODEV; if (edac_mc_add_mc_with_groups(mci, amd64_edac_attr_groups)) { edac_dbg(1, "failed edac_mc_add_mc()\n"); - goto err_add_mc; + edac_mc_free(mci); + return ret; } return 0; - -err_add_mc: - edac_mc_free(mci); - -err_siblings: - free_mc_sibling_devs(pvt); - -err_post_init: - if (pvt->fam >= 0x17) - kfree(pvt->umc); - -err_free: - kfree(pvt); - -err_ret: - return ret; } static int probe_one_instance(unsigned int nid) { struct pci_dev *F3 = node_to_amd_nb(nid)->misc; + struct amd64_pvt *pvt = NULL; struct ecc_settings *s; int ret; @@ -3540,6 +3523,21 @@ static int probe_one_instance(unsigned int nid) ecc_stngs[nid] = s; + pvt = kzalloc(sizeof(struct amd64_pvt), GFP_KERNEL); + if (!pvt) + goto err_settings; + + pvt->mc_node_id = nid; + pvt->F3 = F3; + + fam_type = per_family_init(pvt); + if (!fam_type) + goto err_enable; + + ret = hw_info_get(pvt); + if (ret < 0) + goto err_enable; + if (!ecc_enabled(F3, nid)) { ret = 0; @@ -3556,7 +3554,7 @@ static int probe_one_instance(unsigned int nid) goto err_enable; } - ret = init_one_instance(nid); + ret = init_one_instance(pvt); if (ret < 0) { amd64_err("Error probing instance: %d\n", nid); @@ -3569,6 +3567,10 @@ static int probe_one_instance(unsigned int nid) return ret; err_enable: + hw_info_put(pvt); + kfree(pvt); + +err_settings: kfree(s); ecc_stngs[nid] = NULL; @@ -3595,14 +3597,13 @@ static void remove_one_instance(unsigned int nid) restore_ecc_error_reporting(s, nid, F3); - free_mc_sibling_devs(pvt); - kfree(ecc_stngs[nid]); ecc_stngs[nid] = NULL; /* Free the EDAC CORE resources */ mci->pvt_info = NULL; + hw_info_put(pvt); kfree(pvt); edac_mc_free(mci); } From 5e4c55276ae8758f5789722b384bb2ab3de3a24f Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 22 Oct 2019 20:35:11 +0000 Subject: [PATCH 15/35] EDAC/amd64: Save max number of controllers to family type The maximum number of memory controllers is fixed within a family/model group. In most cases, this has been fixed at 2, but some systems may have up to 8. The struct amd64_family_type already contains family/model-specific information, and this can be used rather than adding model checks to various functions. Create a new field in struct amd64_family_type for max_mcs. Set this when setting other family type information, and use this when needing the maximum number of memory controllers possible for a system. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Robert Richter Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106012448.243970-4-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 44 +++++++++++++-------------------------- drivers/edac/amd64_edac.h | 2 ++ 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 6e1c739b7fad..110ed0d27998 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -21,9 +21,6 @@ static struct amd64_family_type *fam_type; /* Per-node stuff */ static struct ecc_settings **ecc_stngs; -/* Number of Unified Memory Controllers */ -static u8 num_umcs; - /* * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching- @@ -456,7 +453,7 @@ static void get_cs_base_and_mask(struct amd64_pvt *pvt, int csrow, u8 dct, for (i = 0; i < pvt->csels[dct].m_cnt; i++) #define for_each_umc(i) \ - for (i = 0; i < num_umcs; i++) + for (i = 0; i < fam_type->max_mcs; i++) /* * @input_addr is an InputAddr associated with the node given by mci. Return the @@ -2226,6 +2223,7 @@ static struct amd64_family_type family_types[] = { .ctl_name = "K8", .f1_id = PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, .f2_id = PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + .max_mcs = 2, .ops = { .early_channel_count = k8_early_channel_count, .map_sysaddr_to_csrow = k8_map_sysaddr_to_csrow, @@ -2236,6 +2234,7 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F10h", .f1_id = PCI_DEVICE_ID_AMD_10H_NB_MAP, .f2_id = PCI_DEVICE_ID_AMD_10H_NB_DRAM, + .max_mcs = 2, .ops = { .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, @@ -2246,6 +2245,7 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F15h", .f1_id = PCI_DEVICE_ID_AMD_15H_NB_F1, .f2_id = PCI_DEVICE_ID_AMD_15H_NB_F2, + .max_mcs = 2, .ops = { .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, @@ -2256,6 +2256,7 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F15h_M30h", .f1_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F1, .f2_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F2, + .max_mcs = 2, .ops = { .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, @@ -2266,6 +2267,7 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F15h_M60h", .f1_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F1, .f2_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F2, + .max_mcs = 2, .ops = { .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, @@ -2276,6 +2278,7 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F16h", .f1_id = PCI_DEVICE_ID_AMD_16H_NB_F1, .f2_id = PCI_DEVICE_ID_AMD_16H_NB_F2, + .max_mcs = 2, .ops = { .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, @@ -2286,6 +2289,7 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F16h_M30h", .f1_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F1, .f2_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F2, + .max_mcs = 2, .ops = { .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, @@ -2296,6 +2300,7 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h", .f0_id = PCI_DEVICE_ID_AMD_17H_DF_F0, .f6_id = PCI_DEVICE_ID_AMD_17H_DF_F6, + .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, @@ -2305,6 +2310,7 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h_M10h", .f0_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F0, .f6_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F6, + .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, @@ -2314,6 +2320,7 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h_M30h", .f0_id = PCI_DEVICE_ID_AMD_17H_M30H_DF_F0, .f6_id = PCI_DEVICE_ID_AMD_17H_M30H_DF_F6, + .max_mcs = 8, .ops = { .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, @@ -2323,6 +2330,7 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h_M70h", .f0_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F0, .f6_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F6, + .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, @@ -3402,29 +3410,13 @@ static const struct attribute_group *amd64_edac_attr_groups[] = { NULL }; -/* Set the number of Unified Memory Controllers in the system. */ -static void compute_num_umcs(void) -{ - u8 model = boot_cpu_data.x86_model; - - if (boot_cpu_data.x86 < 0x17) - return; - - if (model >= 0x30 && model <= 0x3f) - num_umcs = 8; - else - num_umcs = 2; - - edac_dbg(1, "Number of UMCs: %x", num_umcs); -} - static int hw_info_get(struct amd64_pvt *pvt) { u16 pci_id1, pci_id2; int ret = -EINVAL; if (pvt->fam >= 0x17) { - pvt->umc = kcalloc(num_umcs, sizeof(struct amd64_umc), GFP_KERNEL); + pvt->umc = kcalloc(fam_type->max_mcs, sizeof(struct amd64_umc), GFP_KERNEL); if (!pvt->umc) return -ENOMEM; @@ -3477,14 +3469,8 @@ static int init_one_instance(struct amd64_pvt *pvt) * Always allocate two channels since we can have setups with DIMMs on * only one channel. Also, this simplifies handling later for the price * of a couple of KBs tops. - * - * On Fam17h+, the number of controllers may be greater than two. So set - * the size equal to the maximum number of UMCs. */ - if (pvt->fam >= 0x17) - layers[1].size = num_umcs; - else - layers[1].size = 2; + layers[1].size = fam_type->max_mcs; layers[1].is_virt_csrow = false; mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0); @@ -3669,8 +3655,6 @@ static int __init amd64_edac_init(void) if (!msrs) goto err_free; - compute_num_umcs(); - for (i = 0; i < amd_nb_num(); i++) { err = probe_one_instance(i); if (err) { diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 8c3cda81e619..9be31688110b 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -479,6 +479,8 @@ struct low_ops { struct amd64_family_type { const char *ctl_name; u16 f0_id, f1_id, f2_id, f6_id; + /* Maximum number of memory controllers per die/node. */ + u8 max_mcs; struct low_ops ops; }; From 1c9b08bac5bf4f4825631c885eba84461e4eed79 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 22 Oct 2019 20:35:12 +0000 Subject: [PATCH 16/35] EDAC/amd64: Use cached data when checking for ECC ...now that the data is available earlier. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Robert Richter Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106012448.243970-5-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 110ed0d27998..d38ba7f17753 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3202,31 +3202,27 @@ static const char *ecc_msg = "'ecc_enable_override'.\n" " (Note that use of the override may cause unknown side effects.)\n"; -static bool ecc_enabled(struct pci_dev *F3, u16 nid) +static bool ecc_enabled(struct amd64_pvt *pvt) { + u16 nid = pvt->mc_node_id; bool nb_mce_en = false; u8 ecc_en = 0, i; u32 value; if (boot_cpu_data.x86 >= 0x17) { u8 umc_en_mask = 0, ecc_en_mask = 0; + struct amd64_umc *umc; for_each_umc(i) { - u32 base = get_umc_base(i); + umc = &pvt->umc[i]; /* Only check enabled UMCs. */ - if (amd_smn_read(nid, base + UMCCH_SDP_CTRL, &value)) - continue; - - if (!(value & UMC_SDP_INIT)) + if (!(umc->sdp_ctrl & UMC_SDP_INIT)) continue; umc_en_mask |= BIT(i); - if (amd_smn_read(nid, base + UMCCH_UMC_CAP_HI, &value)) - continue; - - if (value & UMC_ECC_ENABLED) + if (umc->umc_cap_hi & UMC_ECC_ENABLED) ecc_en_mask |= BIT(i); } @@ -3239,7 +3235,7 @@ static bool ecc_enabled(struct pci_dev *F3, u16 nid) /* Assume UMC MCA banks are enabled. */ nb_mce_en = true; } else { - amd64_read_pci_cfg(F3, NBCFG, &value); + amd64_read_pci_cfg(pvt->F3, NBCFG, &value); ecc_en = !!(value & NBCFG_ECC_ENABLE); @@ -3524,7 +3520,7 @@ static int probe_one_instance(unsigned int nid) if (ret < 0) goto err_enable; - if (!ecc_enabled(F3, nid)) { + if (!ecc_enabled(pvt)) { ret = 0; if (!ecc_enable_override) From 582f94b5900a9bccca993dc16ca77364a0aa12a9 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Wed, 6 Nov 2019 01:25:01 +0000 Subject: [PATCH 17/35] EDAC/amd64: Check for memory before fully initializing an instance Return early before checking for ECC if the node does not have any populated memory. Free any cached hardware data before returning. Also, return 0 in this case since this is not a failure. Other nodes may have memory and the module should attempt to load an instance for them. Move printing of hardware information to after the instance is initialized, so that the information is only printed for nodes with memory. Return an error code when ECC is disabled. This check happens after checking for memory. The module should explicitly fail to load if memory is populated on a node and ECC is disabled. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Robert Richter Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106012448.243970-6-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index d38ba7f17753..3aeb5173e200 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2848,8 +2848,6 @@ static void read_mc_regs(struct amd64_pvt *pvt) edac_dbg(1, " DIMM type: %s\n", edac_mem_types[pvt->dram_type]); determine_ecc_sym_sz(pvt); - - dump_misc_regs(pvt); } /* @@ -3491,6 +3489,19 @@ static int init_one_instance(struct amd64_pvt *pvt) return 0; } +static bool instance_has_memory(struct amd64_pvt *pvt) +{ + bool cs_enabled = false; + int cs = 0, dct = 0; + + for (dct = 0; dct < fam_type->max_mcs; dct++) { + for_each_chip_select(cs, dct, pvt) + cs_enabled |= csrow_enabled(cs, dct, pvt); + } + + return cs_enabled; +} + static int probe_one_instance(unsigned int nid) { struct pci_dev *F3 = node_to_amd_nb(nid)->misc; @@ -3520,8 +3531,14 @@ static int probe_one_instance(unsigned int nid) if (ret < 0) goto err_enable; + ret = 0; + if (!instance_has_memory(pvt)) { + amd64_info("Node %d: No DIMMs detected.\n", nid); + goto err_enable; + } + if (!ecc_enabled(pvt)) { - ret = 0; + ret = -ENODEV; if (!ecc_enable_override) goto err_enable; @@ -3546,6 +3563,8 @@ static int probe_one_instance(unsigned int nid) goto err_enable; } + dump_misc_regs(pvt); + return ret; err_enable: From 23f61b9fc5cc10d87f66e50518707eec2a0fbda1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 5 Nov 2019 20:07:51 +0000 Subject: [PATCH 18/35] EDAC/ghes: Fix locking and memory barrier issues The ghes registration and refcount is broken in several ways: * ghes_edac_register() returns with success for a 2nd instance even if a first instance's registration is still running. This is not correct as the first instance may fail later. A subsequent registration may not finish before the first. Parallel registrations must be avoided. * The refcount was increased even if a registration failed. This leads to stale counters preventing the device from being released. * The ghes refcount may not be decremented properly on unregistration. Always decrement the refcount once ghes_edac_unregister() is called to keep the refcount sane. * The ghes_pvt pointer is handed to the irq handler before registration finished. * The mci structure could be freed while the irq handler is running. Fix this by adding a mutex to ghes_edac_register(). This mutex serializes instances to register and unregister. The refcount is only increased if the registration succeeded. This makes sure the refcount is in a consistent state after registering or unregistering a device. Note: A spinlock cannot be used here as the code section may sleep. The ghes_pvt is protected by ghes_lock now. This ensures the pointer is not updated before registration was finished or while the irq handler is running. It is unset before unregistering the device including necessary (implicit) memory barriers making the changes visible to other CPUs. Thus, the device can not be used anymore by an interrupt. Also, rename ghes_init to ghes_refcount for better readability and switch to refcount API. A refcount is needed because there can be multiple GHES structures being defined (see ACPI 6.3 specification, 18.3.2.7 Generic Hardware Error Source, "Some platforms may describe multiple Generic Hardware Error Source structures with different notification types, ..."). Another approach to use the mci's device refcount (get_device()) and have a release function does not work here. A release function will be called only for device_release() with the last put_device() call. The device must be deleted *before* that with device_del(). This is only possible by maintaining an own refcount. [ bp: touchups. ] Fixes: 0fe5f281f749 ("EDAC, ghes: Model a single, logical memory controller") Fixes: 1e72e673b9d1 ("EDAC/ghes: Fix Use after free in ghes_edac remove path") Co-developed-by: James Morse Signed-off-by: James Morse Co-developed-by: Borislav Petkov Signed-off-by: Borislav Petkov Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: Mauro Carvalho Chehab Cc: Tony Luck Link: https://lkml.kernel.org/r/20191105200732.3053-1-rrichter@marvell.com --- drivers/edac/ghes_edac.c | 90 +++++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 24 deletions(-) diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 0bb62857ffb2..f6f6a688c009 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -26,9 +26,18 @@ struct ghes_edac_pvt { char msg[80]; }; -static atomic_t ghes_init = ATOMIC_INIT(0); +static refcount_t ghes_refcount = REFCOUNT_INIT(0); + +/* + * Access to ghes_pvt must be protected by ghes_lock. The spinlock + * also provides the necessary (implicit) memory barrier for the SMP + * case to make the pointer visible on another CPU. + */ static struct ghes_edac_pvt *ghes_pvt; +/* GHES registration mutex */ +static DEFINE_MUTEX(ghes_reg_mutex); + /* * Sync with other, potentially concurrent callers of * ghes_edac_report_mem_error(). We don't know what the @@ -79,9 +88,8 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg) (*num_dimm)++; } -static int get_dimm_smbios_index(u16 handle) +static int get_dimm_smbios_index(struct mem_ctl_info *mci, u16 handle) { - struct mem_ctl_info *mci = ghes_pvt->mci; int i; for (i = 0; i < mci->tot_dimms; i++) { @@ -198,14 +206,11 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) enum hw_event_mc_err_type type; struct edac_raw_error_desc *e; struct mem_ctl_info *mci; - struct ghes_edac_pvt *pvt = ghes_pvt; + struct ghes_edac_pvt *pvt; unsigned long flags; char *p; u8 grain_bits; - if (!pvt) - return; - /* * We can do the locking below because GHES defers error processing * from NMI to IRQ context. Whenever that changes, we'd at least @@ -216,6 +221,10 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) spin_lock_irqsave(&ghes_lock, flags); + pvt = ghes_pvt; + if (!pvt) + goto unlock; + mci = pvt->mci; e = &mci->error_desc; @@ -348,7 +357,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) p += sprintf(p, "DIMM DMI handle: 0x%.4x ", mem_err->mem_dev_handle); - index = get_dimm_smbios_index(mem_err->mem_dev_handle); + index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle); if (index >= 0) { e->top_layer = index; e->enable_per_layer_report = true; @@ -443,6 +452,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) grain_bits, e->syndrome, pvt->detail_location); edac_raw_mc_handle_error(type, mci, e); + +unlock: spin_unlock_irqrestore(&ghes_lock, flags); } @@ -457,10 +468,12 @@ static struct acpi_platform_list plat_list[] = { int ghes_edac_register(struct ghes *ghes, struct device *dev) { bool fake = false; - int rc, num_dimm = 0; + int rc = 0, num_dimm = 0; struct mem_ctl_info *mci; + struct ghes_edac_pvt *pvt; struct edac_mc_layer layers[1]; struct ghes_edac_dimm_fill dimm_fill; + unsigned long flags; int idx = -1; if (IS_ENABLED(CONFIG_X86)) { @@ -472,11 +485,14 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) idx = 0; } + /* finish another registration/unregistration instance first */ + mutex_lock(&ghes_reg_mutex); + /* * We have only one logical memory controller to which all DIMMs belong. */ - if (atomic_inc_return(&ghes_init) > 1) - return 0; + if (refcount_inc_not_zero(&ghes_refcount)) + goto unlock; /* Get the number of DIMMs */ dmi_walk(ghes_edac_count_dimms, &num_dimm); @@ -494,12 +510,13 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_edac_pvt)); if (!mci) { pr_info("Can't allocate memory for EDAC data\n"); - return -ENOMEM; + rc = -ENOMEM; + goto unlock; } - ghes_pvt = mci->pvt_info; - ghes_pvt->ghes = ghes; - ghes_pvt->mci = mci; + pvt = mci->pvt_info; + pvt->ghes = ghes; + pvt->mci = mci; mci->pdev = dev; mci->mtype_cap = MEM_FLAG_EMPTY; @@ -541,23 +558,48 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) if (rc < 0) { pr_info("Can't register at EDAC core\n"); edac_mc_free(mci); - return -ENODEV; + rc = -ENODEV; + goto unlock; } - return 0; + + spin_lock_irqsave(&ghes_lock, flags); + ghes_pvt = pvt; + spin_unlock_irqrestore(&ghes_lock, flags); + + /* only increment on success */ + refcount_inc(&ghes_refcount); + +unlock: + mutex_unlock(&ghes_reg_mutex); + + return rc; } void ghes_edac_unregister(struct ghes *ghes) { struct mem_ctl_info *mci; + unsigned long flags; - if (!ghes_pvt) - return; + mutex_lock(&ghes_reg_mutex); - if (atomic_dec_return(&ghes_init)) - return; + if (!refcount_dec_and_test(&ghes_refcount)) + goto unlock; - mci = ghes_pvt->mci; + /* + * Wait for the irq handler being finished. + */ + spin_lock_irqsave(&ghes_lock, flags); + mci = ghes_pvt ? ghes_pvt->mci : NULL; ghes_pvt = NULL; - edac_mc_del_mc(mci->pdev); - edac_mc_free(mci); + spin_unlock_irqrestore(&ghes_lock, flags); + + if (!mci) + goto unlock; + + mci = edac_mc_del_mc(mci->pdev); + if (mci) + edac_mc_free(mci); + +unlock: + mutex_unlock(&ghes_reg_mutex); } From 7fdfee926be74dbde254308fd538310f11f6314b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 9 Nov 2019 10:00:54 +0100 Subject: [PATCH 19/35] EDAC/amd64: Get rid of the ECC disabled long message This message keeps flooding dmesg on boxes where ECC is disabled or the DIMMs do not support ECC but the module gets auto-probed. What's even worse is that autoprobing happens on every CPU due to the CPU-family matching the driver does and uevent being generated for each CPU device. What is more, this message is becoming even more useless on newer systems where forcing ECC is not recommended and it should be done in the BIOS so the BIOS can do all the necessary work, i.e., just setting a bit in an MSR is not enough anymore. So get rid of it. Signed-off-by: Borislav Petkov Cc: Yazen Ghannam Cc: linux-edac@vger.kernel.org Link: https://lkml.kernel.org/r/20191106160607.GC28380@zn.tnic --- drivers/edac/amd64_edac.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 3aeb5173e200..428ce98f6776 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3188,18 +3188,6 @@ static void restore_ecc_error_reporting(struct ecc_settings *s, u16 nid, amd64_warn("Error restoring NB MCGCTL settings!\n"); } -/* - * EDAC requires that the BIOS have ECC enabled before - * taking over the processing of ECC errors. A command line - * option allows to force-enable hardware ECC later in - * enable_ecc_error_reporting(). - */ -static const char *ecc_msg = - "ECC disabled in the BIOS or no ECC capability, module will not load.\n" - " Either enable ECC checking or force module loading by setting " - "'ecc_enable_override'.\n" - " (Note that use of the override may cause unknown side effects.)\n"; - static bool ecc_enabled(struct amd64_pvt *pvt) { u16 nid = pvt->mc_node_id; @@ -3246,11 +3234,10 @@ static bool ecc_enabled(struct amd64_pvt *pvt) amd64_info("Node %d: DRAM ECC %s.\n", nid, (ecc_en ? "enabled" : "disabled")); - if (!ecc_en || !nb_mce_en) { - amd64_info("%s", ecc_msg); + if (!ecc_en || !nb_mce_en) return false; - } - return true; + else + return true; } static inline void From bc9ad9e40dbc4c8874e806345df393a9cfeadad3 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:02 +0000 Subject: [PATCH 20/35] EDAC: Replace EDAC_DIMM_PTR() macro with edac_get_dimm() function The EDAC_DIMM_PTR() macro takes 3 arguments from struct mem_ctl_info. Clean up this interface to only pass the mci struct and replace this macro with a new function edac_get_dimm(). Also introduce an edac_get_dimm_by_index() function for later use. This allows it to get a DIMM pointer only by a given index. This can be useful if the DIMM's position within the layers of the memory controller or the exact size of the layers are unknown. Small style changes made for some hunks after applying the semantic patch. Semantic patch used: @@ expression mci, a, b,c; @@ -EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, a, b, c) +edac_get_dimm(mci, a, b, c) [ bp: Touchups. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Jason Baron Cc: Qiuxu Zhuo Cc: Tero Kristo Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-2-rrichter@marvell.com --- drivers/edac/ghes_edac.c | 7 +-- drivers/edac/i10nm_base.c | 3 +- drivers/edac/i3200_edac.c | 3 +- drivers/edac/i5000_edac.c | 5 +-- drivers/edac/i5100_edac.c | 3 +- drivers/edac/i5400_edac.c | 3 +- drivers/edac/i7300_edac.c | 3 +- drivers/edac/i7core_edac.c | 3 +- drivers/edac/ie31200_edac.c | 7 +-- drivers/edac/pnd2_edac.c | 4 +- drivers/edac/sb_edac.c | 2 +- drivers/edac/skx_base.c | 3 +- drivers/edac/ti_edac.c | 2 +- include/linux/edac.h | 89 ++++++++++++++++++++++++------------- 14 files changed, 74 insertions(+), 63 deletions(-) diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index f6f6a688c009..2275d6bf6bc5 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -106,9 +106,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg) if (dh->type == DMI_ENTRY_MEM_DEVICE) { struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh; - struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, - mci->n_layers, - dimm_fill->count, 0, 0); + struct dimm_info *dimm = edac_get_dimm(mci, dimm_fill->count, 0, 0); u16 rdr_mask = BIT(7) | BIT(13); if (entry->size == 0xffff) { @@ -544,8 +542,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) dimm_fill.mci = mci; dmi_walk(ghes_edac_dmidecode, &dimm_fill); } else { - struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, - mci->n_layers, 0, 0, 0); + struct dimm_info *dimm = edac_get_dimm(mci, 0, 0, 0); dimm->nr_pages = 1; dimm->grain = 128; diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index c370d5457e6b..059eccf0582b 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -154,8 +154,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci) ndimms = 0; for (j = 0; j < I10NM_NUM_DIMMS; j++) { - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, - mci->n_layers, i, j, 0); + dimm = edac_get_dimm(mci, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i, j); edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n", diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c index 299b441647cd..432b375a4075 100644 --- a/drivers/edac/i3200_edac.c +++ b/drivers/edac/i3200_edac.c @@ -392,8 +392,7 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx) unsigned long nr_pages; for (j = 0; j < nr_channels; j++) { - struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, - mci->n_layers, i, j, 0); + struct dimm_info *dimm = edac_get_dimm(mci, i, j, 0); nr_pages = drb_to_nr_pages(drbs, stacked, j, i); if (nr_pages == 0) diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c index 078a7351bf05..1a6f69c859ab 100644 --- a/drivers/edac/i5000_edac.c +++ b/drivers/edac/i5000_edac.c @@ -1275,9 +1275,8 @@ static int i5000_init_csrows(struct mem_ctl_info *mci) if (!MTR_DIMMS_PRESENT(mtr)) continue; - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, - channel / MAX_BRANCHES, - channel % MAX_BRANCHES, slot); + dimm = edac_get_dimm(mci, channel / MAX_BRANCHES, + channel % MAX_BRANCHES, slot); csrow_megs = pvt->dimm_info[slot][channel].megabytes; dimm->grain = 8; diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c index 12bebecb203b..134586753311 100644 --- a/drivers/edac/i5100_edac.c +++ b/drivers/edac/i5100_edac.c @@ -858,8 +858,7 @@ static void i5100_init_csrows(struct mem_ctl_info *mci) if (!npages) continue; - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, - chan, rank, 0); + dimm = edac_get_dimm(mci, chan, rank, 0); dimm->nr_pages = npages; dimm->grain = 32; diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c index 8c86c6fd7da7..f131c05ade9f 100644 --- a/drivers/edac/i5400_edac.c +++ b/drivers/edac/i5400_edac.c @@ -1187,8 +1187,7 @@ static int i5400_init_dimms(struct mem_ctl_info *mci) if (!MTR_DIMMS_PRESENT(mtr)) continue; - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, - channel / 2, channel % 2, slot); + dimm = edac_get_dimm(mci, channel / 2, channel % 2, slot); size_mb = pvt->dimm_info[slot][channel].megabytes; diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c index 447d357c7a67..2e9bbe56cde9 100644 --- a/drivers/edac/i7300_edac.c +++ b/drivers/edac/i7300_edac.c @@ -794,8 +794,7 @@ static int i7300_init_csrows(struct mem_ctl_info *mci) for (ch = 0; ch < max_channel; ch++) { int channel = to_channel(ch, branch); - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, - mci->n_layers, branch, ch, slot); + dimm = edac_get_dimm(mci, branch, ch, slot); dinfo = &pvt->dimm_info[slot][channel]; diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index a71cca6eeb33..b3135b208f9a 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -585,8 +585,7 @@ static int get_dimm_config(struct mem_ctl_info *mci) if (!DIMM_PRESENT(dimm_dod[j])) continue; - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, - i, j, 0); + dimm = edac_get_dimm(mci, i, j, 0); banks = numbank(MC_DOD_NUMBANK(dimm_dod[j])); ranks = numrank(MC_DOD_NUMRANK(dimm_dod[j])); rows = numrow(MC_DOD_NUMROW(dimm_dod[j])); diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c index d26300f9cb07..4f65073f230b 100644 --- a/drivers/edac/ie31200_edac.c +++ b/drivers/edac/ie31200_edac.c @@ -490,9 +490,7 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx) if (dimm_info[j][i].dual_rank) { nr_pages = nr_pages / 2; - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, - mci->n_layers, (i * 2) + 1, - j, 0); + dimm = edac_get_dimm(mci, (i * 2) + 1, j, 0); dimm->nr_pages = nr_pages; edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages); dimm->grain = 8; /* just a guess */ @@ -503,8 +501,7 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx) dimm->dtype = DEV_UNKNOWN; dimm->edac_mode = EDAC_UNKNOWN; } - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, - mci->n_layers, i * 2, j, 0); + dimm = edac_get_dimm(mci, i * 2, j, 0); dimm->nr_pages = nr_pages; edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages); dimm->grain = 8; /* same guess */ diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index b1193be1ef1d..933f7722b893 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -1231,7 +1231,7 @@ static void apl_get_dimm_config(struct mem_ctl_info *mci) if (!(chan_mask & BIT(i))) continue; - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, 0, 0); + dimm = edac_get_dimm(mci, i, 0, 0); if (!dimm) { edac_dbg(0, "No allocated DIMM for channel %d\n", i); continue; @@ -1311,7 +1311,7 @@ static void dnv_get_dimm_config(struct mem_ctl_info *mci) if (!ranks_of_dimm[j]) continue; - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0); + dimm = edac_get_dimm(mci, i, j, 0); if (!dimm) { edac_dbg(0, "No allocated DIMM for channel %d DIMM %d\n", i, j); continue; diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index a2fd39d330d6..4957e8ee1879 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1621,7 +1621,7 @@ static int __populate_dimms(struct mem_ctl_info *mci, } for (j = 0; j < max_dimms_per_channel; j++) { - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0); + dimm = edac_get_dimm(mci, i, j, 0); if (pvt->info.type == KNIGHTS_LANDING) { pci_read_config_dword(pvt->knl.pci_channel[i], knl_mtr_reg, &mtr); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index a8853e724d1f..83545b4facb7 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -189,8 +189,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci) pci_read_config_dword(imc->chan[i].cdev, 0x8C, &amap); pci_read_config_dword(imc->chan[i].cdev, 0x400, &mcddrtcfg); for (j = 0; j < SKX_NUM_DIMMS; j++) { - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, - mci->n_layers, i, j, 0); + dimm = edac_get_dimm(mci, i, j, 0); pci_read_config_dword(imc->chan[i].cdev, 0x80 + 4 * j, &mtr); if (IS_DIMM_PRESENT(mtr)) { diff --git a/drivers/edac/ti_edac.c b/drivers/edac/ti_edac.c index 6ac26d1b929f..8be3e89a510e 100644 --- a/drivers/edac/ti_edac.c +++ b/drivers/edac/ti_edac.c @@ -135,7 +135,7 @@ static void ti_edac_setup_dimm(struct mem_ctl_info *mci, u32 type) u32 val; u32 memsize; - dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, 0, 0, 0); + dimm = edac_get_dimm(mci, 0, 0, 0); val = ti_edac_readl(edac, EMIF_SDRAM_CONFIG); diff --git a/include/linux/edac.h b/include/linux/edac.h index c19483b90079..280d109b9d05 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -403,37 +403,6 @@ struct edac_mc_layer { __i; \ }) -/** - * EDAC_DIMM_PTR - Macro responsible to get a pointer inside a pointer array - * for the element given by [layer0,layer1,layer2] position - * - * @layers: a struct edac_mc_layer array, describing how many elements - * were allocated for each layer - * @var: name of the var where we want to get the pointer - * (like mci->dimms) - * @nlayers: Number of layers at the @layers array - * @layer0: layer0 position - * @layer1: layer1 position. Unused if n_layers < 2 - * @layer2: layer2 position. Unused if n_layers < 3 - * - * For 1 layer, this macro returns "var[layer0]"; - * - * For 2 layers, this macro is similar to allocate a bi-dimensional array - * and to return "var[layer0][layer1]"; - * - * For 3 layers, this macro is similar to allocate a tri-dimensional array - * and to return "var[layer0][layer1][layer2]"; - */ -#define EDAC_DIMM_PTR(layers, var, nlayers, layer0, layer1, layer2) ({ \ - typeof(*var) __p; \ - int ___i = EDAC_DIMM_OFF(layers, nlayers, layer0, layer1, layer2); \ - if (___i < 0) \ - __p = NULL; \ - else \ - __p = (var)[___i]; \ - __p; \ -}) - struct dimm_info { struct device dev; @@ -669,4 +638,60 @@ struct mem_ctl_info { bool fake_inject_ue; u16 fake_inject_count; }; -#endif + +/** + * edac_get_dimm_by_index - Get DIMM info at @index from a memory + * controller + * + * @mci: MC descriptor struct mem_ctl_info + * @index: index in the memory controller's DIMM array + * + * Returns a struct dimm_info * or NULL on failure. + */ +static inline struct dimm_info * +edac_get_dimm_by_index(struct mem_ctl_info *mci, int index) +{ + if (index < 0 || index >= mci->tot_dimms) + return NULL; + + return mci->dimms[index]; +} + +/** + * edac_get_dimm - Get DIMM info from a memory controller given by + * [layer0,layer1,layer2] position + * + * @mci: MC descriptor struct mem_ctl_info + * @layer0: layer0 position + * @layer1: layer1 position. Unused if n_layers < 2 + * @layer2: layer2 position. Unused if n_layers < 3 + * + * For 1 layer, this function returns "dimms[layer0]"; + * + * For 2 layers, this function is similar to allocating a two-dimensional + * array and returning "dimms[layer0][layer1]"; + * + * For 3 layers, this function is similar to allocating a tri-dimensional + * array and returning "dimms[layer0][layer1][layer2]"; + */ +static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci, + int layer0, int layer1, int layer2) +{ + int index; + + if (layer0 < 0 + || (mci->n_layers > 1 && layer1 < 0) + || (mci->n_layers > 2 && layer2 < 0)) + return NULL; + + index = layer0; + + if (mci->n_layers > 1) + index = index * mci->layers[1].size + layer1; + + if (mci->n_layers > 2) + index = index * mci->layers[2].size + layer2; + + return edac_get_dimm_by_index(mci, index); +} +#endif /* _LINUX_EDAC_H_ */ From 977b1ce7c117905b3138dc727ed25f8af2ba2902 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:04 +0000 Subject: [PATCH 21/35] EDAC: Remove EDAC_DIMM_OFF() macro The EDAC_DIMM_OFF() macro takes 5 arguments to get the DIMM's index. Simplify this by storing the index in struct dimm_info to avoid its calculation and remove the EDAC_DIMM_OFF() macro. The index can be directly used then. Another advantage is that edac_mc_alloc() could be used even if the exact size of the layers is unknown. Only the number of DIMMs would be needed. Rename iterator variable to idx, while at it. The name is more handy, esp. when searching for it in the code. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-3-rrichter@marvell.com --- drivers/edac/edac_mc.c | 30 +++++++++++------------- drivers/edac/edac_mc_sysfs.c | 16 ++----------- include/linux/edac.h | 45 ++++-------------------------------- 3 files changed, 20 insertions(+), 71 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index e6fd079783bd..cb5356411251 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -314,25 +314,27 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, struct dimm_info *dimm; u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; unsigned int pos[EDAC_MAX_LAYERS]; - unsigned int size, tot_dimms = 1, count = 1; + unsigned int idx, size, tot_dimms = 1, count = 1; unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0; void *pvt, *p, *ptr = NULL; - int i, j, row, chn, n, len, off; + int i, j, row, chn, n, len; bool per_rank = false; BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0); + /* * Calculate the total amount of dimms and csrows/cschannels while * in the old API emulation mode */ - for (i = 0; i < n_layers; i++) { - tot_dimms *= layers[i].size; - if (layers[i].is_virt_csrow) - tot_csrows *= layers[i].size; - else - tot_channels *= layers[i].size; + for (idx = 0; idx < n_layers; idx++) { + tot_dimms *= layers[idx].size; - if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT) + if (layers[idx].is_virt_csrow) + tot_csrows *= layers[idx].size; + else + tot_channels *= layers[idx].size; + + if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT) per_rank = true; } @@ -425,19 +427,15 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, memset(&pos, 0, sizeof(pos)); row = 0; chn = 0; - for (i = 0; i < tot_dimms; i++) { + for (idx = 0; idx < tot_dimms; idx++) { chan = mci->csrows[row]->channels[chn]; - off = EDAC_DIMM_OFF(layer, n_layers, pos[0], pos[1], pos[2]); - if (off < 0 || off >= tot_dimms) { - edac_mc_printk(mci, KERN_ERR, "EDAC core bug: EDAC_DIMM_OFF is trying to do an illegal data access\n"); - goto error; - } dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL); if (!dimm) goto error; - mci->dimms[off] = dimm; + mci->dimms[idx] = dimm; dimm->mci = mci; + dimm->idx = idx; /* * Copy DIMM location and initialize it. diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 32d016f1ecd1..91e4c8f155af 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -557,14 +557,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev, { struct dimm_info *dimm = to_dimm(dev); u32 count; - int off; - off = EDAC_DIMM_OFF(dimm->mci->layers, - dimm->mci->n_layers, - dimm->location[0], - dimm->location[1], - dimm->location[2]); - count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][off]; + count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx]; return sprintf(data, "%u\n", count); } @@ -574,14 +568,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev, { struct dimm_info *dimm = to_dimm(dev); u32 count; - int off; - off = EDAC_DIMM_OFF(dimm->mci->layers, - dimm->mci->n_layers, - dimm->location[0], - dimm->location[1], - dimm->location[2]); - count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][off]; + count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx]; return sprintf(data, "%u\n", count); } diff --git a/include/linux/edac.h b/include/linux/edac.h index 280d109b9d05..f4ebb14bc406 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -362,47 +362,6 @@ struct edac_mc_layer { */ #define EDAC_MAX_LAYERS 3 -/** - * EDAC_DIMM_OFF - Macro responsible to get a pointer offset inside a pointer - * array for the element given by [layer0,layer1,layer2] - * position - * - * @layers: a struct edac_mc_layer array, describing how many elements - * were allocated for each layer - * @nlayers: Number of layers at the @layers array - * @layer0: layer0 position - * @layer1: layer1 position. Unused if n_layers < 2 - * @layer2: layer2 position. Unused if n_layers < 3 - * - * For 1 layer, this macro returns "var[layer0] - var"; - * - * For 2 layers, this macro is similar to allocate a bi-dimensional array - * and to return "var[layer0][layer1] - var"; - * - * For 3 layers, this macro is similar to allocate a tri-dimensional array - * and to return "var[layer0][layer1][layer2] - var". - * - * A loop could be used here to make it more generic, but, as we only have - * 3 layers, this is a little faster. - * - * By design, layers can never be 0 or more than 3. If that ever happens, - * a NULL is returned, causing an OOPS during the memory allocation routine, - * with would point to the developer that he's doing something wrong. - */ -#define EDAC_DIMM_OFF(layers, nlayers, layer0, layer1, layer2) ({ \ - int __i; \ - if ((nlayers) == 1) \ - __i = layer0; \ - else if ((nlayers) == 2) \ - __i = (layer1) + ((layers[1]).size * (layer0)); \ - else if ((nlayers) == 3) \ - __i = (layer2) + ((layers[2]).size * ((layer1) + \ - ((layers[1]).size * (layer0)))); \ - else \ - __i = -EINVAL; \ - __i; \ -}) - struct dimm_info { struct device dev; @@ -412,6 +371,7 @@ struct dimm_info { unsigned int location[EDAC_MAX_LAYERS]; struct mem_ctl_info *mci; /* the parent */ + unsigned int idx; /* index within the parent dimm array */ u32 grain; /* granularity of reported error in bytes */ enum dev_type dtype; /* memory device type */ @@ -654,6 +614,9 @@ edac_get_dimm_by_index(struct mem_ctl_info *mci, int index) if (index < 0 || index >= mci->tot_dimms) return NULL; + if (WARN_ON_ONCE(mci->dimms[index]->idx != index)) + return NULL; + return mci->dimms[index]; } From c498afaf7df87f44e7cb383c135baec52b5259be Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:07 +0000 Subject: [PATCH 22/35] EDAC: Introduce an mci_for_each_dimm() iterator Introduce an mci_for_each_dimm() iterator. It returns a pointer to a struct dimm_info. This makes the declaration and use of an index obsolete and avoids access to internal data of struct mci (direct array access etc). [ bp: push the struct dimm_info *dimm; declaration into the CONFIG_EDAC_DEBUG block. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-4-rrichter@marvell.com --- drivers/edac/edac_mc.c | 19 +++++++++++-------- drivers/edac/edac_mc_sysfs.c | 29 ++++++++++++----------------- drivers/edac/ghes_edac.c | 9 +++++---- drivers/edac/i5100_edac.c | 13 +++++-------- include/linux/edac.h | 7 +++++++ 5 files changed, 40 insertions(+), 37 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index cb5356411251..88e24ece3e1e 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -145,15 +145,18 @@ static void edac_mc_dump_channel(struct rank_info *chan) edac_dbg(4, " channel->dimm = %p\n", chan->dimm); } -static void edac_mc_dump_dimm(struct dimm_info *dimm, int number) +static void edac_mc_dump_dimm(struct dimm_info *dimm) { char location[80]; + if (!dimm->nr_pages) + return; + edac_dimm_info_location(dimm, location, sizeof(location)); edac_dbg(4, "%s%i: %smapped as virtual row %d, chan %d\n", dimm->mci->csbased ? "rank" : "dimm", - number, location, dimm->csrow, dimm->cschannel); + dimm->idx, location, dimm->csrow, dimm->cschannel); edac_dbg(4, " dimm = %p\n", dimm); edac_dbg(4, " dimm->label = '%s'\n", dimm->label); edac_dbg(4, " dimm->nr_pages = 0x%x\n", dimm->nr_pages); @@ -712,6 +715,7 @@ int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci, edac_mc_dump_mci(mci); if (edac_debug_level >= 4) { + struct dimm_info *dimm; int i; for (i = 0; i < mci->nr_csrows; i++) { @@ -728,9 +732,9 @@ int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci, if (csrow->channels[j]->dimm->nr_pages) edac_mc_dump_channel(csrow->channels[j]); } - for (i = 0; i < mci->tot_dimms; i++) - if (mci->dimms[i]->nr_pages) - edac_mc_dump_dimm(mci->dimms[i], i); + + mci_for_each_dimm(mci, dimm) + edac_mc_dump_dimm(dimm); } #endif mutex_lock(&mem_ctls_mutex); @@ -1088,6 +1092,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, const char *msg, const char *other_detail) { + struct dimm_info *dimm; char *p; int row = -1, chan = -1; int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer }; @@ -1148,9 +1153,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, p = e->label; *p = '\0'; - for (i = 0; i < mci->tot_dimms; i++) { - struct dimm_info *dimm = mci->dimms[i]; - + mci_for_each_dimm(mci, dimm) { if (top_layer >= 0 && top_layer != dimm->location[0]) continue; if (mid_layer >= 0 && mid_layer != dimm->location[1]) diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 91e4c8f155af..0367554e7437 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -621,8 +621,7 @@ static const struct device_type dimm_attr_type = { /* Create a DIMM object under specifed memory controller device */ static int edac_create_dimm_object(struct mem_ctl_info *mci, - struct dimm_info *dimm, - int index) + struct dimm_info *dimm) { int err; dimm->mci = mci; @@ -632,9 +631,9 @@ static int edac_create_dimm_object(struct mem_ctl_info *mci, dimm->dev.parent = &mci->dev; if (mci->csbased) - dev_set_name(&dimm->dev, "rank%d", index); + dev_set_name(&dimm->dev, "rank%d", dimm->idx); else - dev_set_name(&dimm->dev, "dimm%d", index); + dev_set_name(&dimm->dev, "dimm%d", dimm->idx); dev_set_drvdata(&dimm->dev, dimm); pm_runtime_forbid(&mci->dev); @@ -916,7 +915,8 @@ static const struct device_type mci_attr_type = { int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, const struct attribute_group **groups) { - int i, err; + struct dimm_info *dimm; + int err; /* get the /sys/devices/system/edac subsys reference */ mci->dev.type = &mci_attr_type; @@ -940,13 +940,12 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, /* * Create the dimm/rank devices */ - for (i = 0; i < mci->tot_dimms; i++) { - struct dimm_info *dimm = mci->dimms[i]; + mci_for_each_dimm(mci, dimm) { /* Only expose populated DIMMs */ if (!dimm->nr_pages) continue; - err = edac_create_dimm_object(mci, dimm, i); + err = edac_create_dimm_object(mci, dimm); if (err) goto fail_unregister_dimm; } @@ -961,12 +960,9 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, return 0; fail_unregister_dimm: - for (i--; i >= 0; i--) { - struct dimm_info *dimm = mci->dimms[i]; - if (!dimm->nr_pages) - continue; - - device_unregister(&dimm->dev); + mci_for_each_dimm(mci, dimm) { + if (device_is_registered(&dimm->dev)) + device_unregister(&dimm->dev); } device_unregister(&mci->dev); @@ -978,7 +974,7 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, */ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci) { - int i; + struct dimm_info *dimm; edac_dbg(0, "\n"); @@ -989,8 +985,7 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci) edac_delete_csrow_objects(mci); #endif - for (i = 0; i < mci->tot_dimms; i++) { - struct dimm_info *dimm = mci->dimms[i]; + mci_for_each_dimm(mci, dimm) { if (dimm->nr_pages == 0) continue; edac_dbg(1, "unregistering device %s\n", dev_name(&dimm->dev)); diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 2275d6bf6bc5..fbc8b1de3ddd 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -90,12 +90,13 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg) static int get_dimm_smbios_index(struct mem_ctl_info *mci, u16 handle) { - int i; + struct dimm_info *dimm; - for (i = 0; i < mci->tot_dimms; i++) { - if (mci->dimms[i]->smbios_handle == handle) - return i; + mci_for_each_dimm(mci, dimm) { + if (dimm->smbios_handle == handle) + return dimm->idx; } + return -1; } diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c index 134586753311..0ddc41e47a96 100644 --- a/drivers/edac/i5100_edac.c +++ b/drivers/edac/i5100_edac.c @@ -846,20 +846,17 @@ static void i5100_init_interleaving(struct pci_dev *pdev, static void i5100_init_csrows(struct mem_ctl_info *mci) { - int i; struct i5100_priv *priv = mci->pvt_info; + struct dimm_info *dimm; - for (i = 0; i < mci->tot_dimms; i++) { - struct dimm_info *dimm; - const unsigned long npages = i5100_npages(mci, i); - const unsigned int chan = i5100_csrow_to_chan(mci, i); - const unsigned int rank = i5100_csrow_to_rank(mci, i); + mci_for_each_dimm(mci, dimm) { + const unsigned long npages = i5100_npages(mci, dimm->idx); + const unsigned int chan = i5100_csrow_to_chan(mci, dimm->idx); + const unsigned int rank = i5100_csrow_to_rank(mci, dimm->idx); if (!npages) continue; - dimm = edac_get_dimm(mci, chan, rank, 0); - dimm->nr_pages = npages; dimm->grain = 32; dimm->dtype = (priv->mtr[chan][rank].width == 4) ? diff --git a/include/linux/edac.h b/include/linux/edac.h index f4ebb14bc406..31f99d48b024 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -599,6 +599,13 @@ struct mem_ctl_info { u16 fake_inject_count; }; +#define mci_for_each_dimm(mci, dimm) \ + for ((dimm) = (mci)->dimms[0]; \ + (dimm); \ + (dimm) = (dimm)->idx + 1 < (mci)->tot_dimms \ + ? (mci)->dimms[(dimm)->idx + 1] \ + : NULL) + /** * edac_get_dimm_by_index - Get DIMM info at @index from a memory * controller From d260e8ff5195d3080cc11ae1f35cf4e6eb1a5d24 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:09 +0000 Subject: [PATCH 23/35] EDAC/mc: Do not BUG_ON() in edac_mc_alloc() No need to crash the system in case edac_mc_alloc() is called with invalid arguments, just warn and return. This would cause a checkpatch warning when touching the code later, so just fix it. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-5-rrichter@marvell.com --- drivers/edac/edac_mc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 88e24ece3e1e..6b3c4a65c9c8 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -323,7 +323,8 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, int i, j, row, chn, n, len; bool per_rank = false; - BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0); + if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0)) + return NULL; /* * Calculate the total amount of dimms and csrows/cschannels while From 47bec6b4c399fafbf984431c6a15baac06729829 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:11 +0000 Subject: [PATCH 24/35] EDAC/mc: Remove needless zero string termination The e string to which this is pointing to has already been cleared earlier in the function so remove the needless zero string termination. [ bp: Correct the commit message. ] Suggested-by: Joe Perches Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-6-rrichter@marvell.com --- drivers/edac/edac_mc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 6b3c4a65c9c8..ab2b48978888 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -1184,7 +1184,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, } strcpy(p, dimm->label); p += strlen(p); - *p = '\0'; /* * get csrow/channel of the DIMM, in order to allow From 0d8292e003effface2a613e5369deb0a913516d8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:14 +0000 Subject: [PATCH 25/35] EDAC/mc: Reduce indentation level in edac_mc_handle_error() Reduce the indentation level in edac_mc_handle_error() a bit. No functional changes. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-7-rrichter@marvell.com --- drivers/edac/edac_mc.c | 57 +++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index ab2b48978888..614841035bfe 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -1172,36 +1172,37 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, * channel/memory controller/... may be affected. * Also, don't show errors for empty DIMM slots. */ - if (e->enable_per_layer_report && dimm->nr_pages) { - if (n_labels >= EDAC_MAX_LABELS) { - e->enable_per_layer_report = false; - break; - } - n_labels++; - if (p != e->label) { - strcpy(p, OTHER_LABEL); - p += strlen(OTHER_LABEL); - } - strcpy(p, dimm->label); - p += strlen(p); + if (!e->enable_per_layer_report || !dimm->nr_pages) + continue; - /* - * get csrow/channel of the DIMM, in order to allow - * incrementing the compat API counters - */ - edac_dbg(4, "%s csrows map: (%d,%d)\n", - mci->csbased ? "rank" : "dimm", - dimm->csrow, dimm->cschannel); - if (row == -1) - row = dimm->csrow; - else if (row >= 0 && row != dimm->csrow) - row = -2; - - if (chan == -1) - chan = dimm->cschannel; - else if (chan >= 0 && chan != dimm->cschannel) - chan = -2; + if (n_labels >= EDAC_MAX_LABELS) { + e->enable_per_layer_report = false; + break; } + n_labels++; + if (p != e->label) { + strcpy(p, OTHER_LABEL); + p += strlen(OTHER_LABEL); + } + strcpy(p, dimm->label); + p += strlen(p); + + /* + * get csrow/channel of the DIMM, in order to allow + * incrementing the compat API counters + */ + edac_dbg(4, "%s csrows map: (%d,%d)\n", + mci->csbased ? "rank" : "dimm", + dimm->csrow, dimm->cschannel); + if (row == -1) + row = dimm->csrow; + else if (row >= 0 && row != dimm->csrow) + row = -2; + + if (chan == -1) + chan = dimm->cschannel; + else if (chan >= 0 && chan != dimm->cschannel) + chan = -2; } if (!e->enable_per_layer_report) { From 98edb865bd3ee2a67e51e0d947208f3a2129a460 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:18 +0000 Subject: [PATCH 26/35] EDAC: Remove misleading comment in struct edac_raw_error_desc There never has been such function edac_raw_error_desc_clean() and in function ghes_edac_report_mem_error() the whole struct is zero'ed including the string arrays. Remove that comment. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-9-rrichter@marvell.com --- include/linux/edac.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/edac.h b/include/linux/edac.h index 31f99d48b024..cc31b9742684 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -457,15 +457,10 @@ struct errcount_attribute_data { * (typically, a memory controller error) */ struct edac_raw_error_desc { - /* - * NOTE: everything before grain won't be cleaned by - * edac_raw_error_desc_clean() - */ char location[LOCATION_SIZE]; char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS]; long grain; - /* the vars below and grain will be cleaned on every new error report */ u16 error_count; int top_layer; int mid_layer; From 7c1049317042a37638788bb8a892dbd75b742655 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:20 +0000 Subject: [PATCH 27/35] EDAC/ghes: Use standard kernel macros for page calculations Use standard macros for page calculations. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: James Morse Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-10-rrichter@marvell.com --- drivers/edac/ghes_edac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index fbc8b1de3ddd..5dc53d3512ae 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -319,8 +319,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) /* Error address */ if (mem_err->validation_bits & CPER_MEM_VALID_PA) { - e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT; - e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK; + e->page_frame_number = PHYS_PFN(mem_err->physical_addr); + e->offset_in_page = offset_in_page(mem_err->physical_addr); } /* Error grain */ From 7088e29e0423d3195e09079b4f849ec4837e5a75 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:23 +0000 Subject: [PATCH 28/35] EDAC/ghes: Fix grain calculation The current code to convert a physical address mask to a grain (defined as granularity in bytes) is: e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK); This is broken in several ways: 1) It calculates to wrong grain values. E.g., a physical address mask of ~0xfff should give a grain of 0x1000. Without considering PAGE_MASK, there is an off-by-one. Things are worse when also filtering it with ~PAGE_MASK. This will calculate to a grain with the upper bits set. In the example it even calculates to ~0. 2) The grain does not depend on and is unrelated to the kernel's page-size. The page-size only matters when unmapping memory in memory_failure(). Smaller grains are wrongly rounded up to the page-size, on architectures with a configurable page-size (e.g. arm64) this could round up to the even bigger page-size of the hypervisor. Fix this with: e->grain = ~mem_err->physical_addr_mask + 1; The grain_bits are defined as: grain = 1 << grain_bits; Change also the grain_bits calculation accordingly, it is the same formula as in edac_mc.c now and the code can be unified. The value in ->physical_addr_mask coming from firmware is assumed to be contiguous, but this is not sanity-checked. However, in case the mask is non-contiguous, a conversion to grain_bits effectively converts the grain bit mask to a power of 2 by rounding it up. Suggested-by: James Morse Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-11-rrichter@marvell.com --- drivers/edac/ghes_edac.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 5dc53d3512ae..e24fbbac5556 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -230,6 +230,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) /* Cleans the error report buffer */ memset(e, 0, sizeof (*e)); e->error_count = 1; + e->grain = 1; strcpy(e->label, "unknown label"); e->msg = pvt->msg; e->other_detail = pvt->other_detail; @@ -325,7 +326,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) /* Error grain */ if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK) - e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK); + e->grain = ~mem_err->physical_addr_mask + 1; /* Memory error location, mapped on e->location */ p = e->location; @@ -441,8 +442,13 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) if (p > pvt->other_detail) *(p - 1) = '\0'; + /* Sanity-check driver-supplied grain value. */ + if (WARN_ON_ONCE(!e->grain)) + e->grain = 1; + + grain_bits = fls_long(e->grain - 1); + /* Generate the trace event */ - grain_bits = fls_long(e->grain); snprintf(pvt->detail_location, sizeof(pvt->detail_location), "APEI location: %s %s", e->location, e->other_detail); trace_mc_event(type, e->msg, e->label, e->error_count, From 501eb40d2b85ae70a617a3b46dbb519fb9c76532 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:25 +0000 Subject: [PATCH 29/35] EDAC/ghes: Remove intermediate buffer pvt->detail_location detail_location[] is used to collect two location strings so they can be passed as one to trace_mc_event(). Instead of having an extra copy step, assemble the location string in other_detail[] from the beginning. Using other_detail[] to call trace_mc_event() is now the same as in edac_mc.c and code can be unified. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: James Morse Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-12-rrichter@marvell.com --- drivers/edac/ghes_edac.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index e24fbbac5556..1fd782c73a35 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -21,8 +21,7 @@ struct ghes_edac_pvt { struct mem_ctl_info *mci; /* Buffers for the error handling routine */ - char detail_location[240]; - char other_detail[160]; + char other_detail[400]; char msg[80]; }; @@ -369,6 +368,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) /* All other fields are mapped on e->other_detail */ p = pvt->other_detail; + p += snprintf(p, sizeof(pvt->other_detail), + "APEI location: %s ", e->location); if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) { u64 status = mem_err->error_status; @@ -449,12 +450,10 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) grain_bits = fls_long(e->grain - 1); /* Generate the trace event */ - snprintf(pvt->detail_location, sizeof(pvt->detail_location), - "APEI location: %s %s", e->location, e->other_detail); trace_mc_event(type, e->msg, e->label, e->error_count, mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer, (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page, - grain_bits, e->syndrome, pvt->detail_location); + grain_bits, e->syndrome, e->other_detail); edac_raw_mc_handle_error(type, mci, e); From 787d899914aae1c8cdb467931cfd7c369ef8d37b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:27 +0000 Subject: [PATCH 30/35] EDAC: Unify the mc_event tracepoint call The code in ghes_edac.c and edac_mc.c for grain_bits calculation and calling trace_mc_event() is now the same. Move it to a single location in edac_raw_mc_handle_error(). The only difference is the missing IS_ENABLED(CONFIG_RAS) switch, but this is needed for ghes too. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-13-rrichter@marvell.com --- drivers/edac/edac_mc.c | 30 +++++++++++++++--------------- drivers/edac/ghes_edac.c | 13 ------------- 2 files changed, 15 insertions(+), 28 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 614841035bfe..7243b88f81d8 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -1058,6 +1058,21 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type, { char detail[80]; int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer }; + u8 grain_bits; + + /* Sanity-check driver-supplied grain value. */ + if (WARN_ON_ONCE(!e->grain)) + e->grain = 1; + + grain_bits = fls_long(e->grain - 1); + + /* Report the error via the trace interface */ + if (IS_ENABLED(CONFIG_RAS)) + trace_mc_event(type, e->msg, e->label, e->error_count, + mci->mc_idx, e->top_layer, e->mid_layer, + e->low_layer, + (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page, + grain_bits, e->syndrome, e->other_detail); /* Memory type dependent details about the error */ if (type == HW_EVENT_ERR_CORRECTED) { @@ -1098,7 +1113,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, int row = -1, chan = -1; int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer }; int i, n_labels = 0; - u8 grain_bits; struct edac_raw_error_desc *e = &mci->error_desc; edac_dbg(3, "MC%d\n", mci->mc_idx); @@ -1236,20 +1250,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, if (p > e->location) *(p - 1) = '\0'; - /* Sanity-check driver-supplied grain value. */ - if (WARN_ON_ONCE(!e->grain)) - e->grain = 1; - - grain_bits = fls_long(e->grain - 1); - - /* Report the error via the trace interface */ - if (IS_ENABLED(CONFIG_RAS)) - trace_mc_event(type, e->msg, e->label, e->error_count, - mci->mc_idx, e->top_layer, e->mid_layer, - e->low_layer, - (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page, - grain_bits, e->syndrome, e->other_detail); - edac_raw_mc_handle_error(type, mci, e); } EXPORT_SYMBOL_GPL(edac_mc_handle_error); diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 1fd782c73a35..47f4e7f90ef0 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -207,7 +207,6 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) struct ghes_edac_pvt *pvt; unsigned long flags; char *p; - u8 grain_bits; /* * We can do the locking below because GHES defers error processing @@ -443,18 +442,6 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) if (p > pvt->other_detail) *(p - 1) = '\0'; - /* Sanity-check driver-supplied grain value. */ - if (WARN_ON_ONCE(!e->grain)) - e->grain = 1; - - grain_bits = fls_long(e->grain - 1); - - /* Generate the trace event */ - trace_mc_event(type, e->msg, e->label, e->error_count, - mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer, - (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page, - grain_bits, e->syndrome, e->other_detail); - edac_raw_mc_handle_error(type, mci, e); unlock: From 778f3a9673ac48bcf4c4ee640b2b29578cded8c1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:30 +0000 Subject: [PATCH 31/35] EDAC/Documentation: Describe CPER module definition and DIMM ranks Update on CPER DIMM naming convention and DIMM ranks. [ bp: Touchups. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-doc@vger.kernel.org" Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Jonathan Corbet Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-14-rrichter@marvell.com --- Documentation/admin-guide/ras.rst | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst index 2b20f5f7380d..0310db624964 100644 --- a/Documentation/admin-guide/ras.rst +++ b/Documentation/admin-guide/ras.rst @@ -330,9 +330,12 @@ There can be multiple csrows and multiple channels. .. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely used to refer to a memory module, although there are other memory - packaging alternatives, like SO-DIMM, SIMM, etc. Along this document, - and inside the EDAC system, the term "dimm" is used for all memory - modules, even when they use a different kind of packaging. + packaging alternatives, like SO-DIMM, SIMM, etc. The UEFI + specification (Version 2.7) defines a memory module in the Common + Platform Error Record (CPER) section to be an SMBIOS Memory Device + (Type 17). Along this document, and inside the EDAC subsystem, the term + "dimm" is used for all memory modules, even when they use a + different kind of packaging. Memory controllers allow for several csrows, with 8 csrows being a typical value. Yet, the actual number of csrows depends on the layout of @@ -349,12 +352,14 @@ controllers. The following example will assume 2 channels: | | ``ch0`` | ``ch1`` | +============+===========+===========+ | ``csrow0`` | DIMM_A0 | DIMM_B0 | - +------------+ | | - | ``csrow1`` | | | + | | rank0 | rank0 | + +------------+ - | - | + | ``csrow1`` | rank1 | rank1 | +------------+-----------+-----------+ | ``csrow2`` | DIMM_A1 | DIMM_B1 | - +------------+ | | - | ``csrow3`` | | | + | | rank0 | rank0 | + +------------+ - | - | + | ``csrow3`` | rank1 | rank1 | +------------+-----------+-----------+ In the above example, there are 4 physical slots on the motherboard @@ -374,11 +379,13 @@ which the memory DIMM is placed. Thus, when 1 DIMM is placed in each Channel, the csrows cross both DIMMs. Memory DIMMs come single or dual "ranked". A rank is a populated csrow. -Thus, 2 single ranked DIMMs, placed in slots DIMM_A0 and DIMM_B0 above -will have just one csrow (csrow0). csrow1 will be empty. On the other -hand, when 2 dual ranked DIMMs are similarly placed, then both csrow0 -and csrow1 will be populated. The pattern repeats itself for csrow2 and -csrow3. +In the example above 2 dual ranked DIMMs are similarly placed. Thus, +both csrow0 and csrow1 are populated. On the other hand, when 2 single +ranked DIMMs are placed in slots DIMM_A0 and DIMM_B0, then they will +have just one csrow (csrow0) and csrow1 will be empty. The pattern +repeats itself for csrow2 and csrow3. Also note that some memory +controllers don't have any logic to identify the memory module, see +``rankX`` directories below. The representation of the above is reflected in the directory tree in EDAC's sysfs interface. Starting in directory From 16214bd9e43a31683a7073664b000029bba00354 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 21 Nov 2019 21:36:57 +0000 Subject: [PATCH 32/35] EDAC/ghes: Do not warn when incrementing refcount on 0 The following warning from the refcount framework is seen during ghes initialization: EDAC MC0: Giving out device to module ghes_edac.c controller ghes_edac: DEV ghes (INTERRUPT) ------------[ cut here ]------------ refcount_t: increment on 0; use-after-free. WARNING: CPU: 36 PID: 1 at lib/refcount.c:156 refcount_inc_checked [...] Call trace: refcount_inc_checked ghes_edac_register ghes_probe ... It warns if the refcount is incremented from zero. This warning is reasonable as a kernel object is typically created with a refcount of one and freed once the refcount is zero. Afterwards the object would be "used-after-free". For GHES, the refcount is initialized with zero, and that is why this message is seen when initializing the first instance. However, whenever the refcount is zero, the device will be allocated and registered. Since the ghes_reg_mutex protects the refcount and serializes allocation and freeing of ghes devices, a use-after-free cannot happen here. Instead of using refcount_inc() for the first instance, use refcount_set(). This can be used here because the refcount is zero at this point and can not change due to its protection by the mutex. Fixes: 23f61b9fc5cc ("EDAC/ghes: Fix locking and memory barrier issues") Reported-by: John Garry Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Tested-by: John Garry Cc: Cc: James Morse Cc: Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Cc: Tony Luck Cc: Link: https://lkml.kernel.org/r/20191121213628.21244-1-rrichter@marvell.com --- drivers/edac/ghes_edac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 47f4e7f90ef0..b99080d8a10c 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -556,8 +556,8 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) ghes_pvt = pvt; spin_unlock_irqrestore(&ghes_lock, flags); - /* only increment on success */ - refcount_inc(&ghes_refcount); + /* only set on success */ + refcount_set(&ghes_refcount, 1); unlock: mutex_unlock(&ghes_reg_mutex); From 56d9e7bd3fa0f105b6670021d167744bc50ae4fe Mon Sep 17 00:00:00 2001 From: Meng Li Date: Thu, 21 Nov 2019 12:30:46 -0600 Subject: [PATCH 33/35] EDAC/altera: Use fast register IO for S10 IRQs When an IRQ occurs, regmap_{read,write,...}() is invoked in atomic context. Regmap must indicate register IO is fast so that a spinlock is used instead of a mutex to avoid sleeping in atomic context: lock_acquire __mutex_lock mutex_lock_nested regmap_lock_mutex regmap_write a10_eccmgr_irq_unmask unmask_irq.part.0 irq_enable __irq_startup irq_startup __setup_irq request_threaded_irq devm_request_threaded_irq altr_sdram_probe Mark it so. [ bp: Massage. ] Fixes: 3dab6bd52687 ("EDAC, altera: Add support for Stratix10 SDRAM EDAC") Reported-by: Meng Li Signed-off-by: Meng Li Signed-off-by: Thor Thayer Signed-off-by: Borislav Petkov Cc: James Morse Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Robert Richter Cc: stable Cc: Tony Luck Link: https://lkml.kernel.org/r/1574361048-17572-2-git-send-email-thor.thayer@linux.intel.com --- drivers/edac/altera_edac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index fbda4b876afd..0be3d1b17f03 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -560,6 +560,7 @@ static const struct regmap_config s10_sdram_regmap_cfg = { .reg_write = s10_protected_reg_write, .use_single_read = true, .use_single_write = true, + .fast_io = true, }; /************** ***********/ From 08a260d968d27b7c1abfc9ea32d64a2dc7af0449 Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Thu, 7 Nov 2019 14:01:29 -0600 Subject: [PATCH 34/35] EDAC/altera: Cleanup the ECC Manager Cleanup the ECC Manager peripheral test in probe function as suggested by James. Remove the check for Stratix10. Suggested-by: James Morse Signed-off-by: Thor Thayer Signed-off-by: Borislav Petkov Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Robert Richter Cc: Tony Luck Link: https://lkml.kernel.org/r/1573156890-26891-2-git-send-email-thor.thayer@linux.intel.com --- drivers/edac/altera_edac.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 0be3d1b17f03..3e86cf327ad0 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -1014,11 +1014,6 @@ static int socfpga_is_a10(void) return of_machine_is_compatible("altr,socfpga-arria10"); } -static int socfpga_is_s10(void) -{ - return of_machine_is_compatible("altr,socfpga-stratix10"); -} - static __init int __maybe_unused altr_init_a10_ecc_block(struct device_node *np, u32 irq_mask, u32 ecc_ctrl_en_mask, bool dual_port) @@ -1126,9 +1121,6 @@ static int __init __maybe_unused altr_init_a10_ecc_device_type(char *compat) int irq; struct device_node *child, *np; - if (!socfpga_is_a10() && !socfpga_is_s10()) - return -ENODEV; - np = of_find_compatible_node(NULL, NULL, "altr,socfpga-a10-ecc-manager"); if (!np) { @@ -2271,18 +2263,7 @@ static int altr_edac_a10_probe(struct platform_device *pdev) if (!of_device_is_available(child)) continue; - if (of_device_is_compatible(child, "altr,socfpga-a10-l2-ecc") || - of_device_is_compatible(child, "altr,socfpga-a10-ocram-ecc") || - of_device_is_compatible(child, "altr,socfpga-eth-mac-ecc") || - of_device_is_compatible(child, "altr,socfpga-nand-ecc") || - of_device_is_compatible(child, "altr,socfpga-dma-ecc") || - of_device_is_compatible(child, "altr,socfpga-usb-ecc") || - of_device_is_compatible(child, "altr,socfpga-qspi-ecc") || -#ifdef CONFIG_EDAC_ALTERA_SDRAM - of_device_is_compatible(child, "altr,sdram-edac-s10") || -#endif - of_device_is_compatible(child, "altr,socfpga-sdmmc-ecc")) - + if (of_match_node(altr_edac_a10_device_of_match, child)) altr_edac_a10_device_add(edac, child); #ifdef CONFIG_EDAC_ALTERA_SDRAM From 5781823fd0d39082bfe2bbc20408aaa85a6e06ad Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Thu, 21 Nov 2019 12:30:48 -0600 Subject: [PATCH 35/35] EDAC/altera: Use the Altera System Manager driver Simplify by using the Altera System Manager driver that abstracts the differences between ARM32 and ARM64. Also allows the removal of the Arria10 test function since this is handled by the System Manager driver. Signed-off-by: Thor Thayer Signed-off-by: Borislav Petkov Cc: James Morse Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Meng.Li@windriver.com Cc: Robert Richter Cc: Tony Luck Link: https://lkml.kernel.org/r/1574361048-17572-4-git-send-email-thor.thayer@linux.intel.com --- drivers/edac/altera_edac.c | 132 +++---------------------------------- 1 file changed, 8 insertions(+), 124 deletions(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 3e86cf327ad0..e91cf1147a4e 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -275,7 +276,6 @@ static int a10_unmask_irq(struct platform_device *pdev, u32 mask) return ret; } -static int socfpga_is_a10(void); static int altr_sdram_probe(struct platform_device *pdev) { const struct of_device_id *id; @@ -399,7 +399,7 @@ static int altr_sdram_probe(struct platform_device *pdev) goto err; /* Only the Arria10 has separate IRQs */ - if (socfpga_is_a10()) { + if (of_machine_is_compatible("altr,socfpga-arria10")) { /* Arria10 specific initialization */ res = a10_init(mc_vbase); if (res < 0) @@ -502,69 +502,6 @@ module_platform_driver(altr_sdram_edac_driver); #endif /* CONFIG_EDAC_ALTERA_SDRAM */ -/**************** Stratix 10 EDAC Memory Controller Functions ************/ - -/** - * s10_protected_reg_write - * Write to a protected SMC register. - * @context: Not used. - * @reg: Address of register - * @value: Value to write - * Return: INTEL_SIP_SMC_STATUS_OK (0) on success - * INTEL_SIP_SMC_REG_ERROR on error - * INTEL_SIP_SMC_RETURN_UNKNOWN_FUNCTION if not supported - */ -static int s10_protected_reg_write(void *context, unsigned int reg, - unsigned int val) -{ - struct arm_smccc_res result; - unsigned long offset = (unsigned long)context; - - arm_smccc_smc(INTEL_SIP_SMC_REG_WRITE, offset + reg, val, 0, 0, - 0, 0, 0, &result); - - return (int)result.a0; -} - -/** - * s10_protected_reg_read - * Read the status of a protected SMC register - * @context: Not used. - * @reg: Address of register - * @value: Value read. - * Return: INTEL_SIP_SMC_STATUS_OK (0) on success - * INTEL_SIP_SMC_REG_ERROR on error - * INTEL_SIP_SMC_RETURN_UNKNOWN_FUNCTION if not supported - */ -static int s10_protected_reg_read(void *context, unsigned int reg, - unsigned int *val) -{ - struct arm_smccc_res result; - unsigned long offset = (unsigned long)context; - - arm_smccc_smc(INTEL_SIP_SMC_REG_READ, offset + reg, 0, 0, 0, - 0, 0, 0, &result); - - *val = (unsigned int)result.a1; - - return (int)result.a0; -} - -static const struct regmap_config s10_sdram_regmap_cfg = { - .name = "s10_ddr", - .reg_bits = 32, - .reg_stride = 4, - .val_bits = 32, - .max_register = 0xffd12228, - .reg_read = s10_protected_reg_read, - .reg_write = s10_protected_reg_write, - .use_single_read = true, - .use_single_write = true, - .fast_io = true, -}; - -/************** ***********/ - /************************* EDAC Parent Probe *************************/ static const struct of_device_id altr_edac_device_of_match[]; @@ -1009,11 +946,6 @@ static int __maybe_unused altr_init_memory_port(void __iomem *ioaddr, int port) return ret; } -static int socfpga_is_a10(void) -{ - return of_machine_is_compatible("altr,socfpga-arria10"); -} - static __init int __maybe_unused altr_init_a10_ecc_block(struct device_node *np, u32 irq_mask, u32 ecc_ctrl_en_mask, bool dual_port) @@ -1029,34 +961,10 @@ altr_init_a10_ecc_block(struct device_node *np, u32 irq_mask, /* Get the ECC Manager - parent of the device EDACs */ np_eccmgr = of_get_parent(np); - if (socfpga_is_a10()) { - ecc_mgr_map = syscon_regmap_lookup_by_phandle(np_eccmgr, - "altr,sysmgr-syscon"); - } else { - struct device_node *sysmgr_np; - struct resource res; - uintptr_t base; + ecc_mgr_map = + altr_sysmgr_regmap_lookup_by_phandle(np_eccmgr, + "altr,sysmgr-syscon"); - sysmgr_np = of_parse_phandle(np_eccmgr, - "altr,sysmgr-syscon", 0); - if (!sysmgr_np) { - edac_printk(KERN_ERR, EDAC_DEVICE, - "Unable to find altr,sysmgr-syscon\n"); - return -ENODEV; - } - - if (of_address_to_resource(sysmgr_np, 0, &res)) { - of_node_put(sysmgr_np); - return -ENOMEM; - } - - /* Need physical address for SMCC call */ - base = res.start; - - ecc_mgr_map = regmap_init(NULL, NULL, (void *)base, - &s10_sdram_regmap_cfg); - of_node_put(sysmgr_np); - } of_node_put(np_eccmgr); if (IS_ERR(ecc_mgr_map)) { edac_printk(KERN_ERR, EDAC_DEVICE, @@ -2171,33 +2079,9 @@ static int altr_edac_a10_probe(struct platform_device *pdev) platform_set_drvdata(pdev, edac); INIT_LIST_HEAD(&edac->a10_ecc_devices); - if (socfpga_is_a10()) { - edac->ecc_mgr_map = - syscon_regmap_lookup_by_phandle(pdev->dev.of_node, - "altr,sysmgr-syscon"); - } else { - struct device_node *sysmgr_np; - struct resource res; - uintptr_t base; - - sysmgr_np = of_parse_phandle(pdev->dev.of_node, - "altr,sysmgr-syscon", 0); - if (!sysmgr_np) { - edac_printk(KERN_ERR, EDAC_DEVICE, - "Unable to find altr,sysmgr-syscon\n"); - return -ENODEV; - } - - if (of_address_to_resource(sysmgr_np, 0, &res)) - return -ENOMEM; - - /* Need physical address for SMCC call */ - base = res.start; - - edac->ecc_mgr_map = devm_regmap_init(&pdev->dev, NULL, - (void *)base, - &s10_sdram_regmap_cfg); - } + edac->ecc_mgr_map = + altr_sysmgr_regmap_lookup_by_phandle(pdev->dev.of_node, + "altr,sysmgr-syscon"); if (IS_ERR(edac->ecc_mgr_map)) { edac_printk(KERN_ERR, EDAC_DEVICE,