linux-stable/include/acpi/ghes.h
James Morse 7f17b4a121 ACPI: APEI: Kick the memory_failure() queue for synchronous errors
memory_failure() offlines or repairs pages of memory that have been
discovered to be corrupt. These may be detected by an external
component, (e.g. the memory controller), and notified via an IRQ.
In this case the work is queued as not all of memory_failure()s work
can happen in IRQ context.

If the error was detected as a result of user-space accessing a
corrupt memory location the CPU may take an abort instead. On arm64
this is a 'synchronous external abort', and on a firmware first
system it is replayed using NOTIFY_SEA.

This notification has NMI like properties, (it can interrupt
IRQ-masked code), so the memory_failure() work is queued. If we
return to user-space before the queued memory_failure() work is
processed, we will take the fault again. This loop may cause platform
firmware to exceed some threshold and reboot when Linux could have
recovered from this error.

For NMIlike notifications keep track of whether memory_failure() work
was queued, and make task_work pending to flush out the queue.
To save memory allocations, the task_work is allocated as part of
the ghes_estatus_node, and free()ing it back to the pool is deferred.

Signed-off-by: James Morse <james.morse@arm.com>
Tested-by: Tyler Baicar <baicar@os.amperecomputing.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2020-05-19 19:51:11 +02:00

130 lines
3 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef GHES_H
#define GHES_H
#include <acpi/apei.h>
#include <acpi/hed.h>
/*
* One struct ghes is created for each generic hardware error source.
* It provides the context for APEI hardware error timer/IRQ/SCI/NMI
* handler.
*
* estatus: memory buffer for error status block, allocated during
* HEST parsing.
*/
#define GHES_EXITING 0x0002
struct ghes {
union {
struct acpi_hest_generic *generic;
struct acpi_hest_generic_v2 *generic_v2;
};
struct acpi_hest_generic_status *estatus;
unsigned long flags;
union {
struct list_head list;
struct timer_list timer;
unsigned int irq;
};
};
struct ghes_estatus_node {
struct llist_node llnode;
struct acpi_hest_generic *generic;
struct ghes *ghes;
int task_work_cpu;
struct callback_head task_work;
};
struct ghes_estatus_cache {
u32 estatus_len;
atomic_t count;
struct acpi_hest_generic *generic;
unsigned long long time_in;
struct rcu_head rcu;
};
enum {
GHES_SEV_NO = 0x0,
GHES_SEV_CORRECTED = 0x1,
GHES_SEV_RECOVERABLE = 0x2,
GHES_SEV_PANIC = 0x3,
};
int ghes_estatus_pool_init(int num_ghes);
/* From drivers/edac/ghes_edac.c */
#ifdef CONFIG_EDAC_GHES
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err);
int ghes_edac_register(struct ghes *ghes, struct device *dev);
void ghes_edac_unregister(struct ghes *ghes);
#else
static inline void ghes_edac_report_mem_error(int sev,
struct cper_sec_mem_err *mem_err)
{
}
static inline int ghes_edac_register(struct ghes *ghes, struct device *dev)
{
return -ENODEV;
}
static inline void ghes_edac_unregister(struct ghes *ghes)
{
}
#endif
static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata)
{
return gdata->revision >> 8;
}
static inline void *acpi_hest_get_payload(struct acpi_hest_generic_data *gdata)
{
if (acpi_hest_get_version(gdata) >= 3)
return (void *)(((struct acpi_hest_generic_data_v300 *)(gdata)) + 1);
return gdata + 1;
}
static inline int acpi_hest_get_error_length(struct acpi_hest_generic_data *gdata)
{
return ((struct acpi_hest_generic_data *)(gdata))->error_data_length;
}
static inline int acpi_hest_get_size(struct acpi_hest_generic_data *gdata)
{
if (acpi_hest_get_version(gdata) >= 3)
return sizeof(struct acpi_hest_generic_data_v300);
return sizeof(struct acpi_hest_generic_data);
}
static inline int acpi_hest_get_record_size(struct acpi_hest_generic_data *gdata)
{
return (acpi_hest_get_size(gdata) + acpi_hest_get_error_length(gdata));
}
static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata)
{
return (void *)(gdata) + acpi_hest_get_record_size(gdata);
}
#define apei_estatus_for_each_section(estatus, section) \
for (section = (struct acpi_hest_generic_data *)(estatus + 1); \
(void *)section - (void *)(estatus + 1) < estatus->data_length; \
section = acpi_hest_get_next(section))
#ifdef CONFIG_ACPI_APEI_SEA
int ghes_notify_sea(void);
#else
static inline int ghes_notify_sea(void) { return -ENOENT; }
#endif
#endif /* GHES_H */