- Convert the hw error storm handling into a finer-grained, per-bank
solution which allows for more timely detection and reporting of errors - Start a documentation section which will hold down relevant RAS features description and how they should be used - Add new AMD error bank types - Slim down and remove error type descriptions from the kernel side of error decoding to rasdaemon which can be used from now on to decode hw errors on AMD - Mark pages containing uncorrectable errors as poison so that kdump can avoid them and thus not cause another panic - The usual cleanups and fixlets -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAmWamqkACgkQEsHwGGHe VUrkMw/+Jv//z5pKFMXF2GlI/Xefh8pXDB57D22B1T6zNJm+Qq1b58U44WpxoU24 b9gPtkgxjzA3JwoQG+cBDkCSs1xLV3McUS0UoEqQ28QvFN+mzxYKu4ura3F+rcZG PiCT4gPEYgZirYl0PKYeypnBPq3Krx/RYeTUE0vQ9HtmeBCmH71X1egyB4TqFbFU 7ui9aITLAnLBwgO6On1qviGPvJoEDGAsQ656XoY0Js+8dUeYwAI4qpaaUwtXUo1D ARGzss55/qTRo2G+IkDDhbJ8e8G+eX22oa0n1tdhNFBqfYAN6gM+t4NiFlNnn+18 nlbaSZfDygciE8DVjEnkVIrRJtkq7uj0dk7LvnqEI2y7J0LybHojC3hDKrqYLK3o PRgfPwmykOCwZQldGFYbShvmY8KoEQc/V9OWi/+A/M/uTJsForQmHn78Z2YkO9kG K6VaLuYszSCqz47wf56pHBwtMrivEPmkcxaz9ErkK90vM/NmV7kfLl+R8IK8apNJ cJkuLBjfgGpBP+AlpXGl9OE0lRJK5MCbEBlbGBBl58REBaB4DNkM4QHItrUSRR82 ADLcfLAIRWT8UwwXieDbWF1jb+4L+IJnXCKGCCQ7+eYxcFI9V9TABD2B8io+Dzvz evZwLCPKmjuPc2CMcDu/eUdBKLNTn3QAoN/NLcVmzJ23lguQW/M= =o3UM -----END PGP SIGNATURE----- Merge tag 'ras_core_for_v6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull x86 RAS updates from Borislav Petkov: - Convert the hw error storm handling into a finer-grained, per-bank solution which allows for more timely detection and reporting of errors - Start a documentation section which will hold down relevant RAS features description and how they should be used - Add new AMD error bank types - Slim down and remove error type descriptions from the kernel side of error decoding to rasdaemon which can be used from now on to decode hw errors on AMD - Mark pages containing uncorrectable errors as poison so that kdump 
can avoid them and thus not cause another panic - The usual cleanups and fixlets * tag 'ras_core_for_v6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mce: Handle Intel threshold interrupt storms x86/mce: Add per-bank CMCI storm mitigation x86/mce: Remove old CMCI storm mitigation code Documentation: Begin a RAS section x86/MCE/AMD: Add new MA_LLC, USR_DP, and USR_CP bank types EDAC/mce_amd: Remove SMCA Extended Error code descriptions x86/mce/amd, EDAC/mce_amd: Move long names to decoder module x86/mce/inject: Clear test status value x86/mce: Remove redundant check from mce_device_create() x86/mce: Mark fatal MCE's page as poison to avoid panic in the kdump kernel
This commit is contained in:
commit
3edbe8afb6
26
Documentation/RAS/ras.rst
Normal file
26
Documentation/RAS/ras.rst
Normal file
@ -0,0 +1,26 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Reliability, Availability and Serviceability features
|
||||
=====================================================
|
||||
|
||||
This documents different aspects of the RAS functionality present in the
|
||||
kernel.
|
||||
|
||||
Error decoding
|
||||
---------------
|
||||
|
||||
* x86
|
||||
|
||||
Error decoding on AMD systems should be done using the rasdaemon tool:
|
||||
https://github.com/mchehab/rasdaemon/
|
||||
|
||||
While the daemon is running, it will automatically log and decode
|
||||
errors. If not, one can still decode such errors by supplying the
|
||||
hardware information from the error::
|
||||
|
||||
$ rasdaemon -p --status <STATUS> --ipid <IPID> --smca
|
||||
|
||||
Also, the user can pass a particular family and model to decode the error
|
||||
string::
|
||||
|
||||
$ rasdaemon -p --status <STATUS> --ipid <IPID> --smca --family <CPU Family> --model <CPU Model> --bank <BANK_NUM>
|
@ -113,6 +113,7 @@ to ReStructured Text format, or are simply too old.
|
||||
:maxdepth: 1
|
||||
|
||||
staging/index
|
||||
RAS/ras
|
||||
|
||||
|
||||
Translations
|
||||
|
@ -311,6 +311,7 @@ enum smca_bank_types {
|
||||
SMCA_PIE, /* Power, Interrupts, etc. */
|
||||
SMCA_UMC, /* Unified Memory Controller */
|
||||
SMCA_UMC_V2,
|
||||
SMCA_MA_LLC, /* Memory Attached Last Level Cache */
|
||||
SMCA_PB, /* Parameter Block */
|
||||
SMCA_PSP, /* Platform Security Processor */
|
||||
SMCA_PSP_V2,
|
||||
@ -326,6 +327,8 @@ enum smca_bank_types {
|
||||
SMCA_SHUB, /* System HUB Unit */
|
||||
SMCA_SATA, /* SATA Unit */
|
||||
SMCA_USB, /* USB Unit */
|
||||
SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */
|
||||
SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */
|
||||
SMCA_GMI_PCS, /* GMI PCS Unit */
|
||||
SMCA_XGMI_PHY, /* xGMI PHY Unit */
|
||||
SMCA_WAFL_PHY, /* WAFL PHY Unit */
|
||||
@ -333,7 +336,6 @@ enum smca_bank_types {
|
||||
N_SMCA_BANK_TYPES
|
||||
};
|
||||
|
||||
extern const char *smca_get_long_name(enum smca_bank_types t);
|
||||
extern bool amd_mce_is_memory_error(struct mce *m);
|
||||
|
||||
extern int mce_threshold_create_device(unsigned int cpu);
|
||||
|
@ -87,42 +87,40 @@ struct smca_bank {
|
||||
static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
|
||||
static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
|
||||
|
||||
struct smca_bank_name {
|
||||
const char *name; /* Short name for sysfs */
|
||||
const char *long_name; /* Long name for pretty-printing */
|
||||
};
|
||||
|
||||
static struct smca_bank_name smca_names[] = {
|
||||
[SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" },
|
||||
[SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
|
||||
[SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
|
||||
[SMCA_DE] = { "decode_unit", "Decode Unit" },
|
||||
[SMCA_RESERVED] = { "reserved", "Reserved" },
|
||||
[SMCA_EX] = { "execution_unit", "Execution Unit" },
|
||||
[SMCA_FP] = { "floating_point", "Floating Point Unit" },
|
||||
[SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
|
||||
[SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
|
||||
[SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
|
||||
static const char * const smca_names[] = {
|
||||
[SMCA_LS ... SMCA_LS_V2] = "load_store",
|
||||
[SMCA_IF] = "insn_fetch",
|
||||
[SMCA_L2_CACHE] = "l2_cache",
|
||||
[SMCA_DE] = "decode_unit",
|
||||
[SMCA_RESERVED] = "reserved",
|
||||
[SMCA_EX] = "execution_unit",
|
||||
[SMCA_FP] = "floating_point",
|
||||
[SMCA_L3_CACHE] = "l3_cache",
|
||||
[SMCA_CS ... SMCA_CS_V2] = "coherent_slave",
|
||||
[SMCA_PIE] = "pie",
|
||||
|
||||
/* UMC v2 is separate because both of them can exist in a single system. */
|
||||
[SMCA_UMC] = { "umc", "Unified Memory Controller" },
|
||||
[SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" },
|
||||
[SMCA_PB] = { "param_block", "Parameter Block" },
|
||||
[SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
|
||||
[SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
|
||||
[SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
|
||||
[SMCA_MPDMA] = { "mpdma", "MPDMA Unit" },
|
||||
[SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
|
||||
[SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" },
|
||||
[SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" },
|
||||
[SMCA_NBIF] = { "nbif", "NBIF Unit" },
|
||||
[SMCA_SHUB] = { "shub", "System Hub Unit" },
|
||||
[SMCA_SATA] = { "sata", "SATA Unit" },
|
||||
[SMCA_USB] = { "usb", "USB Unit" },
|
||||
[SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" },
|
||||
[SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
|
||||
[SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
|
||||
[SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" },
|
||||
[SMCA_UMC] = "umc",
|
||||
[SMCA_UMC_V2] = "umc_v2",
|
||||
[SMCA_MA_LLC] = "ma_llc",
|
||||
[SMCA_PB] = "param_block",
|
||||
[SMCA_PSP ... SMCA_PSP_V2] = "psp",
|
||||
[SMCA_SMU ... SMCA_SMU_V2] = "smu",
|
||||
[SMCA_MP5] = "mp5",
|
||||
[SMCA_MPDMA] = "mpdma",
|
||||
[SMCA_NBIO] = "nbio",
|
||||
[SMCA_PCIE ... SMCA_PCIE_V2] = "pcie",
|
||||
[SMCA_XGMI_PCS] = "xgmi_pcs",
|
||||
[SMCA_NBIF] = "nbif",
|
||||
[SMCA_SHUB] = "shub",
|
||||
[SMCA_SATA] = "sata",
|
||||
[SMCA_USB] = "usb",
|
||||
[SMCA_USR_DP] = "usr_dp",
|
||||
[SMCA_USR_CP] = "usr_cp",
|
||||
[SMCA_GMI_PCS] = "gmi_pcs",
|
||||
[SMCA_XGMI_PHY] = "xgmi_phy",
|
||||
[SMCA_WAFL_PHY] = "wafl_phy",
|
||||
[SMCA_GMI_PHY] = "gmi_phy",
|
||||
};
|
||||
|
||||
static const char *smca_get_name(enum smca_bank_types t)
|
||||
@ -130,18 +128,9 @@ static const char *smca_get_name(enum smca_bank_types t)
|
||||
if (t >= N_SMCA_BANK_TYPES)
|
||||
return NULL;
|
||||
|
||||
return smca_names[t].name;
|
||||
return smca_names[t];
|
||||
}
|
||||
|
||||
const char *smca_get_long_name(enum smca_bank_types t)
|
||||
{
|
||||
if (t >= N_SMCA_BANK_TYPES)
|
||||
return NULL;
|
||||
|
||||
return smca_names[t].long_name;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(smca_get_long_name);
|
||||
|
||||
enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
|
||||
{
|
||||
struct smca_bank *b;
|
||||
@ -178,6 +167,7 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
{ SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
|
||||
{ SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
|
||||
{ SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
|
||||
{ SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
|
||||
|
||||
/* Unified Memory Controller MCA type */
|
||||
{ SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
|
||||
@ -212,6 +202,8 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
{ SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
|
||||
{ SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
|
||||
{ SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
|
||||
{ SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
|
||||
{ SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
|
||||
{ SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
|
||||
{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
|
||||
{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
|
||||
|
@ -44,6 +44,7 @@
|
||||
#include <linux/sync_core.h>
|
||||
#include <linux/task_work.h>
|
||||
#include <linux/hardirq.h>
|
||||
#include <linux/kexec.h>
|
||||
|
||||
#include <asm/intel-family.h>
|
||||
#include <asm/processor.h>
|
||||
@ -233,6 +234,7 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
|
||||
struct llist_node *pending;
|
||||
struct mce_evt_llist *l;
|
||||
int apei_err = 0;
|
||||
struct page *p;
|
||||
|
||||
/*
|
||||
* Allow instrumentation around external facilities usage. Not that it
|
||||
@ -286,6 +288,20 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
|
||||
if (!fake_panic) {
|
||||
if (panic_timeout == 0)
|
||||
panic_timeout = mca_cfg.panic_timeout;
|
||||
|
||||
/*
|
||||
* Kdump skips the poisoned page in order to avoid
|
||||
* touching the error bits again. Poison the page even
|
||||
* if the error is fatal and the machine is about to
|
||||
* panic.
|
||||
*/
|
||||
if (kexec_crash_loaded()) {
|
||||
if (final && (final->status & MCI_STATUS_ADDRV)) {
|
||||
p = pfn_to_online_page(final->addr >> PAGE_SHIFT);
|
||||
if (p)
|
||||
SetPageHWPoison(p);
|
||||
}
|
||||
}
|
||||
panic(msg);
|
||||
} else
|
||||
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
|
||||
@ -670,6 +686,16 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
|
||||
barrier();
|
||||
m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
|
||||
|
||||
/*
|
||||
* Update storm tracking here, before checking for the
|
||||
* MCI_STATUS_VAL bit. Valid corrected errors count
|
||||
* towards declaring, or maintaining, storm status. No
|
||||
* error in a bank counts towards avoiding, or ending,
|
||||
* storm status.
|
||||
*/
|
||||
if (!mca_cfg.cmci_disabled)
|
||||
mce_track_storm(&m);
|
||||
|
||||
/* If this entry is not valid, ignore it */
|
||||
if (!(m.status & MCI_STATUS_VAL))
|
||||
continue;
|
||||
@ -1601,13 +1627,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
|
||||
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
|
||||
static DEFINE_PER_CPU(struct timer_list, mce_timer);
|
||||
|
||||
static unsigned long mce_adjust_timer_default(unsigned long interval)
|
||||
{
|
||||
return interval;
|
||||
}
|
||||
|
||||
static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
|
||||
|
||||
static void __start_timer(struct timer_list *t, unsigned long interval)
|
||||
{
|
||||
unsigned long when = jiffies + interval;
|
||||
@ -1637,15 +1656,9 @@ static void mce_timer_fn(struct timer_list *t)
|
||||
|
||||
iv = __this_cpu_read(mce_next_interval);
|
||||
|
||||
if (mce_available(this_cpu_ptr(&cpu_info))) {
|
||||
if (mce_available(this_cpu_ptr(&cpu_info)))
|
||||
mc_poll_banks();
|
||||
|
||||
if (mce_intel_cmci_poll()) {
|
||||
iv = mce_adjust_timer(iv);
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Alert userspace if needed. If we logged an MCE, reduce the polling
|
||||
* interval, otherwise increase the polling interval.
|
||||
@ -1655,23 +1668,29 @@ static void mce_timer_fn(struct timer_list *t)
|
||||
else
|
||||
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
|
||||
|
||||
done:
|
||||
__this_cpu_write(mce_next_interval, iv);
|
||||
__start_timer(t, iv);
|
||||
if (mce_get_storm_mode()) {
|
||||
__start_timer(t, HZ);
|
||||
} else {
|
||||
__this_cpu_write(mce_next_interval, iv);
|
||||
__start_timer(t, iv);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensure that the timer is firing in @interval from now.
|
||||
* When a storm starts on any bank on this CPU, switch to polling
|
||||
* once per second. When the storm ends, revert to the default
|
||||
* polling interval.
|
||||
*/
|
||||
void mce_timer_kick(unsigned long interval)
|
||||
void mce_timer_kick(bool storm)
|
||||
{
|
||||
struct timer_list *t = this_cpu_ptr(&mce_timer);
|
||||
unsigned long iv = __this_cpu_read(mce_next_interval);
|
||||
|
||||
__start_timer(t, interval);
|
||||
mce_set_storm_mode(storm);
|
||||
|
||||
if (interval < iv)
|
||||
__this_cpu_write(mce_next_interval, interval);
|
||||
if (storm)
|
||||
__start_timer(t, HZ);
|
||||
else
|
||||
__this_cpu_write(mce_next_interval, check_interval * HZ);
|
||||
}
|
||||
|
||||
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
|
||||
@ -1995,7 +2014,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
|
||||
|
||||
intel_init_cmci();
|
||||
intel_init_lmce();
|
||||
mce_adjust_timer = cmci_intel_adjust_timer;
|
||||
}
|
||||
|
||||
static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
|
||||
@ -2008,7 +2026,6 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
|
||||
switch (c->x86_vendor) {
|
||||
case X86_VENDOR_INTEL:
|
||||
mce_intel_feature_init(c);
|
||||
mce_adjust_timer = cmci_intel_adjust_timer;
|
||||
break;
|
||||
|
||||
case X86_VENDOR_AMD: {
|
||||
@ -2568,9 +2585,6 @@ static int mce_device_create(unsigned int cpu)
|
||||
int err;
|
||||
int i, j;
|
||||
|
||||
if (!mce_available(&boot_cpu_data))
|
||||
return -EIO;
|
||||
|
||||
dev = per_cpu(mce_device, cpu);
|
||||
if (dev)
|
||||
return 0;
|
||||
@ -2665,8 +2679,6 @@ static void mce_reenable_cpu(void)
|
||||
|
||||
static int mce_cpu_dead(unsigned int cpu)
|
||||
{
|
||||
mce_intel_hcpu_update(cpu);
|
||||
|
||||
/* intentionally ignoring frozen here */
|
||||
if (!cpuhp_tasks_frozen)
|
||||
cmci_rediscover();
|
||||
|
@ -746,6 +746,7 @@ static void check_hw_inj_possible(void)
|
||||
|
||||
wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status);
|
||||
rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status);
|
||||
wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0);
|
||||
|
||||
if (!status) {
|
||||
hw_injection_possible = false;
|
||||
|
@ -41,15 +41,6 @@
|
||||
*/
|
||||
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
|
||||
|
||||
/*
|
||||
* CMCI storm detection backoff counter
|
||||
*
|
||||
* During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
|
||||
* encountered an error. If not, we decrement it by one. We signal the end of
|
||||
* the CMCI storm when it reaches 0.
|
||||
*/
|
||||
static DEFINE_PER_CPU(int, cmci_backoff_cnt);
|
||||
|
||||
/*
|
||||
* cmci_discover_lock protects against parallel discovery attempts
|
||||
* which could race against each other.
|
||||
@ -63,22 +54,26 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
|
||||
*/
|
||||
static DEFINE_SPINLOCK(cmci_poll_lock);
|
||||
|
||||
/* Linux non-storm CMCI threshold (may be overridden by BIOS) */
|
||||
#define CMCI_THRESHOLD 1
|
||||
#define CMCI_POLL_INTERVAL (30 * HZ)
|
||||
#define CMCI_STORM_INTERVAL (HZ)
|
||||
#define CMCI_STORM_THRESHOLD 15
|
||||
|
||||
static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
|
||||
static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
|
||||
static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
|
||||
/*
|
||||
* MCi_CTL2 threshold for each bank when there is no storm.
|
||||
* Default value for each bank may have been set by BIOS.
|
||||
*/
|
||||
static u16 cmci_threshold[MAX_NR_BANKS];
|
||||
|
||||
enum {
|
||||
CMCI_STORM_NONE,
|
||||
CMCI_STORM_ACTIVE,
|
||||
CMCI_STORM_SUBSIDED,
|
||||
};
|
||||
|
||||
static atomic_t cmci_storm_on_cpus;
|
||||
/*
|
||||
* High threshold to limit CMCI rate during storms. Max supported is
|
||||
* 0x7FFF. Use this slightly smaller value so it has a distinctive
|
||||
* signature when someone asks "Why am I not seeing all corrected errors?"
|
||||
* A high threshold is used instead of just disabling CMCI for a
|
||||
* bank because both corrected and uncorrected errors may be logged
|
||||
* in the same bank and signalled with CMCI. The threshold only applies
|
||||
* to corrected errors, so keeping CMCI enabled means that uncorrected
|
||||
* errors will still be processed in a timely fashion.
|
||||
*/
|
||||
#define CMCI_STORM_THRESHOLD 32749
|
||||
|
||||
static int cmci_supported(int *banks)
|
||||
{
|
||||
@ -134,122 +129,29 @@ static bool lmce_supported(void)
|
||||
return tmp & FEAT_CTL_LMCE_ENABLED;
|
||||
}
|
||||
|
||||
bool mce_intel_cmci_poll(void)
|
||||
/*
|
||||
* Set a new CMCI threshold value. Preserve the state of the
|
||||
* MCI_CTL2_CMCI_EN bit in case this happens during a
|
||||
* cmci_rediscover() operation.
|
||||
*/
|
||||
static void cmci_set_threshold(int bank, int thresh)
|
||||
{
|
||||
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Reset the counter if we've logged an error in the last poll
|
||||
* during the storm.
|
||||
*/
|
||||
if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
|
||||
this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
|
||||
else
|
||||
this_cpu_dec(cmci_backoff_cnt);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void mce_intel_hcpu_update(unsigned long cpu)
|
||||
{
|
||||
if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
|
||||
atomic_dec(&cmci_storm_on_cpus);
|
||||
|
||||
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
|
||||
}
|
||||
|
||||
static void cmci_toggle_interrupt_mode(bool on)
|
||||
{
|
||||
unsigned long flags, *owned;
|
||||
int bank;
|
||||
unsigned long flags;
|
||||
u64 val;
|
||||
|
||||
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
|
||||
owned = this_cpu_ptr(mce_banks_owned);
|
||||
for_each_set_bit(bank, owned, MAX_NR_BANKS) {
|
||||
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
|
||||
|
||||
if (on)
|
||||
val |= MCI_CTL2_CMCI_EN;
|
||||
else
|
||||
val &= ~MCI_CTL2_CMCI_EN;
|
||||
|
||||
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
|
||||
}
|
||||
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
|
||||
val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
|
||||
wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh);
|
||||
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
|
||||
}
|
||||
|
||||
unsigned long cmci_intel_adjust_timer(unsigned long interval)
|
||||
void mce_intel_handle_storm(int bank, bool on)
|
||||
{
|
||||
if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
|
||||
(__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
|
||||
mce_notify_irq();
|
||||
return CMCI_STORM_INTERVAL;
|
||||
}
|
||||
|
||||
switch (__this_cpu_read(cmci_storm_state)) {
|
||||
case CMCI_STORM_ACTIVE:
|
||||
|
||||
/*
|
||||
* We switch back to interrupt mode once the poll timer has
|
||||
* silenced itself. That means no events recorded and the timer
|
||||
* interval is back to our poll interval.
|
||||
*/
|
||||
__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
|
||||
if (!atomic_sub_return(1, &cmci_storm_on_cpus))
|
||||
pr_notice("CMCI storm subsided: switching to interrupt mode\n");
|
||||
|
||||
fallthrough;
|
||||
|
||||
case CMCI_STORM_SUBSIDED:
|
||||
/*
|
||||
* We wait for all CPUs to go back to SUBSIDED state. When that
|
||||
* happens we switch back to interrupt mode.
|
||||
*/
|
||||
if (!atomic_read(&cmci_storm_on_cpus)) {
|
||||
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
|
||||
cmci_toggle_interrupt_mode(true);
|
||||
cmci_recheck();
|
||||
}
|
||||
return CMCI_POLL_INTERVAL;
|
||||
default:
|
||||
|
||||
/* We have shiny weather. Let the poll do whatever it thinks. */
|
||||
return interval;
|
||||
}
|
||||
}
|
||||
|
||||
static bool cmci_storm_detect(void)
|
||||
{
|
||||
unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
|
||||
unsigned long ts = __this_cpu_read(cmci_time_stamp);
|
||||
unsigned long now = jiffies;
|
||||
int r;
|
||||
|
||||
if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
|
||||
return true;
|
||||
|
||||
if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
|
||||
cnt++;
|
||||
} else {
|
||||
cnt = 1;
|
||||
__this_cpu_write(cmci_time_stamp, now);
|
||||
}
|
||||
__this_cpu_write(cmci_storm_cnt, cnt);
|
||||
|
||||
if (cnt <= CMCI_STORM_THRESHOLD)
|
||||
return false;
|
||||
|
||||
cmci_toggle_interrupt_mode(false);
|
||||
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
|
||||
r = atomic_add_return(1, &cmci_storm_on_cpus);
|
||||
mce_timer_kick(CMCI_STORM_INTERVAL);
|
||||
this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
|
||||
|
||||
if (r == 1)
|
||||
pr_notice("CMCI storm detected: switching to poll mode\n");
|
||||
return true;
|
||||
if (on)
|
||||
cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
|
||||
else
|
||||
cmci_set_threshold(bank, cmci_threshold[bank]);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -260,78 +162,133 @@ static bool cmci_storm_detect(void)
|
||||
*/
|
||||
static void intel_threshold_interrupt(void)
|
||||
{
|
||||
if (cmci_storm_detect())
|
||||
return;
|
||||
|
||||
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
|
||||
}
|
||||
|
||||
/*
|
||||
* Check all the reasons why current CPU cannot claim
|
||||
* ownership of a bank.
|
||||
* 1: CPU already owns this bank
|
||||
* 2: BIOS owns this bank
|
||||
* 3: Some other CPU owns this bank
|
||||
*/
|
||||
static bool cmci_skip_bank(int bank, u64 *val)
|
||||
{
|
||||
unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
|
||||
|
||||
if (test_bit(bank, owned))
|
||||
return true;
|
||||
|
||||
/* Skip banks in firmware first mode */
|
||||
if (test_bit(bank, mce_banks_ce_disabled))
|
||||
return true;
|
||||
|
||||
rdmsrl(MSR_IA32_MCx_CTL2(bank), *val);
|
||||
|
||||
/* Already owned by someone else? */
|
||||
if (*val & MCI_CTL2_CMCI_EN) {
|
||||
clear_bit(bank, owned);
|
||||
__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Decide which CMCI interrupt threshold to use:
|
||||
* 1: If this bank is in storm mode from whichever CPU was
|
||||
* the previous owner, stay in storm mode.
|
||||
* 2: If ignoring any threshold set by BIOS, set Linux default
|
||||
* 3: Try to honor BIOS threshold (unless buggy BIOS set it at zero).
|
||||
*/
|
||||
static u64 cmci_pick_threshold(u64 val, int *bios_zero_thresh)
|
||||
{
|
||||
if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
|
||||
return val;
|
||||
|
||||
if (!mca_cfg.bios_cmci_threshold) {
|
||||
val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
|
||||
val |= CMCI_THRESHOLD;
|
||||
} else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
|
||||
/*
|
||||
* If bios_cmci_threshold boot option was specified
|
||||
* but the threshold is zero, we'll try to initialize
|
||||
* it to 1.
|
||||
*/
|
||||
*bios_zero_thresh = 1;
|
||||
val |= CMCI_THRESHOLD;
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to claim ownership of a bank.
|
||||
*/
|
||||
static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_wrong_thresh)
|
||||
{
|
||||
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
|
||||
|
||||
val |= MCI_CTL2_CMCI_EN;
|
||||
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
|
||||
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
|
||||
|
||||
/* If the enable bit did not stick, this bank should be polled. */
|
||||
if (!(val & MCI_CTL2_CMCI_EN)) {
|
||||
WARN_ON(!test_bit(bank, this_cpu_ptr(mce_poll_banks)));
|
||||
storm->banks[bank].poll_only = true;
|
||||
return;
|
||||
}
|
||||
|
||||
/* This CPU successfully set the enable bit. */
|
||||
set_bit(bank, (void *)this_cpu_ptr(&mce_banks_owned));
|
||||
|
||||
if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) {
|
||||
pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), bank);
|
||||
mce_inherit_storm(bank);
|
||||
cmci_storm_begin(bank);
|
||||
} else {
|
||||
__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
|
||||
}
|
||||
|
||||
/*
|
||||
* We are able to set thresholds for some banks that
|
||||
* had a threshold of 0. This means the BIOS has not
|
||||
* set the thresholds properly or does not work with
|
||||
* this boot option. Note down now and report later.
|
||||
*/
|
||||
if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
|
||||
(val & MCI_CTL2_CMCI_THRESHOLD_MASK))
|
||||
*bios_wrong_thresh = 1;
|
||||
|
||||
/* Save default threshold for each bank */
|
||||
if (cmci_threshold[bank] == 0)
|
||||
cmci_threshold[bank] = val & MCI_CTL2_CMCI_THRESHOLD_MASK;
|
||||
}
|
||||
|
||||
/*
|
||||
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
|
||||
* on this CPU. Use the algorithm recommended in the SDM to discover shared
|
||||
* banks.
|
||||
* banks. Called during initial bootstrap, and also for hotplug CPU operations
|
||||
* to rediscover/reassign machine check banks.
|
||||
*/
|
||||
static void cmci_discover(int banks)
|
||||
{
|
||||
unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
|
||||
int bios_wrong_thresh = 0;
|
||||
unsigned long flags;
|
||||
int i;
|
||||
int bios_wrong_thresh = 0;
|
||||
|
||||
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
|
||||
for (i = 0; i < banks; i++) {
|
||||
u64 val;
|
||||
int bios_zero_thresh = 0;
|
||||
|
||||
if (test_bit(i, owned))
|
||||
if (cmci_skip_bank(i, &val))
|
||||
continue;
|
||||
|
||||
/* Skip banks in firmware first mode */
|
||||
if (test_bit(i, mce_banks_ce_disabled))
|
||||
continue;
|
||||
|
||||
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
|
||||
|
||||
/* Already owned by someone else? */
|
||||
if (val & MCI_CTL2_CMCI_EN) {
|
||||
clear_bit(i, owned);
|
||||
__clear_bit(i, this_cpu_ptr(mce_poll_banks));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!mca_cfg.bios_cmci_threshold) {
|
||||
val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
|
||||
val |= CMCI_THRESHOLD;
|
||||
} else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
|
||||
/*
|
||||
* If bios_cmci_threshold boot option was specified
|
||||
* but the threshold is zero, we'll try to initialize
|
||||
* it to 1.
|
||||
*/
|
||||
bios_zero_thresh = 1;
|
||||
val |= CMCI_THRESHOLD;
|
||||
}
|
||||
|
||||
val |= MCI_CTL2_CMCI_EN;
|
||||
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
|
||||
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
|
||||
|
||||
/* Did the enable bit stick? -- the bank supports CMCI */
|
||||
if (val & MCI_CTL2_CMCI_EN) {
|
||||
set_bit(i, owned);
|
||||
__clear_bit(i, this_cpu_ptr(mce_poll_banks));
|
||||
/*
|
||||
* We are able to set thresholds for some banks that
|
||||
* had a threshold of 0. This means the BIOS has not
|
||||
* set the thresholds properly or does not work with
|
||||
* this boot option. Note down now and report later.
|
||||
*/
|
||||
if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
|
||||
(val & MCI_CTL2_CMCI_THRESHOLD_MASK))
|
||||
bios_wrong_thresh = 1;
|
||||
} else {
|
||||
WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
|
||||
}
|
||||
val = cmci_pick_threshold(val, &bios_zero_thresh);
|
||||
cmci_claim_bank(i, val, bios_zero_thresh, &bios_wrong_thresh);
|
||||
}
|
||||
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
|
||||
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
|
||||
@ -370,6 +327,9 @@ static void __cmci_disable_bank(int bank)
|
||||
val &= ~MCI_CTL2_CMCI_EN;
|
||||
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
|
||||
__clear_bit(bank, this_cpu_ptr(mce_banks_owned));
|
||||
|
||||
if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
|
||||
cmci_storm_end(bank);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -41,9 +41,7 @@ struct dentry *mce_get_debugfs_dir(void);
|
||||
extern mce_banks_t mce_banks_ce_disabled;
|
||||
|
||||
#ifdef CONFIG_X86_MCE_INTEL
|
||||
unsigned long cmci_intel_adjust_timer(unsigned long interval);
|
||||
bool mce_intel_cmci_poll(void);
|
||||
void mce_intel_hcpu_update(unsigned long cpu);
|
||||
void mce_intel_handle_storm(int bank, bool on);
|
||||
void cmci_disable_bank(int bank);
|
||||
void intel_init_cmci(void);
|
||||
void intel_init_lmce(void);
|
||||
@ -51,9 +49,7 @@ void intel_clear_lmce(void);
|
||||
bool intel_filter_mce(struct mce *m);
|
||||
bool intel_mce_usable_address(struct mce *m);
|
||||
#else
|
||||
# define cmci_intel_adjust_timer mce_adjust_timer_default
|
||||
static inline bool mce_intel_cmci_poll(void) { return false; }
|
||||
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
|
||||
static inline void mce_intel_handle_storm(int bank, bool on) { }
|
||||
static inline void cmci_disable_bank(int bank) { }
|
||||
static inline void intel_init_cmci(void) { }
|
||||
static inline void intel_init_lmce(void) { }
|
||||
@ -62,7 +58,63 @@ static inline bool intel_filter_mce(struct mce *m) { return false; }
|
||||
static inline bool intel_mce_usable_address(struct mce *m) { return false; }
|
||||
#endif
|
||||
|
||||
void mce_timer_kick(unsigned long interval);
|
||||
void mce_timer_kick(bool storm);
|
||||
|
||||
#ifdef CONFIG_X86_MCE_THRESHOLD
|
||||
void cmci_storm_begin(unsigned int bank);
|
||||
void cmci_storm_end(unsigned int bank);
|
||||
void mce_track_storm(struct mce *mce);
|
||||
void mce_inherit_storm(unsigned int bank);
|
||||
bool mce_get_storm_mode(void);
|
||||
void mce_set_storm_mode(bool storm);
|
||||
#else
|
||||
static inline void cmci_storm_begin(unsigned int bank) {}
|
||||
static inline void cmci_storm_end(unsigned int bank) {}
|
||||
static inline void mce_track_storm(struct mce *mce) {}
|
||||
static inline void mce_inherit_storm(unsigned int bank) {}
|
||||
static inline bool mce_get_storm_mode(void) { return false; }
|
||||
static inline void mce_set_storm_mode(bool storm) {}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* history: Bitmask tracking error occurrences. Each set bit
|
||||
* represents an error seen.
|
||||
*
|
||||
* timestamp: Last time (in jiffies) that the bank was polled.
|
||||
* in_storm_mode: Is this bank in storm mode?
|
||||
* poll_only: Bank does not support CMCI, skip storm tracking.
|
||||
*/
|
||||
struct storm_bank {
|
||||
u64 history;
|
||||
u64 timestamp;
|
||||
bool in_storm_mode;
|
||||
bool poll_only;
|
||||
};
|
||||
|
||||
/* Width, in bits, of the u64 history bitmask. */
#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE)

/* Number of errors recorded in the history that marks the start of a storm. */
#define STORM_BEGIN_THRESHOLD 5

/*
 * Number of consecutive error-free polls of a machine check bank needed
 * before the storm is declared over. The check is done against the history
 * field of struct storm_bank, using a 30-bit mask covering bits [0 ... 29].
 */
#define STORM_END_POLL_THRESHOLD 29
|
||||
|
||||
/*
|
||||
* banks: per-cpu, per-bank details
|
||||
* stormy_bank_count: count of MC banks in storm state
|
||||
* poll_mode: CPU is in poll mode
|
||||
*/
|
||||
struct mca_storm_desc {
|
||||
struct storm_bank banks[MAX_NR_BANKS];
|
||||
u8 stormy_bank_count;
|
||||
bool poll_mode;
|
||||
};
|
||||
|
||||
DECLARE_PER_CPU(struct mca_storm_desc, storm_desc);
|
||||
|
||||
#ifdef CONFIG_ACPI_APEI
|
||||
int apei_write_mce(struct mce *m);
|
||||
|
@ -29,3 +29,118 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
|
||||
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
|
||||
apic_eoi();
|
||||
}
|
||||
|
||||
DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);
|
||||
|
||||
void mce_inherit_storm(unsigned int bank)
|
||||
{
|
||||
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
|
||||
|
||||
/*
|
||||
* Previous CPU owning this bank had put it into storm mode,
|
||||
* but the precise history of that storm is unknown. Assume
|
||||
* the worst (all recent polls of the bank found a valid error
|
||||
* logged). This will avoid the new owner prematurely declaring
|
||||
* the storm has ended.
|
||||
*/
|
||||
storm->banks[bank].history = ~0ull;
|
||||
storm->banks[bank].timestamp = jiffies;
|
||||
}
|
||||
|
||||
bool mce_get_storm_mode(void)
|
||||
{
|
||||
return __this_cpu_read(storm_desc.poll_mode);
|
||||
}
|
||||
|
||||
/* Store @storm into the poll_mode flag of the current CPU's storm descriptor. */
void mce_set_storm_mode(bool storm)
{
	__this_cpu_write(storm_desc.poll_mode, storm);
}
|
||||
|
||||
/*
 * Forward a storm state change for @bank to the vendor-specific handler.
 * Only Intel has a handler; every other vendor is a no-op.
 */
static void mce_handle_storm(unsigned int bank, bool on)
{
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
		mce_intel_handle_storm(bank, on);
}
|
||||
|
||||
void cmci_storm_begin(unsigned int bank)
|
||||
{
|
||||
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
|
||||
|
||||
__set_bit(bank, this_cpu_ptr(mce_poll_banks));
|
||||
storm->banks[bank].in_storm_mode = true;
|
||||
|
||||
/*
|
||||
* If this is the first bank on this CPU to enter storm mode
|
||||
* start polling.
|
||||
*/
|
||||
if (++storm->stormy_bank_count == 1)
|
||||
mce_timer_kick(true);
|
||||
}
|
||||
|
||||
void cmci_storm_end(unsigned int bank)
|
||||
{
|
||||
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
|
||||
|
||||
__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
|
||||
storm->banks[bank].history = 0;
|
||||
storm->banks[bank].in_storm_mode = false;
|
||||
|
||||
/* If no banks left in storm mode, stop polling. */
|
||||
if (!this_cpu_dec_return(storm_desc.stormy_bank_count))
|
||||
mce_timer_kick(false);
|
||||
}
|
||||
|
||||
void mce_track_storm(struct mce *mce)
|
||||
{
|
||||
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
|
||||
unsigned long now = jiffies, delta;
|
||||
unsigned int shift = 1;
|
||||
u64 history = 0;
|
||||
|
||||
/* No tracking needed for banks that do not support CMCI */
|
||||
if (storm->banks[mce->bank].poll_only)
|
||||
return;
|
||||
|
||||
/*
|
||||
* When a bank is in storm mode it is polled once per second and
|
||||
* the history mask will record about the last minute of poll results.
|
||||
* If it is not in storm mode, then the bank is only checked when
|
||||
* there is a CMCI interrupt. Check how long it has been since
|
||||
* this bank was last checked, and adjust the amount of "shift"
|
||||
* to apply to history.
|
||||
*/
|
||||
if (!storm->banks[mce->bank].in_storm_mode) {
|
||||
delta = now - storm->banks[mce->bank].timestamp;
|
||||
shift = (delta + HZ) / HZ;
|
||||
}
|
||||
|
||||
/* If it has been a long time since the last poll, clear history. */
|
||||
if (shift < NUM_HISTORY_BITS)
|
||||
history = storm->banks[mce->bank].history << shift;
|
||||
|
||||
storm->banks[mce->bank].timestamp = now;
|
||||
|
||||
/* History keeps track of corrected errors. VAL=1 && UC=0 */
|
||||
if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
|
||||
history |= 1;
|
||||
|
||||
storm->banks[mce->bank].history = history;
|
||||
|
||||
if (storm->banks[mce->bank].in_storm_mode) {
|
||||
if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
|
||||
return;
|
||||
printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
|
||||
mce_handle_storm(mce->bank, false);
|
||||
cmci_storm_end(mce->bank);
|
||||
} else {
|
||||
if (hweight64(history) < STORM_BEGIN_THRESHOLD)
|
||||
return;
|
||||
printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
|
||||
mce_handle_storm(mce->bank, true);
|
||||
cmci_storm_begin(mce->bank);
|
||||
}
|
||||
}
|
||||
|
@ -143,482 +143,6 @@ static const char * const mc6_mce_desc[] = {
|
||||
"Status Register File",
|
||||
};
|
||||
|
||||
/* Scalable MCA error strings */
|
||||
/* Load Store Unit (SMCA_LS) error descriptions, indexed by extended error code. */
static const char * const smca_ls_mce_desc[] = {
	"Load queue parity error",
	"Store queue parity error",
	"Miss address buffer payload parity error",
	"Level 1 TLB parity error",
	"DC Tag error type 5",
	"DC Tag error type 6",
	"DC Tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"System Read Data Error Thread 0",
	"System Read Data Error Thread 1",
	"DC Tag error type 2",
	"DC Data error type 1 and poison consumption",
	"DC Data error type 2",
	"DC Data error type 3",
	"DC Tag error type 4",
	"Level 2 TLB parity error",
	"PDC parity error",
	"DC Tag error type 3",
	"DC Tag error type 5",
	"L2 Fill Data error",
};
|
||||
|
||||
/* Load Store Unit v2 (SMCA_LS_V2) error descriptions, indexed by extended error code. */
static const char * const smca_ls2_mce_desc[] = {
	"An ECC error was detected on a data cache read by a probe or victimization",
	"An ECC error or L2 poison was detected on a data cache read by a load",
	"An ECC error was detected on a data cache read-modify-write by a store",
	"An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
	"An ECC error or poison bit mismatch was detected on a tag read by a load",
	"An ECC error or poison bit mismatch was detected on a tag read by a store",
	"An ECC error was detected on an EMEM read by a load",
	"An ECC error was detected on an EMEM read-modify-write by a store",
	"A parity error was detected in an L1 TLB entry by any access",
	"A parity error was detected in an L2 TLB entry by any access",
	"A parity error was detected in a PWC entry by any access",
	"A parity error was detected in an STQ entry by any access",
	"A parity error was detected in an LDQ entry by any access",
	"A parity error was detected in a MAB entry by any access",
	"A parity error was detected in an SCB entry state field by any access",
	"A parity error was detected in an SCB entry address field by any access",
	"A parity error was detected in an SCB entry data field by any access",
	"A parity error was detected in a WCB entry by any access",
	"A poisoned line was detected in an SCB entry by any access",
	"A SystemReadDataError error was reported on read data returned from L2 for a load",
	"A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
	"A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
	"A hardware assertion error was reported",
	"A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
};
|
||||
|
||||
/* Instruction Fetch Unit (SMCA_IF) error descriptions, indexed by extended error code. */
static const char * const smca_if_mce_desc[] = {
	"Op Cache Microtag Probe Port Parity Error",
	"IC Microtag or Full Tag Multi-hit Error",
	"IC Full Tag Parity Error",
	"IC Data Array Parity Error",
	"Decoupling Queue PhysAddr Parity Error",
	"L0 ITLB Parity Error",
	"L1 ITLB Parity Error",
	"L2 ITLB Parity Error",
	"BPQ Thread 0 Snoop Parity Error",
	"BPQ Thread 1 Snoop Parity Error",
	"L1 BTB Multi-Match Error",
	"L2 BTB Multi-Match Error",
	"L2 Cache Response Poison Error",
	"System Read Data Error",
	"Hardware Assertion Error",
	"L1-TLB Multi-Hit",
	"L2-TLB Multi-Hit",
	"BSR Parity Error",
	"CT MCE",
};
|
||||
|
||||
/* L2 Cache (SMCA_L2_CACHE) error descriptions, indexed by extended error code. */
static const char * const smca_l2_mce_desc[] = {
	"L2M Tag Multiple-Way-Hit error",
	"L2M Tag or State Array ECC Error",
	"L2M Data Array ECC Error",
	"Hardware Assert Error",
};
|
||||
|
||||
/* Decode Unit (SMCA_DE) error descriptions, indexed by extended error code. */
static const char * const smca_de_mce_desc[] = {
	"Micro-op cache tag parity error",
	"Micro-op cache data parity error",
	"Instruction buffer parity error",
	"Micro-op queue parity error",
	"Instruction dispatch queue parity error",
	"Fetch address FIFO parity error",
	"Patch RAM data parity error",
	"Patch RAM sequencer parity error",
	"Micro-op buffer parity error",
	"Hardware Assertion MCA Error",
};
|
||||
|
||||
/* Execution Unit (SMCA_EX) error descriptions, indexed by extended error code. */
static const char * const smca_ex_mce_desc[] = {
	"Watchdog Timeout error",
	"Physical register file parity error",
	"Flag register file parity error",
	"Immediate displacement register file parity error",
	"Address generator payload parity error",
	"EX payload parity error",
	"Checkpoint queue parity error",
	"Retire dispatch queue parity error",
	"Retire status queue parity error",
	"Scheduling queue parity error",
	"Branch buffer queue parity error",
	"Hardware Assertion error",
	"Spec Map parity error",
	"Retire Map parity error",
};
|
||||
|
||||
/* Floating Point Unit (SMCA_FP) error descriptions, indexed by extended error code. */
static const char * const smca_fp_mce_desc[] = {
	"Physical register file (PRF) parity error",
	"Freelist (FL) parity error",
	"Schedule queue parity error",
	"NSQ parity error",
	"Retire queue (RQ) parity error",
	"Status register file (SRF) parity error",
	"Hardware assertion",
};
|
||||
|
||||
/* L3 Cache (SMCA_L3_CACHE) error descriptions, indexed by extended error code. */
static const char * const smca_l3_mce_desc[] = {
	"Shadow Tag Macro ECC Error",
	"Shadow Tag Macro Multi-way-hit Error",
	"L3M Tag ECC Error",
	"L3M Tag Multi-way-hit Error",
	"L3M Data ECC Error",
	"SDP Parity Error or SystemReadDataError from XI",
	"L3 Victim Queue Parity Error",
	"L3 Hardware Assertion",
};
|
||||
|
||||
/* Coherent Slave (SMCA_CS) error descriptions, indexed by extended error code. */
static const char * const smca_cs_mce_desc[] = {
	"Illegal Request",
	"Address Violation",
	"Security Violation",
	"Illegal Response",
	"Unexpected Response",
	"Request or Probe Parity Error",
	"Read Response Parity Error",
	"Atomic Request Parity Error",
	"Probe Filter ECC Error",
};
|
||||
|
||||
/* Coherent Slave v2 (SMCA_CS_V2) error descriptions, indexed by extended error code. */
static const char * const smca_cs2_mce_desc[] = {
	"Illegal Request",
	"Address Violation",
	"Security Violation",
	"Illegal Response",
	"Unexpected Response",
	"Request or Probe Parity Error",
	"Read Response Parity Error",
	"Atomic Request Parity Error",
	"SDP read response had no match in the CS queue",
	"Probe Filter Protocol Error",
	"Probe Filter ECC Error",
	"SDP read response had an unexpected RETRY error",
	"Counter overflow error",
	"Counter underflow error",
};
|
||||
|
||||
/*
 * "Power, Interrupts, etc." (SMCA_PIE) error descriptions, indexed by
 * extended error code.
 *
 * The last entry carries a trailing comma like every other description
 * table in this file, so appending a new code does not touch this line.
 */
static const char * const smca_pie_mce_desc[] = {
	"Hardware Assert",
	"Register security violation",
	"Link Error",
	"Poison data consumption",
	"A deferred error was detected in the DF",
};
|
||||
|
||||
/* Unified Memory Controller (SMCA_UMC) error descriptions, indexed by extended error code. */
static const char * const smca_umc_mce_desc[] = {
	"DRAM ECC error",
	"Data poison error",
	"SDP parity error",
	"Advanced peripheral bus error",
	"Address/Command parity error",
	"Write data CRC error",
	"DCQ SRAM ECC error",
	"AES SRAM ECC error",
};
|
||||
|
||||
/* Unified Memory Controller v2 (SMCA_UMC_V2) error descriptions, indexed by extended error code. */
static const char * const smca_umc2_mce_desc[] = {
	"DRAM ECC error",
	"Data poison error",
	"SDP parity error",
	"Reserved",
	"Address/Command parity error",
	"Write data parity error",
	"DCQ SRAM ECC error",
	"Reserved",
	"Read data parity error",
	"Rdb SRAM ECC error",
	"RdRsp SRAM ECC error",
	"LM32 MP errors",
};
|
||||
|
||||
/* Parameter Block (SMCA_PB) error descriptions, indexed by extended error code. */
static const char * const smca_pb_mce_desc[] = {
	"An ECC error in the Parameter Block RAM array",
};
|
||||
|
||||
/* Platform Security Processor (SMCA_PSP) error descriptions, indexed by extended error code. */
static const char * const smca_psp_mce_desc[] = {
	"An ECC or parity error in a PSP RAM instance",
};
|
||||
|
||||
/* Platform Security Processor v2 (SMCA_PSP_V2) error descriptions, indexed by extended error code. */
static const char * const smca_psp2_mce_desc[] = {
	"High SRAM ECC or parity error",
	"Low SRAM ECC or parity error",
	"Instruction Cache Bank 0 ECC or parity error",
	"Instruction Cache Bank 1 ECC or parity error",
	"Instruction Tag Ram 0 parity error",
	"Instruction Tag Ram 1 parity error",
	"Data Cache Bank 0 ECC or parity error",
	"Data Cache Bank 1 ECC or parity error",
	"Data Cache Bank 2 ECC or parity error",
	"Data Cache Bank 3 ECC or parity error",
	"Data Tag Bank 0 parity error",
	"Data Tag Bank 1 parity error",
	"Data Tag Bank 2 parity error",
	"Data Tag Bank 3 parity error",
	"Dirty Data Ram parity error",
	"TLB Bank 0 parity error",
	"TLB Bank 1 parity error",
	"System Hub Read Buffer ECC or parity error",
};
|
||||
|
||||
/* System Management Unit (SMCA_SMU) error descriptions, indexed by extended error code. */
static const char * const smca_smu_mce_desc[] = {
	"An ECC or parity error in an SMU RAM instance",
};
|
||||
|
||||
/* System Management Unit v2 (SMCA_SMU_V2) error descriptions, indexed by extended error code. */
static const char * const smca_smu2_mce_desc[] = {
	"High SRAM ECC or parity error",
	"Low SRAM ECC or parity error",
	"Data Cache Bank A ECC or parity error",
	"Data Cache Bank B ECC or parity error",
	"Data Tag Cache Bank A ECC or parity error",
	"Data Tag Cache Bank B ECC or parity error",
	"Instruction Cache Bank A ECC or parity error",
	"Instruction Cache Bank B ECC or parity error",
	"Instruction Tag Cache Bank A ECC or parity error",
	"Instruction Tag Cache Bank B ECC or parity error",
	"System Hub Read Buffer ECC or parity error",
	"PHY RAM ECC error",
};
|
||||
|
||||
/* Microprocessor 5 Unit (SMCA_MP5) error descriptions, indexed by extended error code. */
static const char * const smca_mp5_mce_desc[] = {
	"High SRAM ECC or parity error",
	"Low SRAM ECC or parity error",
	"Data Cache Bank A ECC or parity error",
	"Data Cache Bank B ECC or parity error",
	"Data Tag Cache Bank A ECC or parity error",
	"Data Tag Cache Bank B ECC or parity error",
	"Instruction Cache Bank A ECC or parity error",
	"Instruction Cache Bank B ECC or parity error",
	"Instruction Tag Cache Bank A ECC or parity error",
	"Instruction Tag Cache Bank B ECC or parity error",
};
|
||||
|
||||
/*
 * MPDMA Unit (SMCA_MPDMA) error descriptions, indexed by extended error
 * code. The eight cache-bank strings appear three times in a row; each
 * repetition is a distinct run of codes and must stay in place.
 */
static const char * const smca_mpdma_mce_desc[] = {
	"Main SRAM [31:0] bank ECC or parity error",
	"Main SRAM [63:32] bank ECC or parity error",
	"Main SRAM [95:64] bank ECC or parity error",
	"Main SRAM [127:96] bank ECC or parity error",

	"Data Cache Bank A ECC or parity error",
	"Data Cache Bank B ECC or parity error",
	"Data Tag Cache Bank A ECC or parity error",
	"Data Tag Cache Bank B ECC or parity error",
	"Instruction Cache Bank A ECC or parity error",
	"Instruction Cache Bank B ECC or parity error",
	"Instruction Tag Cache Bank A ECC or parity error",
	"Instruction Tag Cache Bank B ECC or parity error",

	"Data Cache Bank A ECC or parity error",
	"Data Cache Bank B ECC or parity error",
	"Data Tag Cache Bank A ECC or parity error",
	"Data Tag Cache Bank B ECC or parity error",
	"Instruction Cache Bank A ECC or parity error",
	"Instruction Cache Bank B ECC or parity error",
	"Instruction Tag Cache Bank A ECC or parity error",
	"Instruction Tag Cache Bank B ECC or parity error",

	"Data Cache Bank A ECC or parity error",
	"Data Cache Bank B ECC or parity error",
	"Data Tag Cache Bank A ECC or parity error",
	"Data Tag Cache Bank B ECC or parity error",
	"Instruction Cache Bank A ECC or parity error",
	"Instruction Cache Bank B ECC or parity error",
	"Instruction Tag Cache Bank A ECC or parity error",
	"Instruction Tag Cache Bank B ECC or parity error",

	"System Hub Read Buffer ECC or parity error",
	"MPDMA TVF DVSEC Memory ECC or parity error",
	"MPDMA TVF MMIO Mailbox0 ECC or parity error",
	"MPDMA TVF MMIO Mailbox1 ECC or parity error",
	"MPDMA TVF Doorbell Memory ECC or parity error",
	"MPDMA TVF SDP Slave Memory 0 ECC or parity error",
	"MPDMA TVF SDP Slave Memory 1 ECC or parity error",
	"MPDMA TVF SDP Slave Memory 2 ECC or parity error",
	"MPDMA TVF SDP Master Memory 0 ECC or parity error",
	"MPDMA TVF SDP Master Memory 1 ECC or parity error",
	"MPDMA TVF SDP Master Memory 2 ECC or parity error",
	"MPDMA TVF SDP Master Memory 3 ECC or parity error",
	"MPDMA TVF SDP Master Memory 4 ECC or parity error",
	"MPDMA TVF SDP Master Memory 5 ECC or parity error",
	"MPDMA TVF SDP Master Memory 6 ECC or parity error",
	"MPDMA PTE Command FIFO ECC or parity error",
	"MPDMA PTE Hub Data FIFO ECC or parity error",
	"MPDMA PTE Internal Data FIFO ECC or parity error",
	"MPDMA PTE Command Memory DMA ECC or parity error",
	"MPDMA PTE Command Memory Internal ECC or parity error",
	"MPDMA PTE DMA Completion FIFO ECC or parity error",
	"MPDMA PTE Tablewalk Completion FIFO ECC or parity error",
	"MPDMA PTE Descriptor Completion FIFO ECC or parity error",
	"MPDMA PTE ReadOnly Completion FIFO ECC or parity error",
	"MPDMA PTE DirectWrite Completion FIFO ECC or parity error",
	"SDP Watchdog Timer expired",
};
|
||||
|
||||
/* Northbridge IO Unit (SMCA_NBIO) error descriptions, indexed by extended error code. */
static const char * const smca_nbio_mce_desc[] = {
	"ECC or Parity error",
	"PCIE error",
	"SDP ErrEvent error",
	"SDP Egress Poison Error",
	"IOHC Internal Poison Error",
};
|
||||
|
||||
/* PCI Express Unit (SMCA_PCIE) error descriptions, indexed by extended error code. */
static const char * const smca_pcie_mce_desc[] = {
	"CCIX PER Message logging",
	"CCIX Read Response with Status: Non-Data Error",
	"CCIX Write Response with Status: Non-Data Error",
	"CCIX Read Response with Status: Data Error",
	"CCIX Non-okay write response with data error",
};
|
||||
|
||||
/* PCI Express Unit v2 (SMCA_PCIE_V2) error descriptions, indexed by extended error code. */
static const char * const smca_pcie2_mce_desc[] = {
	"SDP Parity Error logging",
};
|
||||
|
||||
/*
 * Ext Global Memory Interconnect PCS Unit (SMCA_XGMI_PCS) error
 * descriptions, indexed by extended error code.
 */
static const char * const smca_xgmipcs_mce_desc[] = {
	"Data Loss Error",
	"Training Error",
	"Flow Control Acknowledge Error",
	"Rx Fifo Underflow Error",
	"Rx Fifo Overflow Error",
	"CRC Error",
	"BER Exceeded Error",
	"Tx Vcid Data Error",
	"Replay Buffer Parity Error",
	"Data Parity Error",
	"Replay Fifo Overflow Error",
	"Replay Fifo Underflow Error",
	"Elastic Fifo Overflow Error",
	"Deskew Error",
	"Flow Control CRC Error",
	"Data Startup Limit Error",
	"FC Init Timeout Error",
	"Recovery Timeout Error",
	"Ready Serial Timeout Error",
	"Ready Serial Attempt Error",
	"Recovery Attempt Error",
	"Recovery Relock Attempt Error",
	"Replay Attempt Error",
	"Sync Header Error",
	"Tx Replay Timeout Error",
	"Rx Replay Timeout Error",
	"LinkSub Tx Timeout Error",
	"LinkSub Rx Timeout Error",
	"Rx CMD Packet Error",
};
|
||||
|
||||
/*
 * PHY error descriptions, indexed by extended error code. Shared by the
 * SMCA_XGMI_PHY, SMCA_WAFL_PHY and SMCA_GMI_PHY bank types.
 */
static const char * const smca_xgmiphy_mce_desc[] = {
	"RAM ECC Error",
	"ARC instruction buffer parity error",
	"ARC data buffer parity error",
	"PHY APB error",
};
|
||||
|
||||
/*
 * NBIF Unit error descriptions, indexed by extended error code. Shared by
 * the SMCA_NBIF and SMCA_SHUB bank types.
 */
static const char * const smca_nbif_mce_desc[] = {
	"Timeout error from GMI",
	"SRAM ECC error",
	"NTB Error Event",
	"SDP Parity error",
};
|
||||
|
||||
/* SATA Unit (SMCA_SATA) error descriptions, indexed by extended error code. */
static const char * const smca_sata_mce_desc[] = {
	"Parity error for port 0",
	"Parity error for port 1",
	"Parity error for port 2",
	"Parity error for port 3",
	"Parity error for port 4",
	"Parity error for port 5",
	"Parity error for port 6",
	"Parity error for port 7",
};
|
||||
|
||||
/* USB Unit (SMCA_USB) error descriptions, indexed by extended error code. */
static const char * const smca_usb_mce_desc[] = {
	"Parity error or ECC error for S0 RAM0",
	"Parity error or ECC error for S0 RAM1",
	"Parity error or ECC error for S0 RAM2",
	"Parity error for PHY RAM0",
	"Parity error for PHY RAM1",
	"AXI Slave Response error",
};
|
||||
|
||||
/*
 * Global Memory Interconnect PCS Unit (SMCA_GMI_PCS) error descriptions,
 * indexed by extended error code. "Data Loss Error" is listed for both the
 * first and the last code.
 */
static const char * const smca_gmipcs_mce_desc[] = {
	"Data Loss Error",
	"Training Error",
	"Replay Parity Error",
	"Rx Fifo Underflow Error",
	"Rx Fifo Overflow Error",
	"CRC Error",
	"BER Exceeded Error",
	"Tx Fifo Underflow Error",
	"Replay Buffer Parity Error",
	"Tx Overflow Error",
	"Replay Fifo Overflow Error",
	"Replay Fifo Underflow Error",
	"Elastic Fifo Overflow Error",
	"Deskew Error",
	"Offline Error",
	"Data Startup Limit Error",
	"FC Init Timeout Error",
	"Recovery Timeout Error",
	"Ready Serial Timeout Error",
	"Ready Serial Attempt Error",
	"Recovery Attempt Error",
	"Recovery Relock Attempt Error",
	"Deskew Abort Error",
	"Rx Buffer Error",
	"Rx LFDS Fifo Overflow Error",
	"Rx LFDS Fifo Underflow Error",
	"LinkSub Tx Timeout Error",
	"LinkSub Rx Timeout Error",
	"Rx CMD Packet Error",
	"LFDS Training Timeout Error",
	"LFDS FC Init Timeout Error",
	"Data Loss Error",
};
|
||||
|
||||
/*
 * One description table plus its length.
 *
 * descs:	Array of error description strings.
 * num_descs:	Number of entries in @descs.
 */
struct smca_mce_desc {
	const char * const *descs;
	unsigned int num_descs;
};
|
||||
|
||||
static struct smca_mce_desc smca_mce_descs[] = {
|
||||
[SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
|
||||
[SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) },
|
||||
[SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
|
||||
[SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
|
||||
[SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
|
||||
[SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) },
|
||||
[SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
|
||||
[SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
|
||||
[SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
|
||||
[SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
|
||||
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
|
||||
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
|
||||
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
|
||||
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
|
||||
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
|
||||
[SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) },
|
||||
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
|
||||
[SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) },
|
||||
[SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
|
||||
[SMCA_MPDMA] = { smca_mpdma_mce_desc, ARRAY_SIZE(smca_mpdma_mce_desc) },
|
||||
[SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) },
|
||||
[SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) },
|
||||
[SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) },
|
||||
[SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) },
|
||||
/* NBIF and SHUB have the same error descriptions, for now. */
|
||||
[SMCA_NBIF] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
|
||||
[SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
|
||||
[SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) },
|
||||
[SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) },
|
||||
[SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) },
|
||||
/* All the PHY bank types have the same error descriptions, for now. */
|
||||
[SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
|
||||
[SMCA_WAFL_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
|
||||
[SMCA_GMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
|
||||
};
|
||||
|
||||
static bool f12h_mc0_mce(u16 ec, u8 xec)
|
||||
{
|
||||
bool ret = false;
|
||||
@ -1163,11 +687,51 @@ static void decode_mc6_mce(struct mce *m)
|
||||
pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
|
||||
}
|
||||
|
||||
static const char * const smca_long_names[] = {
|
||||
[SMCA_LS ... SMCA_LS_V2] = "Load Store Unit",
|
||||
[SMCA_IF] = "Instruction Fetch Unit",
|
||||
[SMCA_L2_CACHE] = "L2 Cache",
|
||||
[SMCA_DE] = "Decode Unit",
|
||||
[SMCA_RESERVED] = "Reserved",
|
||||
[SMCA_EX] = "Execution Unit",
|
||||
[SMCA_FP] = "Floating Point Unit",
|
||||
[SMCA_L3_CACHE] = "L3 Cache",
|
||||
[SMCA_CS ... SMCA_CS_V2] = "Coherent Slave",
|
||||
[SMCA_PIE] = "Power, Interrupts, etc.",
|
||||
|
||||
/* UMC v2 is separate because both of them can exist in a single system. */
|
||||
[SMCA_UMC] = "Unified Memory Controller",
|
||||
[SMCA_UMC_V2] = "Unified Memory Controller v2",
|
||||
[SMCA_PB] = "Parameter Block",
|
||||
[SMCA_PSP ... SMCA_PSP_V2] = "Platform Security Processor",
|
||||
[SMCA_SMU ... SMCA_SMU_V2] = "System Management Unit",
|
||||
[SMCA_MP5] = "Microprocessor 5 Unit",
|
||||
[SMCA_MPDMA] = "MPDMA Unit",
|
||||
[SMCA_NBIO] = "Northbridge IO Unit",
|
||||
[SMCA_PCIE ... SMCA_PCIE_V2] = "PCI Express Unit",
|
||||
[SMCA_XGMI_PCS] = "Ext Global Memory Interconnect PCS Unit",
|
||||
[SMCA_NBIF] = "NBIF Unit",
|
||||
[SMCA_SHUB] = "System Hub Unit",
|
||||
[SMCA_SATA] = "SATA Unit",
|
||||
[SMCA_USB] = "USB Unit",
|
||||
[SMCA_GMI_PCS] = "Global Memory Interconnect PCS Unit",
|
||||
[SMCA_XGMI_PHY] = "Ext Global Memory Interconnect PHY Unit",
|
||||
[SMCA_WAFL_PHY] = "WAFL PHY Unit",
|
||||
[SMCA_GMI_PHY] = "Global Memory Interconnect PHY Unit",
|
||||
};
|
||||
|
||||
static const char *smca_get_long_name(enum smca_bank_types t)
|
||||
{
|
||||
if (t >= N_SMCA_BANK_TYPES)
|
||||
return NULL;
|
||||
|
||||
return smca_long_names[t];
|
||||
}
|
||||
|
||||
/* Decode errors according to Scalable MCA specification */
|
||||
static void decode_smca_error(struct mce *m)
|
||||
{
|
||||
enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
|
||||
const char *ip_name;
|
||||
u8 xec = XEC(m->status, xec_mask);
|
||||
|
||||
if (bank_type >= N_SMCA_BANK_TYPES)
|
||||
@ -1178,13 +742,7 @@ static void decode_smca_error(struct mce *m)
|
||||
return;
|
||||
}
|
||||
|
||||
ip_name = smca_get_long_name(bank_type);
|
||||
|
||||
pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
|
||||
|
||||
/* Only print the decode of valid error codes */
|
||||
if (xec < smca_mce_descs[bank_type].num_descs)
|
||||
pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
|
||||
pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec);
|
||||
|
||||
if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
|
||||
xec == 0 && decode_dram_ecc)
|
||||
|
Loading…
Reference in New Issue
Block a user