- First part of a series to move the AMD address translation code from
arch/x86/ to amd64_edac as that is its only user anyway - Some MCE error injection improvements to the AMD side - Reorganization of the #MC handler code and the facilities it calls to make it noinstr-safe - Add support for new AMD MCA bank types and non-uniform banks layout - The usual set of cleanups and fixes -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAmHcGZ4ACgkQEsHwGGHe VUr6Zw//WBvNvfV/akQGsvVo94G0DaF+buYB+Tl1p0goMd7QfKA5iHxjB1alEJC2 dTchIr7pjiiE3nr4svuWLLQZamx8kMwQNqipioBHXg3YThj0wD4PbUOC9TlIceBR 3yxVbvwlD7Y7sb2PII6IMlagzTiIeW0/ps29DHFr5vqDBvEanNdAHoV/h2vQi+76 Ma96psIxzTMSk11yGB6l9k66EASCdDGBU7sODjup7wuQmuRaQ/1oJAWY0wIJvJez frjpaz/YKmlTwTf9bxoJbky2FkeBsD4yXXUGwjDgMq0EyUUaeSbvaQkm8gSHX9Yr VDDv1WvT6QIw6x7Wc4skS8lWmZghNBbAHOoNS31BPJ2IDmFWkF5Q2bNEuHrtU4EC 0mkNeyN6x48L/F8j/1aE/tm+SjiGexZX4zhi6MNWReTV140I1zqQq/r7CCu5+MEa PAB1YH/96k2dMPT6mbFrRIFJmkDuBuZOAkuwYWEjO/XjPl2SGBGj1jKolWW3qjRR Po7vBJnDt7wgigWFh6+R4rJv+fh87XfB7B2wEOt4Yn37jUkK6dNRIy0zFmDaC1J2 bHgsJbWC+Sgs1G57gnYABJYzLj7RRdDyCu1/UUVyBBP7/WfZJw0kjABE7p3AaYTd 15JV1L0c/Ypuv05LJf40LkyF2F5w2fnP5QM2Rr8U4xW/GumEyWs= =8Hu7 -----END PGP SIGNATURE----- Merge tag 'ras_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull RAS updates from Borislav Petkov: "A relatively big amount of movements in RAS-land this time around: - First part of a series to move the AMD address translation code from arch/x86/ to amd64_edac as that is its only user anyway - Some MCE error injection improvements to the AMD side - Reorganization of the #MC handler code and the facilities it calls to make it noinstr-safe - Add support for new AMD MCA bank types and non-uniform banks layout - The usual set of cleanups and fixes" * tag 'ras_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits) x86/mce: Reduce number of machine checks taken during recovery x86/mce/inject: Avoid out-of-bounds write when setting flags x86/MCE/AMD, EDAC/mce_amd: Support 
non-uniform MCA bank type enumeration x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types x86/mce: Check regs before accessing it x86/mce: Mark mce_start() noinstr x86/mce: Mark mce_timed_out() noinstr x86/mce: Move the tainting outside of the noinstr region x86/mce: Mark mce_read_aux() noinstr x86/mce: Mark mce_end() noinstr x86/mce: Mark mce_panic() noinstr x86/mce: Prevent severity computation from being instrumented x86/mce: Allow instrumentation during task work queueing x86/mce: Remove noinstr annotation from mce_setup() x86/mce: Use mce_rdmsrl() in severity checking code x86/mce: Remove function-local cpus variables x86/mce: Do not use memset to clear the banks bitmaps x86/mce/inject: Set the valid bit in MCA_STATUS before error injection x86/mce/inject: Check if a bank is populated before injecting x86/mce: Get rid of cpu_missing ...
This commit is contained in:
commit
7e740ae635
@ -24,7 +24,6 @@ extern int amd_set_subcaches(int, unsigned long);
|
||||
|
||||
extern int amd_smn_read(u16 node, u32 address, u32 *value);
|
||||
extern int amd_smn_write(u16 node, u32 address, u32 value);
|
||||
extern int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo);
|
||||
|
||||
struct amd_l3_cache {
|
||||
unsigned indices;
|
||||
|
@ -313,31 +313,22 @@ enum smca_bank_types {
|
||||
SMCA_SMU, /* System Management Unit */
|
||||
SMCA_SMU_V2,
|
||||
SMCA_MP5, /* Microprocessor 5 Unit */
|
||||
SMCA_MPDMA, /* MPDMA Unit */
|
||||
SMCA_NBIO, /* Northbridge IO Unit */
|
||||
SMCA_PCIE, /* PCI Express Unit */
|
||||
SMCA_PCIE_V2,
|
||||
SMCA_XGMI_PCS, /* xGMI PCS Unit */
|
||||
SMCA_NBIF, /* NBIF Unit */
|
||||
SMCA_SHUB, /* System HUB Unit */
|
||||
SMCA_SATA, /* SATA Unit */
|
||||
SMCA_USB, /* USB Unit */
|
||||
SMCA_GMI_PCS, /* GMI PCS Unit */
|
||||
SMCA_XGMI_PHY, /* xGMI PHY Unit */
|
||||
SMCA_WAFL_PHY, /* WAFL PHY Unit */
|
||||
SMCA_GMI_PHY, /* GMI PHY Unit */
|
||||
N_SMCA_BANK_TYPES
|
||||
};
|
||||
|
||||
#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
|
||||
|
||||
struct smca_hwid {
|
||||
unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
|
||||
u32 hwid_mcatype; /* (hwid,mcatype) tuple */
|
||||
u8 count; /* Number of instances. */
|
||||
};
|
||||
|
||||
struct smca_bank {
|
||||
struct smca_hwid *hwid;
|
||||
u32 id; /* Value of MCA_IPID[InstanceId]. */
|
||||
u8 sysfs_id; /* Value used for sysfs name. */
|
||||
};
|
||||
|
||||
extern struct smca_bank smca_banks[MAX_NR_BANKS];
|
||||
|
||||
extern const char *smca_get_long_name(enum smca_bank_types t);
|
||||
extern bool amd_mce_is_memory_error(struct mce *m);
|
||||
|
||||
@ -345,16 +336,13 @@ extern int mce_threshold_create_device(unsigned int cpu);
|
||||
extern int mce_threshold_remove_device(unsigned int cpu);
|
||||
|
||||
void mce_amd_feature_init(struct cpuinfo_x86 *c);
|
||||
int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr);
|
||||
enum smca_bank_types smca_get_bank_type(unsigned int bank);
|
||||
enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank);
|
||||
#else
|
||||
|
||||
static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
|
||||
static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
|
||||
static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
|
||||
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
|
||||
static inline int
|
||||
umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; };
|
||||
#endif
|
||||
|
||||
static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); }
|
||||
|
@ -29,7 +29,7 @@
|
||||
#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
|
||||
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
|
||||
|
||||
/* Protect the PCI config register pairs used for SMN and DF indirect access. */
|
||||
/* Protect the PCI config register pairs used for SMN. */
|
||||
static DEFINE_MUTEX(smn_mutex);
|
||||
|
||||
static u32 *flush_words;
|
||||
@ -182,53 +182,6 @@ int amd_smn_write(u16 node, u32 address, u32 value)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(amd_smn_write);
|
||||
|
||||
/*
|
||||
* Data Fabric Indirect Access uses FICAA/FICAD.
|
||||
*
|
||||
* Fabric Indirect Configuration Access Address (FICAA): Constructed based
|
||||
* on the device's Instance Id and the PCI function and register offset of
|
||||
* the desired register.
|
||||
*
|
||||
* Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
|
||||
* and FICAD HI registers but so far we only need the LO register.
|
||||
*/
|
||||
int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
|
||||
{
|
||||
struct pci_dev *F4;
|
||||
u32 ficaa;
|
||||
/* Default result: both early exits below return -ENODEV. */
int err = -ENODEV;
|
||||
|
||||
/* Reject node indices at or beyond the number of cached northbridges. */
if (node >= amd_northbridges.num)
|
||||
goto out;
|
||||
|
||||
/* The indirect access goes through this node's ->link PCI device. */
F4 = node_to_amd_nb(node)->link;
|
||||
if (!F4)
|
||||
goto out;
|
||||
|
||||
/*
 * Assemble the FICAA value: bit 0 set, the register offset masked with
 * 0x3FC, the low three bits of the function number at bit 11 and the
 * instance id starting at bit 16.
 */
ficaa = 1;
|
||||
ficaa |= reg & 0x3FC;
|
||||
ficaa |= (func & 0x7) << 11;
|
||||
ficaa |= instance_id << 16;
|
||||
|
||||
/*
 * Hold smn_mutex across the address write and the data read so the
 * register pair is used as one unit.
 */
mutex_lock(&smn_mutex);
|
||||
|
||||
/* Write the address word to config offset 0x5C. */
err = pci_write_config_dword(F4, 0x5C, ficaa);
|
||||
if (err) {
|
||||
pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/* Read the low data word from config offset 0x98 into *lo. */
err = pci_read_config_dword(F4, 0x98, lo);
|
||||
if (err)
|
||||
pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
|
||||
|
||||
out_unlock:
|
||||
mutex_unlock(&smn_mutex);
|
||||
|
||||
out:
|
||||
/* Either -ENODEV or the result of the last PCI config accessor called. */
return err;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(amd_df_indirect_read);
|
||||
|
||||
int amd_cache_northbridges(void)
|
||||
{
|
||||
|
@ -71,6 +71,22 @@ static const char * const smca_umc_block_names[] = {
|
||||
"misc_umc"
|
||||
};
|
||||
|
||||
#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
|
||||
|
||||
struct smca_hwid {
|
||||
unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
|
||||
u32 hwid_mcatype; /* (hwid,mcatype) tuple */
|
||||
};
|
||||
|
||||
struct smca_bank {
|
||||
const struct smca_hwid *hwid;
|
||||
u32 id; /* Value of MCA_IPID[InstanceId]. */
|
||||
u8 sysfs_id; /* Value used for sysfs name. */
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
|
||||
static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
|
||||
|
||||
struct smca_bank_name {
|
||||
const char *name; /* Short name for sysfs */
|
||||
const char *long_name; /* Long name for pretty-printing */
|
||||
@ -95,11 +111,18 @@ static struct smca_bank_name smca_names[] = {
|
||||
[SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
|
||||
[SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
|
||||
[SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
|
||||
[SMCA_MPDMA] = { "mpdma", "MPDMA Unit" },
|
||||
[SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
|
||||
[SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" },
|
||||
[SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" },
|
||||
[SMCA_NBIF] = { "nbif", "NBIF Unit" },
|
||||
[SMCA_SHUB] = { "shub", "System Hub Unit" },
|
||||
[SMCA_SATA] = { "sata", "SATA Unit" },
|
||||
[SMCA_USB] = { "usb", "USB Unit" },
|
||||
[SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" },
|
||||
[SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
|
||||
[SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
|
||||
[SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" },
|
||||
};
|
||||
|
||||
static const char *smca_get_name(enum smca_bank_types t)
|
||||
@ -119,14 +142,14 @@ const char *smca_get_long_name(enum smca_bank_types t)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(smca_get_long_name);
|
||||
|
||||
enum smca_bank_types smca_get_bank_type(unsigned int bank)
|
||||
enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
|
||||
{
|
||||
struct smca_bank *b;
|
||||
|
||||
if (bank >= MAX_NR_BANKS)
|
||||
return N_SMCA_BANK_TYPES;
|
||||
|
||||
b = &smca_banks[bank];
|
||||
b = &per_cpu(smca_banks, cpu)[bank];
|
||||
if (!b->hwid)
|
||||
return N_SMCA_BANK_TYPES;
|
||||
|
||||
@ -134,7 +157,7 @@ enum smca_bank_types smca_get_bank_type(unsigned int bank)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(smca_get_bank_type);
|
||||
|
||||
static struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
static const struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
/* { bank_type, hwid_mcatype } */
|
||||
|
||||
/* Reserved type */
|
||||
@ -174,6 +197,9 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
/* Microprocessor 5 Unit MCA type */
|
||||
{ SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
|
||||
|
||||
/* MPDMA MCA type */
|
||||
{ SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) },
|
||||
|
||||
/* Northbridge IO Unit MCA type */
|
||||
{ SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
|
||||
|
||||
@ -181,19 +207,17 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
{ SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
|
||||
{ SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
|
||||
|
||||
/* xGMI PCS MCA type */
|
||||
{ SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
|
||||
|
||||
/* xGMI PHY MCA type */
|
||||
{ SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) },
|
||||
{ SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
|
||||
{ SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
|
||||
{ SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
|
||||
{ SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
|
||||
{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
|
||||
|
||||
/* WAFL PHY MCA type */
|
||||
{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
|
||||
{ SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
|
||||
};
|
||||
|
||||
struct smca_bank smca_banks[MAX_NR_BANKS];
|
||||
EXPORT_SYMBOL_GPL(smca_banks);
|
||||
|
||||
/*
|
||||
* In SMCA enabled processors, we can have multiple banks for a given IP type.
|
||||
* So to define a unique name for each bank, we use a temp c-string to append
|
||||
@ -249,8 +273,9 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
|
||||
|
||||
static void smca_configure(unsigned int bank, unsigned int cpu)
|
||||
{
|
||||
u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
|
||||
const struct smca_hwid *s_hwid;
|
||||
unsigned int i, hwid_mcatype;
|
||||
struct smca_hwid *s_hwid;
|
||||
u32 high, low;
|
||||
u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
|
||||
|
||||
@ -286,10 +311,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
|
||||
|
||||
smca_set_misc_banks_map(bank, cpu);
|
||||
|
||||
/* Return early if this bank was already initialized. */
|
||||
if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0)
|
||||
return;
|
||||
|
||||
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
|
||||
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
|
||||
return;
|
||||
@ -300,10 +321,11 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
|
||||
s_hwid = &smca_hwid_mcatypes[i];
|
||||
|
||||
if (hwid_mcatype == s_hwid->hwid_mcatype) {
|
||||
smca_banks[bank].hwid = s_hwid;
|
||||
smca_banks[bank].id = low;
|
||||
smca_banks[bank].sysfs_id = s_hwid->count++;
|
||||
this_cpu_ptr(smca_banks)[bank].hwid = s_hwid;
|
||||
this_cpu_ptr(smca_banks)[bank].id = low;
|
||||
this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -589,7 +611,7 @@ out:
|
||||
|
||||
bool amd_filter_mce(struct mce *m)
|
||||
{
|
||||
enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
|
||||
enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
|
||||
struct cpuinfo_x86 *c = &boot_cpu_data;
|
||||
|
||||
/* See Family 17h Models 10h-2Fh Erratum #1114. */
|
||||
@ -627,7 +649,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
|
||||
} else if (c->x86 == 0x17 &&
|
||||
(c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {
|
||||
|
||||
if (smca_get_bank_type(bank) != SMCA_IF)
|
||||
if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF)
|
||||
return;
|
||||
|
||||
msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
|
||||
@ -689,213 +711,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
|
||||
deferred_error_interrupt_enable(c);
|
||||
}
|
||||
|
||||
int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
|
||||
{
|
||||
u64 dram_base_addr, dram_limit_addr, dram_hole_base;
|
||||
/* We start from the normalized address */
|
||||
u64 ret_addr = norm_addr;
|
||||
|
||||
u32 tmp;
|
||||
|
||||
u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
|
||||
u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
|
||||
u8 intlv_addr_sel, intlv_addr_bit;
|
||||
u8 num_intlv_bits, hashed_bit;
|
||||
u8 lgcy_mmio_hole_en, base = 0;
|
||||
u8 cs_mask, cs_id = 0;
|
||||
bool hash_enabled = false;
|
||||
|
||||
/* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
|
||||
if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
|
||||
goto out_err;
|
||||
|
||||
/* Remove HiAddrOffset from normalized address, if enabled: */
|
||||
if (tmp & BIT(0)) {
|
||||
u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
|
||||
|
||||
if (norm_addr >= hi_addr_offset) {
|
||||
ret_addr -= hi_addr_offset;
|
||||
base = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Read D18F0x110 (DramBaseAddress). */
|
||||
if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
|
||||
goto out_err;
|
||||
|
||||
/* Check if address range is valid. */
|
||||
if (!(tmp & BIT(0))) {
|
||||
pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
|
||||
__func__, tmp);
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
lgcy_mmio_hole_en = tmp & BIT(1);
|
||||
intlv_num_chan = (tmp >> 4) & 0xF;
|
||||
intlv_addr_sel = (tmp >> 8) & 0x7;
|
||||
dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16;
|
||||
|
||||
/* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
|
||||
if (intlv_addr_sel > 3) {
|
||||
pr_err("%s: Invalid interleave address select %d.\n",
|
||||
__func__, intlv_addr_sel);
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
/* Read D18F0x114 (DramLimitAddress). */
|
||||
if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
|
||||
goto out_err;
|
||||
|
||||
intlv_num_sockets = (tmp >> 8) & 0x1;
|
||||
intlv_num_dies = (tmp >> 10) & 0x3;
|
||||
dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
|
||||
|
||||
intlv_addr_bit = intlv_addr_sel + 8;
|
||||
|
||||
/* Re-use intlv_num_chan by setting it equal to log2(#channels) */
|
||||
switch (intlv_num_chan) {
|
||||
case 0: intlv_num_chan = 0; break;
|
||||
case 1: intlv_num_chan = 1; break;
|
||||
case 3: intlv_num_chan = 2; break;
|
||||
case 5: intlv_num_chan = 3; break;
|
||||
case 7: intlv_num_chan = 4; break;
|
||||
|
||||
case 8: intlv_num_chan = 1;
|
||||
hash_enabled = true;
|
||||
break;
|
||||
default:
|
||||
pr_err("%s: Invalid number of interleaved channels %d.\n",
|
||||
__func__, intlv_num_chan);
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
num_intlv_bits = intlv_num_chan;
|
||||
|
||||
if (intlv_num_dies > 2) {
|
||||
pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
|
||||
__func__, intlv_num_dies);
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
num_intlv_bits += intlv_num_dies;
|
||||
|
||||
/* Add a bit if sockets are interleaved. */
|
||||
num_intlv_bits += intlv_num_sockets;
|
||||
|
||||
/* Assert num_intlv_bits <= 4 */
|
||||
if (num_intlv_bits > 4) {
|
||||
pr_err("%s: Invalid interleave bits %d.\n",
|
||||
__func__, num_intlv_bits);
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
if (num_intlv_bits > 0) {
|
||||
u64 temp_addr_x, temp_addr_i, temp_addr_y;
|
||||
u8 die_id_bit, sock_id_bit, cs_fabric_id;
|
||||
|
||||
/*
|
||||
* Read FabricBlockInstanceInformation3_CS[BlockFabricID].
|
||||
* This is the fabric id for this coherent slave. Use
|
||||
* umc/channel# as instance id of the coherent slave
|
||||
* for FICAA.
|
||||
*/
|
||||
if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
|
||||
goto out_err;
|
||||
|
||||
cs_fabric_id = (tmp >> 8) & 0xFF;
|
||||
die_id_bit = 0;
|
||||
|
||||
/* If interleaved over more than 1 channel: */
|
||||
if (intlv_num_chan) {
|
||||
die_id_bit = intlv_num_chan;
|
||||
cs_mask = (1 << die_id_bit) - 1;
|
||||
cs_id = cs_fabric_id & cs_mask;
|
||||
}
|
||||
|
||||
sock_id_bit = die_id_bit;
|
||||
|
||||
/* Read D18F1x208 (SystemFabricIdMask). */
|
||||
if (intlv_num_dies || intlv_num_sockets)
|
||||
if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
|
||||
goto out_err;
|
||||
|
||||
/* If interleaved over more than 1 die. */
|
||||
if (intlv_num_dies) {
|
||||
sock_id_bit = die_id_bit + intlv_num_dies;
|
||||
die_id_shift = (tmp >> 24) & 0xF;
|
||||
die_id_mask = (tmp >> 8) & 0xFF;
|
||||
|
||||
cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
|
||||
}
|
||||
|
||||
/* If interleaved over more than 1 socket. */
|
||||
if (intlv_num_sockets) {
|
||||
socket_id_shift = (tmp >> 28) & 0xF;
|
||||
socket_id_mask = (tmp >> 16) & 0xFF;
|
||||
|
||||
cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
|
||||
}
|
||||
|
||||
/*
|
||||
* The pre-interleaved address consists of XXXXXXIIIYYYYY
|
||||
* where III is the ID for this CS, and XXXXXXYYYYY are the
|
||||
* address bits from the post-interleaved address.
|
||||
* "num_intlv_bits" has been calculated to tell us how many "I"
|
||||
* bits there are. "intlv_addr_bit" tells us how many "Y" bits
|
||||
* there are (where "I" starts).
|
||||
*/
|
||||
temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
|
||||
temp_addr_i = (cs_id << intlv_addr_bit);
|
||||
temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
|
||||
ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
|
||||
}
|
||||
|
||||
/* Add dram base address */
|
||||
ret_addr += dram_base_addr;
|
||||
|
||||
/* If legacy MMIO hole enabled */
|
||||
if (lgcy_mmio_hole_en) {
|
||||
if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
|
||||
goto out_err;
|
||||
|
||||
dram_hole_base = tmp & GENMASK(31, 24);
|
||||
if (ret_addr >= dram_hole_base)
|
||||
ret_addr += (BIT_ULL(32) - dram_hole_base);
|
||||
}
|
||||
|
||||
if (hash_enabled) {
|
||||
/* Save some parentheses and grab ls-bit at the end. */
|
||||
hashed_bit = (ret_addr >> 12) ^
|
||||
(ret_addr >> 18) ^
|
||||
(ret_addr >> 21) ^
|
||||
(ret_addr >> 30) ^
|
||||
cs_id;
|
||||
|
||||
hashed_bit &= BIT(0);
|
||||
|
||||
if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
|
||||
ret_addr ^= BIT(intlv_addr_bit);
|
||||
}
|
||||
|
||||
/* Is the calculated system address above the DRAM limit address? */
|
||||
if (ret_addr > dram_limit_addr)
|
||||
goto out_err;
|
||||
|
||||
*sys_addr = ret_addr;
|
||||
return 0;
|
||||
|
||||
out_err:
|
||||
return -EINVAL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
|
||||
|
||||
bool amd_mce_is_memory_error(struct mce *m)
|
||||
{
|
||||
/* ErrCodeExt[20:16] */
|
||||
u8 xec = (m->status >> 16) & 0x1f;
|
||||
|
||||
if (mce_flags.smca)
|
||||
return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0;
|
||||
return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0;
|
||||
|
||||
return m->bank == 4 && xec == 0x8;
|
||||
}
|
||||
@ -1211,7 +1033,7 @@ static struct kobj_type threshold_ktype = {
|
||||
.release = threshold_block_release,
|
||||
};
|
||||
|
||||
static const char *get_name(unsigned int bank, struct threshold_block *b)
|
||||
static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b)
|
||||
{
|
||||
enum smca_bank_types bank_type;
|
||||
|
||||
@ -1222,7 +1044,7 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
|
||||
return th_names[bank];
|
||||
}
|
||||
|
||||
bank_type = smca_get_bank_type(bank);
|
||||
bank_type = smca_get_bank_type(cpu, bank);
|
||||
if (bank_type >= N_SMCA_BANK_TYPES)
|
||||
return NULL;
|
||||
|
||||
@ -1232,12 +1054,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (smca_banks[bank].hwid->count == 1)
|
||||
if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1)
|
||||
return smca_get_name(bank_type);
|
||||
|
||||
snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
|
||||
"%s_%x", smca_get_name(bank_type),
|
||||
smca_banks[bank].sysfs_id);
|
||||
"%s_%u", smca_get_name(bank_type),
|
||||
per_cpu(smca_banks, cpu)[bank].sysfs_id);
|
||||
return buf_mcatype;
|
||||
}
|
||||
|
||||
@ -1293,7 +1115,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
|
||||
else
|
||||
tb->blocks = b;
|
||||
|
||||
err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b));
|
||||
err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
|
||||
if (err)
|
||||
goto out_free;
|
||||
recurse:
|
||||
@ -1348,7 +1170,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
|
||||
struct device *dev = this_cpu_read(mce_device);
|
||||
struct amd_northbridge *nb = NULL;
|
||||
struct threshold_bank *b = NULL;
|
||||
const char *name = get_name(bank, NULL);
|
||||
const char *name = get_name(cpu, bank, NULL);
|
||||
int err = 0;
|
||||
|
||||
if (!dev)
|
||||
|
@ -99,7 +99,6 @@ struct mca_config mca_cfg __read_mostly = {
|
||||
|
||||
static DEFINE_PER_CPU(struct mce, mces_seen);
|
||||
static unsigned long mce_need_notify;
|
||||
static int cpu_missing;
|
||||
|
||||
/*
|
||||
* MCA banks polled by the period polling timer for corrected events.
|
||||
@ -128,7 +127,7 @@ static struct irq_work mce_irq_work;
|
||||
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
|
||||
|
||||
/* Do initial initialization of a struct mce */
|
||||
noinstr void mce_setup(struct mce *m)
|
||||
void mce_setup(struct mce *m)
|
||||
{
|
||||
memset(m, 0, sizeof(struct mce));
|
||||
m->cpu = m->extcpu = smp_processor_id();
|
||||
@ -267,11 +266,17 @@ static void wait_for_panic(void)
|
||||
panic("Panicing machine check CPU died");
|
||||
}
|
||||
|
||||
static void mce_panic(const char *msg, struct mce *final, char *exp)
|
||||
static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
|
||||
{
|
||||
int apei_err = 0;
|
||||
struct llist_node *pending;
|
||||
struct mce_evt_llist *l;
|
||||
int apei_err = 0;
|
||||
|
||||
/*
|
||||
* Allow instrumentation around external facilities usage. Not that it
|
||||
* matters a whole lot since the machine is going to panic anyway.
|
||||
*/
|
||||
instrumentation_begin();
|
||||
|
||||
if (!fake_panic) {
|
||||
/*
|
||||
@ -286,7 +291,7 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
|
||||
} else {
|
||||
/* Don't log too much for fake panic */
|
||||
if (atomic_inc_return(&mce_fake_panicked) > 1)
|
||||
return;
|
||||
goto out;
|
||||
}
|
||||
pending = mce_gen_pool_prepare_records();
|
||||
/* First print corrected ones that are still unlogged */
|
||||
@ -314,8 +319,6 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
|
||||
if (!apei_err)
|
||||
apei_err = apei_write_mce(final);
|
||||
}
|
||||
if (cpu_missing)
|
||||
pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
|
||||
if (exp)
|
||||
pr_emerg(HW_ERR "Machine check: %s\n", exp);
|
||||
if (!fake_panic) {
|
||||
@ -324,6 +327,9 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
|
||||
panic(msg);
|
||||
} else
|
||||
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
|
||||
|
||||
out:
|
||||
instrumentation_end();
|
||||
}
|
||||
|
||||
/* Support code for software error injection */
|
||||
@ -365,7 +371,7 @@ void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
|
||||
}
|
||||
|
||||
/* MSR access wrappers used for error injection */
|
||||
static noinstr u64 mce_rdmsrl(u32 msr)
|
||||
noinstr u64 mce_rdmsrl(u32 msr)
|
||||
{
|
||||
DECLARE_ARGS(val, low, high);
|
||||
|
||||
@ -433,9 +439,15 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
|
||||
* check into our "mce" struct so that we can use it later to assess
|
||||
* the severity of the problem as we read per-bank specific details.
|
||||
*/
|
||||
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
|
||||
static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
|
||||
{
|
||||
/*
|
||||
* Enable instrumentation around mce_setup() which calls external
|
||||
* facilities.
|
||||
*/
|
||||
instrumentation_begin();
|
||||
mce_setup(m);
|
||||
instrumentation_end();
|
||||
|
||||
m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
|
||||
if (regs) {
|
||||
@ -636,7 +648,7 @@ static struct notifier_block mce_default_nb = {
|
||||
/*
|
||||
* Read ADDR and MISC registers.
|
||||
*/
|
||||
static void mce_read_aux(struct mce *m, int i)
|
||||
static noinstr void mce_read_aux(struct mce *m, int i)
|
||||
{
|
||||
if (m->status & MCI_STATUS_MISCV)
|
||||
m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
|
||||
@ -871,8 +883,13 @@ static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
|
||||
/*
|
||||
* Check if a timeout waiting for other CPUs happened.
|
||||
*/
|
||||
static int mce_timed_out(u64 *t, const char *msg)
|
||||
static noinstr int mce_timed_out(u64 *t, const char *msg)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
/* Enable instrumentation around calls to external facilities */
|
||||
instrumentation_begin();
|
||||
|
||||
/*
|
||||
* The others already did panic for some reason.
|
||||
* Bail out like in a timeout.
|
||||
@ -891,13 +908,17 @@ static int mce_timed_out(u64 *t, const char *msg)
|
||||
cpumask_pr_args(&mce_missing_cpus));
|
||||
mce_panic(msg, NULL, NULL);
|
||||
}
|
||||
cpu_missing = 1;
|
||||
return 1;
|
||||
ret = 1;
|
||||
goto out;
|
||||
}
|
||||
*t -= SPINUNIT;
|
||||
|
||||
out:
|
||||
touch_nmi_watchdog();
|
||||
return 0;
|
||||
|
||||
instrumentation_end();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -986,14 +1007,13 @@ static atomic_t global_nwo;
|
||||
* in the entry order.
|
||||
* TBD double check parallel CPU hotunplug
|
||||
*/
|
||||
static int mce_start(int *no_way_out)
|
||||
static noinstr int mce_start(int *no_way_out)
|
||||
{
|
||||
int order;
|
||||
int cpus = num_online_cpus();
|
||||
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
|
||||
int order, ret = -1;
|
||||
|
||||
if (!timeout)
|
||||
return -1;
|
||||
return ret;
|
||||
|
||||
atomic_add(*no_way_out, &global_nwo);
|
||||
/*
|
||||
@ -1003,14 +1023,17 @@ static int mce_start(int *no_way_out)
|
||||
order = atomic_inc_return(&mce_callin);
|
||||
cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
|
||||
|
||||
/* Enable instrumentation around calls to external facilities */
|
||||
instrumentation_begin();
|
||||
|
||||
/*
|
||||
* Wait for everyone.
|
||||
*/
|
||||
while (atomic_read(&mce_callin) != cpus) {
|
||||
while (atomic_read(&mce_callin) != num_online_cpus()) {
|
||||
if (mce_timed_out(&timeout,
|
||||
"Timeout: Not all CPUs entered broadcast exception handler")) {
|
||||
atomic_set(&global_nwo, 0);
|
||||
return -1;
|
||||
goto out;
|
||||
}
|
||||
ndelay(SPINUNIT);
|
||||
}
|
||||
@ -1036,7 +1059,7 @@ static int mce_start(int *no_way_out)
|
||||
if (mce_timed_out(&timeout,
|
||||
"Timeout: Subject CPUs unable to finish machine check processing")) {
|
||||
atomic_set(&global_nwo, 0);
|
||||
return -1;
|
||||
goto out;
|
||||
}
|
||||
ndelay(SPINUNIT);
|
||||
}
|
||||
@ -1047,17 +1070,25 @@ static int mce_start(int *no_way_out)
|
||||
*/
|
||||
*no_way_out = atomic_read(&global_nwo);
|
||||
|
||||
return order;
|
||||
ret = order;
|
||||
|
||||
out:
|
||||
instrumentation_end();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Synchronize between CPUs after main scanning loop.
|
||||
* This invokes the bulk of the Monarch processing.
|
||||
*/
|
||||
static int mce_end(int order)
|
||||
static noinstr int mce_end(int order)
|
||||
{
|
||||
int ret = -1;
|
||||
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
|
||||
int ret = -1;
|
||||
|
||||
/* Allow instrumentation around external facilities. */
|
||||
instrumentation_begin();
|
||||
|
||||
if (!timeout)
|
||||
goto reset;
|
||||
@ -1070,14 +1101,11 @@ static int mce_end(int order)
|
||||
atomic_inc(&mce_executing);
|
||||
|
||||
if (order == 1) {
|
||||
/* CHECKME: Can this race with a parallel hotplug? */
|
||||
int cpus = num_online_cpus();
|
||||
|
||||
/*
|
||||
* Monarch: Wait for everyone to go through their scanning
|
||||
* loops.
|
||||
*/
|
||||
while (atomic_read(&mce_executing) <= cpus) {
|
||||
while (atomic_read(&mce_executing) <= num_online_cpus()) {
|
||||
if (mce_timed_out(&timeout,
|
||||
"Timeout: Monarch CPU unable to finish machine check processing"))
|
||||
goto reset;
|
||||
@ -1101,7 +1129,8 @@ static int mce_end(int order)
|
||||
/*
|
||||
* Don't reset anything. That's done by the Monarch.
|
||||
*/
|
||||
return 0;
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1117,6 +1146,10 @@ reset:
|
||||
* Let others run again.
|
||||
*/
|
||||
atomic_set(&mce_executing, 0);
|
||||
|
||||
out:
|
||||
instrumentation_end();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1165,13 +1198,14 @@ static noinstr bool mce_check_crashing_cpu(void)
|
||||
return false;
|
||||
}
|
||||
|
||||
static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
|
||||
unsigned long *toclear, unsigned long *valid_banks,
|
||||
int no_way_out, int *worst)
|
||||
static __always_inline int
|
||||
__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
|
||||
unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
|
||||
int *worst)
|
||||
{
|
||||
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
|
||||
struct mca_config *cfg = &mca_cfg;
|
||||
int severity, i;
|
||||
int severity, i, taint = 0;
|
||||
|
||||
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
|
||||
__clear_bit(i, toclear);
|
||||
@ -1198,7 +1232,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
|
||||
continue;
|
||||
|
||||
/* Set taint even when machine check was not enabled. */
|
||||
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
|
||||
taint++;
|
||||
|
||||
severity = mce_severity(m, regs, cfg->tolerant, NULL, true);
|
||||
|
||||
@ -1221,7 +1255,13 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
|
||||
/* assuming valid severity level != 0 */
|
||||
m->severity = severity;
|
||||
|
||||
/*
|
||||
* Enable instrumentation around the mce_log() call which is
|
||||
* done in #MC context, where instrumentation is disabled.
|
||||
*/
|
||||
instrumentation_begin();
|
||||
mce_log(m);
|
||||
instrumentation_end();
|
||||
|
||||
if (severity > *worst) {
|
||||
*final = *m;
|
||||
@ -1231,6 +1271,8 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
|
||||
|
||||
/* mce_clear_state will clear *final, save locally for use later */
|
||||
*m = *final;
|
||||
|
||||
return taint;
|
||||
}
|
||||
|
||||
static void kill_me_now(struct callback_head *ch)
|
||||
@ -1320,11 +1362,11 @@ static noinstr void unexpected_machine_check(struct pt_regs *regs)
|
||||
}
|
||||
|
||||
/*
|
||||
* The actual machine check handler. This only handles real
|
||||
* exceptions when something got corrupted coming in through int 18.
|
||||
* The actual machine check handler. This only handles real exceptions when
|
||||
* something got corrupted coming in through int 18.
|
||||
*
|
||||
* This is executed in NMI context not subject to normal locking rules. This
|
||||
* implies that most kernel services cannot be safely used. Don't even
|
||||
* This is executed in #MC context not subject to normal locking rules.
|
||||
* This implies that most kernel services cannot be safely used. Don't even
|
||||
* think about putting a printk in there!
|
||||
*
|
||||
* On Intel systems this is entered on all CPUs in parallel through
|
||||
@ -1336,12 +1378,20 @@ static noinstr void unexpected_machine_check(struct pt_regs *regs)
|
||||
* issues: if the machine check was due to a failure of the memory
|
||||
* backing the user stack, tracing that reads the user stack will cause
|
||||
* potentially infinite recursion.
|
||||
*
|
||||
* Currently, the #MC handler calls out to a number of external facilities
|
||||
* and, therefore, allows instrumentation around them. The optimal thing to
|
||||
* have would be to do the absolutely minimal work required in #MC context
|
||||
* and have instrumentation disabled only around that. Further processing can
|
||||
* then happen in process context where instrumentation is allowed. Achieving
|
||||
* that requires careful auditing and modifications. Until then, the code
|
||||
* allows instrumentation temporarily, where required.
|
||||
*/
|
||||
noinstr void do_machine_check(struct pt_regs *regs)
|
||||
{
|
||||
int worst = 0, order, no_way_out, kill_current_task, lmce;
|
||||
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
|
||||
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
|
||||
int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
|
||||
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
|
||||
DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
|
||||
struct mca_config *cfg = &mca_cfg;
|
||||
struct mce m, *final;
|
||||
char *msg = NULL;
|
||||
@ -1385,7 +1435,6 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
final = this_cpu_ptr(&mces_seen);
|
||||
*final = m;
|
||||
|
||||
memset(valid_banks, 0, sizeof(valid_banks));
|
||||
no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
|
||||
|
||||
barrier();
|
||||
@ -1419,7 +1468,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
order = mce_start(&no_way_out);
|
||||
}
|
||||
|
||||
__mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
|
||||
taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
|
||||
|
||||
if (!no_way_out)
|
||||
mce_clear_state(toclear);
|
||||
@ -1451,6 +1500,16 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Enable instrumentation around the external facilities like task_work_add()
|
||||
* (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
|
||||
* properly would need a lot more involved reorganization.
|
||||
*/
|
||||
instrumentation_begin();
|
||||
|
||||
if (taint)
|
||||
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
|
||||
|
||||
if (worst != MCE_AR_SEVERITY && !kill_current_task)
|
||||
goto out;
|
||||
|
||||
@ -1482,7 +1541,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
if (m.kflags & MCE_IN_KERNEL_COPYIN)
|
||||
queue_task_work(&m, msg, kill_me_never);
|
||||
}
|
||||
|
||||
out:
|
||||
instrumentation_end();
|
||||
|
||||
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(do_machine_check);
|
||||
@ -2702,7 +2764,6 @@ struct dentry *mce_get_debugfs_dir(void)
|
||||
|
||||
static void mce_reset(void)
|
||||
{
|
||||
cpu_missing = 0;
|
||||
atomic_set(&mce_fake_panicked, 0);
|
||||
atomic_set(&mce_executing, 0);
|
||||
atomic_set(&mce_callin, 0);
|
||||
|
@ -74,7 +74,6 @@ MCE_INJECT_SET(status);
|
||||
MCE_INJECT_SET(misc);
|
||||
MCE_INJECT_SET(addr);
|
||||
MCE_INJECT_SET(synd);
|
||||
MCE_INJECT_SET(ipid);
|
||||
|
||||
#define MCE_INJECT_GET(reg) \
|
||||
static int inj_##reg##_get(void *data, u64 *val) \
|
||||
@ -95,6 +94,20 @@ DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
|
||||
DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
|
||||
DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
|
||||
DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
|
||||
|
||||
/* Use the user provided IPID value on a sw injection. */
|
||||
static int inj_ipid_set(void *data, u64 val)
|
||||
{
|
||||
struct mce *m = (struct mce *)data;
|
||||
|
||||
if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
|
||||
if (inj_type == SW_INJ)
|
||||
m->ipid = val;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n");
|
||||
|
||||
static void setup_inj_struct(struct mce *m)
|
||||
@ -350,7 +363,7 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf,
|
||||
char buf[MAX_FLAG_OPT_SIZE], *__buf;
|
||||
int err;
|
||||
|
||||
if (cnt > MAX_FLAG_OPT_SIZE)
|
||||
if (!cnt || cnt > MAX_FLAG_OPT_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
if (copy_from_user(&buf, ubuf, cnt))
|
||||
@ -490,6 +503,8 @@ static void do_inject(void)
|
||||
|
||||
i_mce.tsc = rdtsc_ordered();
|
||||
|
||||
i_mce.status |= MCI_STATUS_VAL;
|
||||
|
||||
if (i_mce.misc)
|
||||
i_mce.status |= MCI_STATUS_MISCV;
|
||||
|
||||
@ -577,6 +592,33 @@ static int inj_bank_set(void *data, u64 val)
|
||||
}
|
||||
|
||||
m->bank = val;
|
||||
|
||||
/*
|
||||
* sw-only injection allows writing arbitrary values into the MCA
|
||||
* registers because it tests only the decoding paths.
|
||||
*/
|
||||
if (inj_type == SW_INJ)
|
||||
goto inject;
|
||||
|
||||
/*
|
||||
* Read IPID value to determine if a bank is populated on the target
|
||||
* CPU.
|
||||
*/
|
||||
if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
|
||||
u64 ipid;
|
||||
|
||||
if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) {
|
||||
pr_err("Error reading IPID on CPU%d\n", m->extcpu);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!ipid) {
|
||||
pr_err("Cannot inject into unpopulated bank %llu\n", val);
|
||||
return -ENODEV;
|
||||
}
|
||||
}
|
||||
|
||||
inject:
|
||||
do_inject();
|
||||
|
||||
/* Reset injection struct */
|
||||
|
@ -207,4 +207,6 @@ static inline void pentium_machine_check(struct pt_regs *regs) {}
|
||||
static inline void winchip_machine_check(struct pt_regs *regs) {}
|
||||
#endif
|
||||
|
||||
noinstr u64 mce_rdmsrl(u32 msr);
|
||||
|
||||
#endif /* __X86_MCE_INTERNAL_H__ */
|
||||
|
@ -222,6 +222,9 @@ static bool is_copy_from_user(struct pt_regs *regs)
|
||||
struct insn insn;
|
||||
int ret;
|
||||
|
||||
if (!regs)
|
||||
return false;
|
||||
|
||||
if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
|
||||
return false;
|
||||
|
||||
@ -263,24 +266,36 @@ static bool is_copy_from_user(struct pt_regs *regs)
|
||||
* distinguish an exception taken in user mode from one
|
||||
* taken in the kernel.
|
||||
*/
|
||||
static int error_context(struct mce *m, struct pt_regs *regs)
|
||||
static noinstr int error_context(struct mce *m, struct pt_regs *regs)
|
||||
{
|
||||
int fixup_type;
|
||||
bool copy_user;
|
||||
|
||||
if ((m->cs & 3) == 3)
|
||||
return IN_USER;
|
||||
|
||||
if (!mc_recoverable(m->mcgstatus))
|
||||
return IN_KERNEL;
|
||||
|
||||
switch (ex_get_fixup_type(m->ip)) {
|
||||
/* Allow instrumentation around external facilities usage. */
|
||||
instrumentation_begin();
|
||||
fixup_type = ex_get_fixup_type(m->ip);
|
||||
copy_user = is_copy_from_user(regs);
|
||||
instrumentation_end();
|
||||
|
||||
switch (fixup_type) {
|
||||
case EX_TYPE_UACCESS:
|
||||
case EX_TYPE_COPY:
|
||||
if (!regs || !is_copy_from_user(regs))
|
||||
if (!copy_user)
|
||||
return IN_KERNEL;
|
||||
m->kflags |= MCE_IN_KERNEL_COPYIN;
|
||||
fallthrough;
|
||||
|
||||
case EX_TYPE_FAULT_MCE_SAFE:
|
||||
case EX_TYPE_DEFAULT_MCE_SAFE:
|
||||
m->kflags |= MCE_IN_KERNEL_RECOV;
|
||||
return IN_KERNEL_RECOV;
|
||||
|
||||
default:
|
||||
return IN_KERNEL;
|
||||
}
|
||||
@ -288,8 +303,7 @@ static int error_context(struct mce *m, struct pt_regs *regs)
|
||||
|
||||
static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
|
||||
{
|
||||
u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
|
||||
u32 low, high;
|
||||
u64 mcx_cfg;
|
||||
|
||||
/*
|
||||
* We need to look at the following bits:
|
||||
@ -300,11 +314,10 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
|
||||
if (!mce_flags.succor)
|
||||
return MCE_PANIC_SEVERITY;
|
||||
|
||||
if (rdmsr_safe(addr, &low, &high))
|
||||
return MCE_PANIC_SEVERITY;
|
||||
mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank));
|
||||
|
||||
/* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
|
||||
if ((low & MCI_CONFIG_MCAX) &&
|
||||
if ((mcx_cfg & MCI_CONFIG_MCAX) &&
|
||||
(m->status & MCI_STATUS_TCC) &&
|
||||
(err_ctx == IN_KERNEL))
|
||||
return MCE_PANIC_SEVERITY;
|
||||
@ -317,8 +330,8 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
|
||||
* See AMD Error Scope Hierarchy table in a newer BKDG. For example
|
||||
* 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
|
||||
*/
|
||||
static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant,
|
||||
char **msg, bool is_excp)
|
||||
static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant,
|
||||
char **msg, bool is_excp)
|
||||
{
|
||||
enum context ctx = error_context(m, regs);
|
||||
|
||||
@ -370,8 +383,8 @@ static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant,
|
||||
return MCE_KEEP_SEVERITY;
|
||||
}
|
||||
|
||||
static int mce_severity_intel(struct mce *m, struct pt_regs *regs,
|
||||
int tolerant, char **msg, bool is_excp)
|
||||
static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs,
|
||||
int tolerant, char **msg, bool is_excp)
|
||||
{
|
||||
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
|
||||
enum context ctx = error_context(m, regs);
|
||||
@ -407,8 +420,8 @@ static int mce_severity_intel(struct mce *m, struct pt_regs *regs,
|
||||
}
|
||||
}
|
||||
|
||||
int mce_severity(struct mce *m, struct pt_regs *regs, int tolerant, char **msg,
|
||||
bool is_excp)
|
||||
int noinstr mce_severity(struct mce *m, struct pt_regs *regs, int tolerant, char **msg,
|
||||
bool is_excp)
|
||||
{
|
||||
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
|
||||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
|
||||
|
@ -225,6 +225,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
|
||||
* Don't try to copy the tail if machine check happened
|
||||
*
|
||||
* Input:
|
||||
* eax trap number written by ex_handler_copy()
|
||||
* rdi destination
|
||||
* rsi source
|
||||
* rdx count
|
||||
@ -233,12 +234,20 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
|
||||
* eax uncopied bytes or 0 if successful.
|
||||
*/
|
||||
SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
|
||||
cmp $X86_TRAP_MC,%eax
|
||||
je 3f
|
||||
|
||||
movl %edx,%ecx
|
||||
1: rep movsb
|
||||
2: mov %ecx,%eax
|
||||
ASM_CLAC
|
||||
ret
|
||||
|
||||
3:
|
||||
movl %edx,%eax
|
||||
ASM_CLAC
|
||||
RET
|
||||
|
||||
_ASM_EXTABLE_CPY(1b, 2b)
|
||||
SYM_CODE_END(.Lcopy_user_handle_tail)
|
||||
|
||||
|
@ -988,6 +988,281 @@ static int sys_addr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr)
|
||||
return csrow;
|
||||
}
|
||||
|
||||
/* Protect the PCI config register pairs used for DF indirect access. */
|
||||
static DEFINE_MUTEX(df_indirect_mutex);
|
||||
|
||||
/*
|
||||
* Data Fabric Indirect Access uses FICAA/FICAD.
|
||||
*
|
||||
* Fabric Indirect Configuration Access Address (FICAA): Constructed based
|
||||
* on the device's Instance Id and the PCI function and register offset of
|
||||
* the desired register.
|
||||
*
|
||||
* Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
|
||||
* and FICAD HI registers but so far we only need the LO register.
|
||||
*
|
||||
* Use Instance Id 0xFF to indicate a broadcast read.
|
||||
*/
|
||||
#define DF_BROADCAST 0xFF
|
||||
static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
|
||||
{
|
||||
struct pci_dev *F4;
|
||||
u32 ficaa;
|
||||
int err = -ENODEV;
|
||||
|
||||
if (node >= amd_nb_num())
|
||||
goto out;
|
||||
|
||||
F4 = node_to_amd_nb(node)->link;
|
||||
if (!F4)
|
||||
goto out;
|
||||
|
||||
ficaa = (instance_id == DF_BROADCAST) ? 0 : 1;
|
||||
ficaa |= reg & 0x3FC;
|
||||
ficaa |= (func & 0x7) << 11;
|
||||
ficaa |= instance_id << 16;
|
||||
|
||||
mutex_lock(&df_indirect_mutex);
|
||||
|
||||
err = pci_write_config_dword(F4, 0x5C, ficaa);
|
||||
if (err) {
|
||||
pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
err = pci_read_config_dword(F4, 0x98, lo);
|
||||
if (err)
|
||||
pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
|
||||
|
||||
out_unlock:
|
||||
mutex_unlock(&df_indirect_mutex);
|
||||
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
|
||||
{
|
||||
return __df_indirect_read(node, func, reg, instance_id, lo);
|
||||
}
|
||||
|
||||
static int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo)
|
||||
{
|
||||
return __df_indirect_read(node, func, reg, DF_BROADCAST, lo);
|
||||
}
|
||||
|
||||
struct addr_ctx {
|
||||
u64 ret_addr;
|
||||
u32 tmp;
|
||||
u16 nid;
|
||||
u8 inst_id;
|
||||
};
|
||||
|
||||
static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
|
||||
{
|
||||
u64 dram_base_addr, dram_limit_addr, dram_hole_base;
|
||||
|
||||
u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
|
||||
u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
|
||||
u8 intlv_addr_sel, intlv_addr_bit;
|
||||
u8 num_intlv_bits, hashed_bit;
|
||||
u8 lgcy_mmio_hole_en, base = 0;
|
||||
u8 cs_mask, cs_id = 0;
|
||||
bool hash_enabled = false;
|
||||
|
||||
struct addr_ctx ctx;
|
||||
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
|
||||
/* Start from the normalized address */
|
||||
ctx.ret_addr = norm_addr;
|
||||
|
||||
ctx.nid = nid;
|
||||
ctx.inst_id = umc;
|
||||
|
||||
/* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
|
||||
if (df_indirect_read_instance(nid, 0, 0x1B4, umc, &ctx.tmp))
|
||||
goto out_err;
|
||||
|
||||
/* Remove HiAddrOffset from normalized address, if enabled: */
|
||||
if (ctx.tmp & BIT(0)) {
|
||||
u64 hi_addr_offset = (ctx.tmp & GENMASK_ULL(31, 20)) << 8;
|
||||
|
||||
if (norm_addr >= hi_addr_offset) {
|
||||
ctx.ret_addr -= hi_addr_offset;
|
||||
base = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Read D18F0x110 (DramBaseAddress). */
|
||||
if (df_indirect_read_instance(nid, 0, 0x110 + (8 * base), umc, &ctx.tmp))
|
||||
goto out_err;
|
||||
|
||||
/* Check if address range is valid. */
|
||||
if (!(ctx.tmp & BIT(0))) {
|
||||
pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
|
||||
__func__, ctx.tmp);
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
lgcy_mmio_hole_en = ctx.tmp & BIT(1);
|
||||
intlv_num_chan = (ctx.tmp >> 4) & 0xF;
|
||||
intlv_addr_sel = (ctx.tmp >> 8) & 0x7;
|
||||
dram_base_addr = (ctx.tmp & GENMASK_ULL(31, 12)) << 16;
|
||||
|
||||
/* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
|
||||
if (intlv_addr_sel > 3) {
|
||||
pr_err("%s: Invalid interleave address select %d.\n",
|
||||
__func__, intlv_addr_sel);
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
/* Read D18F0x114 (DramLimitAddress). */
|
||||
if (df_indirect_read_instance(nid, 0, 0x114 + (8 * base), umc, &ctx.tmp))
|
||||
goto out_err;
|
||||
|
||||
intlv_num_sockets = (ctx.tmp >> 8) & 0x1;
|
||||
intlv_num_dies = (ctx.tmp >> 10) & 0x3;
|
||||
dram_limit_addr = ((ctx.tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
|
||||
|
||||
intlv_addr_bit = intlv_addr_sel + 8;
|
||||
|
||||
/* Re-use intlv_num_chan by setting it equal to log2(#channels) */
|
||||
switch (intlv_num_chan) {
|
||||
case 0: intlv_num_chan = 0; break;
|
||||
case 1: intlv_num_chan = 1; break;
|
||||
case 3: intlv_num_chan = 2; break;
|
||||
case 5: intlv_num_chan = 3; break;
|
||||
case 7: intlv_num_chan = 4; break;
|
||||
|
||||
case 8: intlv_num_chan = 1;
|
||||
hash_enabled = true;
|
||||
break;
|
||||
default:
|
||||
pr_err("%s: Invalid number of interleaved channels %d.\n",
|
||||
__func__, intlv_num_chan);
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
num_intlv_bits = intlv_num_chan;
|
||||
|
||||
if (intlv_num_dies > 2) {
|
||||
pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
|
||||
__func__, intlv_num_dies);
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
num_intlv_bits += intlv_num_dies;
|
||||
|
||||
/* Add a bit if sockets are interleaved. */
|
||||
num_intlv_bits += intlv_num_sockets;
|
||||
|
||||
/* Assert num_intlv_bits <= 4 */
|
||||
if (num_intlv_bits > 4) {
|
||||
pr_err("%s: Invalid interleave bits %d.\n",
|
||||
__func__, num_intlv_bits);
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
if (num_intlv_bits > 0) {
|
||||
u64 temp_addr_x, temp_addr_i, temp_addr_y;
|
||||
u8 die_id_bit, sock_id_bit, cs_fabric_id;
|
||||
|
||||
/*
|
||||
* Read FabricBlockInstanceInformation3_CS[BlockFabricID].
|
||||
* This is the fabric id for this coherent slave. Use
|
||||
* umc/channel# as instance id of the coherent slave
|
||||
* for FICAA.
|
||||
*/
|
||||
if (df_indirect_read_instance(nid, 0, 0x50, umc, &ctx.tmp))
|
||||
goto out_err;
|
||||
|
||||
cs_fabric_id = (ctx.tmp >> 8) & 0xFF;
|
||||
die_id_bit = 0;
|
||||
|
||||
/* If interleaved over more than 1 channel: */
|
||||
if (intlv_num_chan) {
|
||||
die_id_bit = intlv_num_chan;
|
||||
cs_mask = (1 << die_id_bit) - 1;
|
||||
cs_id = cs_fabric_id & cs_mask;
|
||||
}
|
||||
|
||||
sock_id_bit = die_id_bit;
|
||||
|
||||
/* Read D18F1x208 (SystemFabricIdMask). */
|
||||
if (intlv_num_dies || intlv_num_sockets)
|
||||
if (df_indirect_read_broadcast(nid, 1, 0x208, &ctx.tmp))
|
||||
goto out_err;
|
||||
|
||||
/* If interleaved over more than 1 die. */
|
||||
if (intlv_num_dies) {
|
||||
sock_id_bit = die_id_bit + intlv_num_dies;
|
||||
die_id_shift = (ctx.tmp >> 24) & 0xF;
|
||||
die_id_mask = (ctx.tmp >> 8) & 0xFF;
|
||||
|
||||
cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
|
||||
}
|
||||
|
||||
/* If interleaved over more than 1 socket. */
|
||||
if (intlv_num_sockets) {
|
||||
socket_id_shift = (ctx.tmp >> 28) & 0xF;
|
||||
socket_id_mask = (ctx.tmp >> 16) & 0xFF;
|
||||
|
||||
cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
|
||||
}
|
||||
|
||||
/*
|
||||
* The pre-interleaved address consists of XXXXXXIIIYYYYY
|
||||
* where III is the ID for this CS, and XXXXXXYYYYY are the
|
||||
* address bits from the post-interleaved address.
|
||||
* "num_intlv_bits" has been calculated to tell us how many "I"
|
||||
* bits there are. "intlv_addr_bit" tells us how many "Y" bits
|
||||
* there are (where "I" starts).
|
||||
*/
|
||||
temp_addr_y = ctx.ret_addr & GENMASK_ULL(intlv_addr_bit - 1, 0);
|
||||
temp_addr_i = (cs_id << intlv_addr_bit);
|
||||
temp_addr_x = (ctx.ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
|
||||
ctx.ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
|
||||
}
|
||||
|
||||
/* Add dram base address */
|
||||
ctx.ret_addr += dram_base_addr;
|
||||
|
||||
/* If legacy MMIO hole enabled */
|
||||
if (lgcy_mmio_hole_en) {
|
||||
if (df_indirect_read_broadcast(nid, 0, 0x104, &ctx.tmp))
|
||||
goto out_err;
|
||||
|
||||
dram_hole_base = ctx.tmp & GENMASK(31, 24);
|
||||
if (ctx.ret_addr >= dram_hole_base)
|
||||
ctx.ret_addr += (BIT_ULL(32) - dram_hole_base);
|
||||
}
|
||||
|
||||
if (hash_enabled) {
|
||||
/* Save some parentheses and grab ls-bit at the end. */
|
||||
hashed_bit = (ctx.ret_addr >> 12) ^
|
||||
(ctx.ret_addr >> 18) ^
|
||||
(ctx.ret_addr >> 21) ^
|
||||
(ctx.ret_addr >> 30) ^
|
||||
cs_id;
|
||||
|
||||
hashed_bit &= BIT(0);
|
||||
|
||||
if (hashed_bit != ((ctx.ret_addr >> intlv_addr_bit) & BIT(0)))
|
||||
ctx.ret_addr ^= BIT(intlv_addr_bit);
|
||||
}
|
||||
|
||||
/* Is the calculated system address above the DRAM limit address? */
|
||||
if (ctx.ret_addr > dram_limit_addr)
|
||||
goto out_err;
|
||||
|
||||
*sys_addr = ctx.ret_addr;
|
||||
return 0;
|
||||
|
||||
out_err:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16);
|
||||
|
||||
/*
|
||||
|
@ -399,6 +399,63 @@ static const char * const smca_mp5_mce_desc[] = {
|
||||
"Instruction Tag Cache Bank B ECC or parity error",
|
||||
};
|
||||
|
||||
static const char * const smca_mpdma_mce_desc[] = {
|
||||
"Main SRAM [31:0] bank ECC or parity error",
|
||||
"Main SRAM [63:32] bank ECC or parity error",
|
||||
"Main SRAM [95:64] bank ECC or parity error",
|
||||
"Main SRAM [127:96] bank ECC or parity error",
|
||||
"Data Cache Bank A ECC or parity error",
|
||||
"Data Cache Bank B ECC or parity error",
|
||||
"Data Tag Cache Bank A ECC or parity error",
|
||||
"Data Tag Cache Bank B ECC or parity error",
|
||||
"Instruction Cache Bank A ECC or parity error",
|
||||
"Instruction Cache Bank B ECC or parity error",
|
||||
"Instruction Tag Cache Bank A ECC or parity error",
|
||||
"Instruction Tag Cache Bank B ECC or parity error",
|
||||
"Data Cache Bank A ECC or parity error",
|
||||
"Data Cache Bank B ECC or parity error",
|
||||
"Data Tag Cache Bank A ECC or parity error",
|
||||
"Data Tag Cache Bank B ECC or parity error",
|
||||
"Instruction Cache Bank A ECC or parity error",
|
||||
"Instruction Cache Bank B ECC or parity error",
|
||||
"Instruction Tag Cache Bank A ECC or parity error",
|
||||
"Instruction Tag Cache Bank B ECC or parity error",
|
||||
"Data Cache Bank A ECC or parity error",
|
||||
"Data Cache Bank B ECC or parity error",
|
||||
"Data Tag Cache Bank A ECC or parity error",
|
||||
"Data Tag Cache Bank B ECC or parity error",
|
||||
"Instruction Cache Bank A ECC or parity error",
|
||||
"Instruction Cache Bank B ECC or parity error",
|
||||
"Instruction Tag Cache Bank A ECC or parity error",
|
||||
"Instruction Tag Cache Bank B ECC or parity error",
|
||||
"System Hub Read Buffer ECC or parity error",
|
||||
"MPDMA TVF DVSEC Memory ECC or parity error",
|
||||
"MPDMA TVF MMIO Mailbox0 ECC or parity error",
|
||||
"MPDMA TVF MMIO Mailbox1 ECC or parity error",
|
||||
"MPDMA TVF Doorbell Memory ECC or parity error",
|
||||
"MPDMA TVF SDP Slave Memory 0 ECC or parity error",
|
||||
"MPDMA TVF SDP Slave Memory 1 ECC or parity error",
|
||||
"MPDMA TVF SDP Slave Memory 2 ECC or parity error",
|
||||
"MPDMA TVF SDP Master Memory 0 ECC or parity error",
|
||||
"MPDMA TVF SDP Master Memory 1 ECC or parity error",
|
||||
"MPDMA TVF SDP Master Memory 2 ECC or parity error",
|
||||
"MPDMA TVF SDP Master Memory 3 ECC or parity error",
|
||||
"MPDMA TVF SDP Master Memory 4 ECC or parity error",
|
||||
"MPDMA TVF SDP Master Memory 5 ECC or parity error",
|
||||
"MPDMA TVF SDP Master Memory 6 ECC or parity error",
|
||||
"MPDMA PTE Command FIFO ECC or parity error",
|
||||
"MPDMA PTE Hub Data FIFO ECC or parity error",
|
||||
"MPDMA PTE Internal Data FIFO ECC or parity error",
|
||||
"MPDMA PTE Command Memory DMA ECC or parity error",
|
||||
"MPDMA PTE Command Memory Internal ECC or parity error",
|
||||
"MPDMA PTE DMA Completion FIFO ECC or parity error",
|
||||
"MPDMA PTE Tablewalk Completion FIFO ECC or parity error",
|
||||
"MPDMA PTE Descriptor Completion FIFO ECC or parity error",
|
||||
"MPDMA PTE ReadOnly Completion FIFO ECC or parity error",
|
||||
"MPDMA PTE DirectWrite Completion FIFO ECC or parity error",
|
||||
"SDP Watchdog Timer expired",
|
||||
};
|
||||
|
||||
static const char * const smca_nbio_mce_desc[] = {
|
||||
"ECC or Parity error",
|
||||
"PCIE error",
|
||||
@ -448,7 +505,7 @@ static const char * const smca_xgmipcs_mce_desc[] = {
|
||||
"Rx Replay Timeout Error",
|
||||
"LinkSub Tx Timeout Error",
|
||||
"LinkSub Rx Timeout Error",
|
||||
"Rx CMD Pocket Error",
|
||||
"Rx CMD Packet Error",
|
||||
};
|
||||
|
||||
static const char * const smca_xgmiphy_mce_desc[] = {
|
||||
@ -458,11 +515,66 @@ static const char * const smca_xgmiphy_mce_desc[] = {
|
||||
"PHY APB error",
|
||||
};
|
||||
|
||||
static const char * const smca_waflphy_mce_desc[] = {
|
||||
"RAM ECC Error",
|
||||
"ARC instruction buffer parity error",
|
||||
"ARC data buffer parity error",
|
||||
"PHY APB error",
|
||||
static const char * const smca_nbif_mce_desc[] = {
|
||||
"Timeout error from GMI",
|
||||
"SRAM ECC error",
|
||||
"NTB Error Event",
|
||||
"SDP Parity error",
|
||||
};
|
||||
|
||||
static const char * const smca_sata_mce_desc[] = {
|
||||
"Parity error for port 0",
|
||||
"Parity error for port 1",
|
||||
"Parity error for port 2",
|
||||
"Parity error for port 3",
|
||||
"Parity error for port 4",
|
||||
"Parity error for port 5",
|
||||
"Parity error for port 6",
|
||||
"Parity error for port 7",
|
||||
};
|
||||
|
||||
static const char * const smca_usb_mce_desc[] = {
|
||||
"Parity error or ECC error for S0 RAM0",
|
||||
"Parity error or ECC error for S0 RAM1",
|
||||
"Parity error or ECC error for S0 RAM2",
|
||||
"Parity error for PHY RAM0",
|
||||
"Parity error for PHY RAM1",
|
||||
"AXI Slave Response error",
|
||||
};
|
||||
|
||||
static const char * const smca_gmipcs_mce_desc[] = {
|
||||
"Data Loss Error",
|
||||
"Training Error",
|
||||
"Replay Parity Error",
|
||||
"Rx Fifo Underflow Error",
|
||||
"Rx Fifo Overflow Error",
|
||||
"CRC Error",
|
||||
"BER Exceeded Error",
|
||||
"Tx Fifo Underflow Error",
|
||||
"Replay Buffer Parity Error",
|
||||
"Tx Overflow Error",
|
||||
"Replay Fifo Overflow Error",
|
||||
"Replay Fifo Underflow Error",
|
||||
"Elastic Fifo Overflow Error",
|
||||
"Deskew Error",
|
||||
"Offline Error",
|
||||
"Data Startup Limit Error",
|
||||
"FC Init Timeout Error",
|
||||
"Recovery Timeout Error",
|
||||
"Ready Serial Timeout Error",
|
||||
"Ready Serial Attempt Error",
|
||||
"Recovery Attempt Error",
|
||||
"Recovery Relock Attempt Error",
|
||||
"Deskew Abort Error",
|
||||
"Rx Buffer Error",
|
||||
"Rx LFDS Fifo Overflow Error",
|
||||
"Rx LFDS Fifo Underflow Error",
|
||||
"LinkSub Tx Timeout Error",
|
||||
"LinkSub Rx Timeout Error",
|
||||
"Rx CMD Packet Error",
|
||||
"LFDS Training Timeout Error",
|
||||
"LFDS FC Init Timeout Error",
|
||||
"Data Loss Error",
|
||||
};
|
||||
|
||||
struct smca_mce_desc {
|
||||
@ -490,12 +602,21 @@ static struct smca_mce_desc smca_mce_descs[] = {
|
||||
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
|
||||
[SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) },
|
||||
[SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
|
||||
[SMCA_MPDMA] = { smca_mpdma_mce_desc, ARRAY_SIZE(smca_mpdma_mce_desc) },
|
||||
[SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) },
|
||||
[SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) },
|
||||
[SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) },
|
||||
[SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) },
|
||||
/* NBIF and SHUB have the same error descriptions, for now. */
|
||||
[SMCA_NBIF] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
|
||||
[SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
|
||||
[SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) },
|
||||
[SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) },
|
||||
[SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) },
|
||||
/* All the PHY bank types have the same error descriptions, for now. */
|
||||
[SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
|
||||
[SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) },
|
||||
[SMCA_WAFL_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
|
||||
[SMCA_GMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
|
||||
};
|
||||
|
||||
static bool f12h_mc0_mce(u16 ec, u8 xec)
|
||||
@ -1045,20 +1166,13 @@ static void decode_mc6_mce(struct mce *m)
|
||||
/* Decode errors according to Scalable MCA specification */
|
||||
static void decode_smca_error(struct mce *m)
|
||||
{
|
||||
struct smca_hwid *hwid;
|
||||
enum smca_bank_types bank_type;
|
||||
enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
|
||||
const char *ip_name;
|
||||
u8 xec = XEC(m->status, xec_mask);
|
||||
|
||||
if (m->bank >= ARRAY_SIZE(smca_banks))
|
||||
if (bank_type >= N_SMCA_BANK_TYPES)
|
||||
return;
|
||||
|
||||
hwid = smca_banks[m->bank].hwid;
|
||||
if (!hwid)
|
||||
return;
|
||||
|
||||
bank_type = hwid->bank_type;
|
||||
|
||||
if (bank_type == SMCA_RESERVED) {
|
||||
pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
|
||||
return;
|
||||
|
@ -2647,7 +2647,7 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
|
||||
* and error occurred in DramECC (Extended error code = 0) then only
|
||||
* process the error, else bail out.
|
||||
*/
|
||||
if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
|
||||
if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
|
||||
(XEC(m->status, 0x3f) == 0x0)))
|
||||
return NOTIFY_DONE;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user