e8a308e5f4
The NFIT machine check handler uses the physical address from the mce
structure, and compares it against information in the ACPI NFIT table
to determine whether that location lies on an NVDIMM. The mce->addr
field however may not always be valid, and this is indicated by the
MCI_STATUS_ADDRV bit in the status field.
Export mce_usable_address() which already performs validation for the
address, and use it in the NFIT handler.
Fixes: 6839a6d96f ("nfit: do an ARS scrub on hitting a latent media error")
Reported-by: Robert Elliott <elliott@hpe.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
CC: Arnd Bergmann <arnd@arndb.de>
Cc: Dan Williams <dan.j.williams@intel.com>
CC: Dave Jiang <dave.jiang@intel.com>
CC: elliott@hpe.com
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Ingo Molnar <mingo@redhat.com>
CC: Len Brown <lenb@kernel.org>
CC: linux-acpi@vger.kernel.org
CC: linux-edac <linux-edac@vger.kernel.org>
CC: linux-nvdimm@lists.01.org
CC: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
CC: "Rafael J. Wysocki" <rjw@rjwysocki.net>
CC: Ross Zwisler <zwisler@kernel.org>
CC: stable <stable@vger.kernel.org>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Tony Luck <tony.luck@intel.com>
CC: x86-ml <x86@kernel.org>
CC: Yazen Ghannam <yazen.ghannam@amd.com>
Link: http://lkml.kernel.org/r/20181026003729.8420-2-vishal.l.verma@intel.com
108 lines
3.0 KiB
C
108 lines
3.0 KiB
C
/*
 * NFIT - Machine Check Handler
 *
 * Copyright(c) 2013-2016 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
|
|
#include <linux/notifier.h>
|
|
#include <linux/acpi.h>
|
|
#include <linux/nd.h>
|
|
#include <asm/mce.h>
|
|
#include "nfit.h"
|
|
|
|
static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
|
|
void *data)
|
|
{
|
|
struct mce *mce = (struct mce *)data;
|
|
struct acpi_nfit_desc *acpi_desc;
|
|
struct nfit_spa *nfit_spa;
|
|
|
|
/* We only care about uncorrectable memory errors */
|
|
if (!mce_is_memory_error(mce) || mce_is_correctable(mce))
|
|
return NOTIFY_DONE;
|
|
|
|
/* Verify the address reported in the MCE is valid. */
|
|
if (!mce_usable_address(mce))
|
|
return NOTIFY_DONE;
|
|
|
|
/*
|
|
* mce->addr contains the physical addr accessed that caused the
|
|
* machine check. We need to walk through the list of NFITs, and see
|
|
* if any of them matches that address, and only then start a scrub.
|
|
*/
|
|
mutex_lock(&acpi_desc_lock);
|
|
list_for_each_entry(acpi_desc, &acpi_descs, list) {
|
|
struct device *dev = acpi_desc->dev;
|
|
int found_match = 0;
|
|
|
|
mutex_lock(&acpi_desc->init_mutex);
|
|
list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
|
|
struct acpi_nfit_system_address *spa = nfit_spa->spa;
|
|
|
|
if (nfit_spa_type(spa) != NFIT_SPA_PM)
|
|
continue;
|
|
/* find the spa that covers the mce addr */
|
|
if (spa->address > mce->addr)
|
|
continue;
|
|
if ((spa->address + spa->length - 1) < mce->addr)
|
|
continue;
|
|
found_match = 1;
|
|
dev_dbg(dev, "addr in SPA %d (0x%llx, 0x%llx)\n",
|
|
spa->range_index, spa->address, spa->length);
|
|
/*
|
|
* We can break at the first match because we're going
|
|
* to rescan all the SPA ranges. There shouldn't be any
|
|
* aliasing anyway.
|
|
*/
|
|
break;
|
|
}
|
|
mutex_unlock(&acpi_desc->init_mutex);
|
|
|
|
if (!found_match)
|
|
continue;
|
|
|
|
/* If this fails due to an -ENOMEM, there is little we can do */
|
|
nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
|
|
ALIGN(mce->addr, L1_CACHE_BYTES),
|
|
L1_CACHE_BYTES);
|
|
nvdimm_region_notify(nfit_spa->nd_region,
|
|
NVDIMM_REVALIDATE_POISON);
|
|
|
|
if (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON) {
|
|
/*
|
|
* We can ignore an -EBUSY here because if an ARS is
|
|
* already in progress, just let that be the last
|
|
* authoritative one
|
|
*/
|
|
acpi_nfit_ars_rescan(acpi_desc, 0);
|
|
}
|
|
break;
|
|
}
|
|
|
|
mutex_unlock(&acpi_desc_lock);
|
|
return NOTIFY_DONE;
|
|
}
|
|
|
|
static struct notifier_block nfit_mce_dec = {
|
|
.notifier_call = nfit_handle_mce,
|
|
.priority = MCE_PRIO_NFIT,
|
|
};
|
|
|
|
void nfit_mce_register(void)
|
|
{
|
|
mce_register_decode_chain(&nfit_mce_dec);
|
|
}
|
|
|
|
void nfit_mce_unregister(void)
|
|
{
|
|
mce_unregister_decode_chain(&nfit_mce_dec);
|
|
}
|