efc8b329c7
On systems with two or fewer sockets, when the boot CPU has CONSTANT_TSC, NONSTOP_TSC, and TSC_ADJUST, clocksource watchdog verification of the TSC is disabled. This works well much of the time, but there is the occasional production-level system that meets all of these criteria, but which still has a TSC that skews significantly from atomic-clock time. This is usually attributed to a firmware or hardware fault. Yes, the various NTP daemons do express their opinions of userspace-to-atomic-clock time skew, but they put them in various places, depending on the daemon and distro in question. It would therefore be good for the kernel to have some clue that there is a problem. The old behavior of marking the TSC unstable is a non-starter because a great many workloads simply cannot tolerate the overheads and latencies of the various non-TSC clocksources. In addition, NTP-corrected systems sometimes can tolerate significant kernel-space time skew as long as the userspace time sources are within epsilon of atomic-clock time. Therefore, when watchdog verification of TSC is disabled, enable it for HPET and PMTMR (AKA ACPI PM timer). This provides the needed in-kernel time-skew diagnostic without degrading the system's performance. Signed-off-by: Paul E. McKenney <paulmck@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Daniel Lezcano <daniel.lezcano@linaro.org> Cc: Waiman Long <longman@redhat.com> Cc: <x86@kernel.org> Tested-by: Feng Tang <feng.tang@intel.com>
246 lines
6.4 KiB
C
246 lines
6.4 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* linux/drivers/clocksource/acpi_pm.c
|
|
*
|
|
* This file contains the ACPI PM based clocksource.
|
|
*
|
|
* This code was largely moved from the i386 timer_pm.c file
|
|
* which was (C) Dominik Brodowski <linux@brodo.de> 2003
|
|
* and contained the following comments:
|
|
*
|
|
* Driver to use the Power Management Timer (PMTMR) available in some
|
|
* southbridges as primary timing source for the Linux kernel.
|
|
*
|
|
* Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
|
|
* timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
|
|
*/
|
|
|
|
#include <linux/acpi_pmtmr.h>
|
|
#include <linux/clocksource.h>
|
|
#include <linux/timex.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/init.h>
|
|
#include <linux/pci.h>
|
|
#include <linux/delay.h>
|
|
#include <asm/io.h>
|
|
#include <asm/time.h>
|
|
|
|
/*
|
|
* The I/O port the PMTMR resides at.
|
|
* The location is detected during setup_arch(),
|
|
* in arch/x86/kernel/acpi/boot.c
|
|
*/
|
|
u32 pmtmr_ioport __read_mostly;
|
|
|
|
static inline u32 read_pmtmr(void)
|
|
{
|
|
/* mask the output to 24 bits */
|
|
return inl(pmtmr_ioport) & ACPI_PM_MASK;
|
|
}
|
|
|
|
/*
 * Read the PM timer with protection against unlatched counters.
 *
 * It has been reported that on various broken chipsets (ICH4, PIIX4 and
 * PIIX4E) the ACPI PM clock source is not latched, so a single read may
 * return garbage.  Take three back-to-back samples and retry for as long
 * as they come out in one of the orders
 *	second < first < third,
 *	third < second < first, or
 *	first < third < second.
 *
 * Returns the middle (second) sample of the first accepted triple.
 */
u32 acpi_pm_read_verified(void)
{
	u32 first, second, third;
	int suspect;

	do {
		first = read_pmtmr();
		second = read_pmtmr();
		third = read_pmtmr();

		suspect = (first > second && first < third) ||
			  (second > third && second < first) ||
			  (third > first && third < second);
	} while (unlikely(suspect));

	return second;
}
|
|
|
|
static u64 acpi_pm_read(struct clocksource *cs)
|
|
{
|
|
return (u64)read_pmtmr();
|
|
}
|
|
|
|
static struct clocksource clocksource_acpi_pm = {
|
|
.name = "acpi_pm",
|
|
.rating = 200,
|
|
.read = acpi_pm_read,
|
|
.mask = (u64)ACPI_PM_MASK,
|
|
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
|
|
};
|
|
|
|
|
|
#ifdef CONFIG_PCI
|
|
static int acpi_pm_good;
|
|
static int __init acpi_pm_good_setup(char *__str)
|
|
{
|
|
acpi_pm_good = 1;
|
|
return 1;
|
|
}
|
|
__setup("acpi_pm_good", acpi_pm_good_setup);
|
|
|
|
static u64 acpi_pm_read_slow(struct clocksource *cs)
|
|
{
|
|
return (u64)acpi_pm_read_verified();
|
|
}
|
|
|
|
static inline void acpi_pm_need_workaround(void)
|
|
{
|
|
clocksource_acpi_pm.read = acpi_pm_read_slow;
|
|
clocksource_acpi_pm.rating = 120;
|
|
}
|
|
|
|
/*
|
|
* PIIX4 Errata:
|
|
*
|
|
* The power management timer may return improper results when read.
|
|
* Although the timer value settles properly after incrementing,
|
|
* while incrementing there is a 3 ns window every 69.8 ns where the
|
|
* timer value is indeterminate (a 4.2% chance that the data will be
|
|
* incorrect when read). As a result, the ACPI free running count up
|
|
* timer specification is violated due to erroneous reads.
|
|
*/
|
|
static void acpi_pm_check_blacklist(struct pci_dev *dev)
|
|
{
|
|
if (acpi_pm_good)
|
|
return;
|
|
|
|
/* the bug has been fixed in PIIX4M */
|
|
if (dev->revision < 3) {
|
|
pr_warn("* Found PM-Timer Bug on the chipset. Due to workarounds for a bug,\n"
|
|
"* this clock source is slow. Consider trying other clock sources\n");
|
|
|
|
acpi_pm_need_workaround();
|
|
}
|
|
}
|
|
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3,
|
|
acpi_pm_check_blacklist);
|
|
|
|
static void acpi_pm_check_graylist(struct pci_dev *dev)
|
|
{
|
|
if (acpi_pm_good)
|
|
return;
|
|
|
|
pr_warn("* The chipset may have PM-Timer Bug. Due to workarounds for a bug,\n"
|
|
"* this clock source is slow. If you are sure your timer does not have\n"
|
|
"* this bug, please use \"acpi_pm_good\" to disable the workaround\n");
|
|
|
|
acpi_pm_need_workaround();
|
|
}
|
|
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0,
|
|
acpi_pm_check_graylist);
|
|
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
|
|
acpi_pm_check_graylist);
|
|
#endif
|
|
|
|
#ifndef CONFIG_X86_64
|
|
#include <asm/mach_timer.h>
|
|
#define PMTMR_EXPECTED_RATE \
|
|
((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (PIT_TICK_RATE>>10))
|
|
/*
|
|
* Some boards have the PMTMR running way too fast. We check
|
|
* the PMTMR rate against PIT channel 2 to catch these cases.
|
|
*/
|
|
static int verify_pmtmr_rate(void)
|
|
{
|
|
u64 value1, value2;
|
|
unsigned long count, delta;
|
|
|
|
mach_prepare_counter();
|
|
value1 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
|
|
mach_countup(&count);
|
|
value2 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
|
|
delta = (value2 - value1) & ACPI_PM_MASK;
|
|
|
|
/* Check that the PMTMR delta is within 5% of what we expect */
|
|
if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 ||
|
|
delta > (PMTMR_EXPECTED_RATE * 21) / 20) {
|
|
pr_info("PM-Timer running at invalid rate: %lu%% of normal - aborting.\n",
|
|
100UL * delta / PMTMR_EXPECTED_RATE);
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
#else
|
|
#define verify_pmtmr_rate() (0)
|
|
#endif
|
|
|
|
/* Number of monotonicity checks to perform during initialization */
|
|
#define ACPI_PM_MONOTONICITY_CHECKS 10
|
|
/* Number of reads we try to get two different values */
|
|
#define ACPI_PM_READ_CHECKS 10000
|
|
|
|
/*
 * Sanity-check the ACPI PM timer and register it as a clocksource.
 *
 * Returns -ENODEV when no PM timer port is known, when the counter never
 * changes within ACPI_PM_READ_CHECKS reads, or when verify_pmtmr_rate()
 * rejects its rate; -EINVAL when a later read is smaller than an earlier
 * one without looking like a wrap; otherwise the result of
 * clocksource_register_hz().  Every failure after the initial port check
 * also clears pmtmr_ioport.
 */
static int __init init_acpi_pm_clocksource(void)
{
	struct clocksource *cs = &clocksource_acpi_pm;
	unsigned int pass, attempt;
	u64 base, sample;
	int err;

	if (!pmtmr_ioport)
		return -ENODEV;

	/* "verify" this timing source: */
	for (pass = 0; pass < ACPI_PM_MONOTONICITY_CHECKS; pass++) {
		/* Space the passes out by a growing delay (0, 100, 200, ... us). */
		udelay(100 * pass);
		base = cs->read(cs);

		for (attempt = 0; attempt < ACPI_PM_READ_CHECKS; attempt++) {
			sample = cs->read(cs);

			/* Counter has not ticked yet, try again. */
			if (sample == base)
				continue;

			/*
			 * Either the counter advanced, or it went down to a
			 * small value (below 0xFFF), which is accepted as a
			 * wrap.
			 */
			if (sample > base || sample < 0xFFF)
				break;

			pr_info("PM-Timer had inconsistent results: %#llx, %#llx - aborting.\n",
				base, sample);
			err = -EINVAL;
			goto disable;
		}

		/* Loop ran to completion: the counter never changed. */
		if (attempt == ACPI_PM_READ_CHECKS) {
			pr_info("PM-Timer failed consistency check  (%#llx) - aborting.\n",
				base);
			err = -ENODEV;
			goto disable;
		}
	}

	if (verify_pmtmr_rate() != 0) {
		err = -ENODEV;
		goto disable;
	}

	/* Have the watchdog check this clocksource when it is not checking the TSC. */
	if (tsc_clocksource_watchdog_disabled())
		cs->flags |= CLOCK_SOURCE_MUST_VERIFY;

	return clocksource_register_hz(cs, PMTMR_TICKS_PER_SEC);

disable:
	pmtmr_ioport = 0;
	return err;
}

/* We use fs_initcall because we want the PCI fixups to have run
 * but we still need to load before device_initcall
 */
fs_initcall(init_acpi_pm_clocksource);
|
|
|
|
/*
|
|
* Allow an override of the IOPort. Stupid BIOSes do not tell us about
|
|
* the PMTimer, but we might know where it is.
|
|
*/
|
|
static int __init parse_pmtmr(char *arg)
|
|
{
|
|
unsigned int base;
|
|
int ret;
|
|
|
|
ret = kstrtouint(arg, 16, &base);
|
|
if (ret) {
|
|
pr_warn("PMTMR: invalid 'pmtmr=' value: '%s'\n", arg);
|
|
return 1;
|
|
}
|
|
|
|
pr_info("PMTMR IOPort override: 0x%04x -> 0x%04x\n", pmtmr_ioport,
|
|
base);
|
|
pmtmr_ioport = base;
|
|
|
|
return 1;
|
|
}
|
|
__setup("pmtmr=", parse_pmtmr);
|