Clocksource watchdog commits for v6.3

This pull request contains the following:
 
 o	Improvements to clocksource-watchdog console messages.
 
 o	Loosening of the clocksource-watchdog skew criteria to match
 	those of NTP (500 parts per million, relaxed from 400 parts
 	per million).  If it is good enough for NTP, it is good enough
 	for the clocksource watchdog.
 
 o	Suspend clocksource-watchdog checking temporarily when high
 	memory latencies are detected.	This avoids the false-positive
 	clock-skew events that have been seen on production systems
 	running memory-intensive workloads.
 
 o	On systems where the TSC is deemed trustworthy, use it as the
 	watchdog timesource, but only when specifically requested using
 	the tsc=watchdog kernel boot parameter.  This permits clock-skew
 	events to be detected, but avoids forcing workloads to use the
 	slow HPET and ACPI PM timers.  On the one hand, those two timers
 	are slow enough to cause systems to be needlessly marked bad;
 	on the other, real skew does sometimes happen on production
 	systems running production workloads.  And sometimes the fault
 	lies with the TSC, or at least with the firmware that told the
 	kernel to program the TSC with the wrong frequency.
 
 o	Add a tsc=recalibrate kernel boot parameter to allow the kernel
 	to diagnose cases where the TSC hardware works fine, but was told
 	by firmware to tick at the wrong frequency.  Such cases are rare,
 	but they really have happened on production systems.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEbK7UrM+RBIrCoViJnr8S83LZ+4wFAmPhnhkTHHBhdWxtY2tA
 a2VybmVsLm9yZwAKCRCevxLzctn7jClDD/9gTo62MakVQz2wzBRBcWunzX4BAfy2
 2ORqZYqq8cJ4ccFVWtSq7gZ+0bxiT+J4jaVyJpmUPzaiCSfNUT+GLjWyLGzF9Xq+
 xLWpFJOhFhKYjYN2m1ottuQ81V7aTlorC8AJt/o+oCJFGUCb/heg/UrmoZ6DweHw
 H7uXS9yenKdKgYoMENW+8IVsy16sT4D5Fe8XAD/2J6vBBUbgBzKWhi8XSgSHB/Xw
 GCP4UfXVGl5QRG9Xu4ZgrFV1t4azxtmdBghFm7/Kep/j6ttSY78yoS43AbI57bhD
 fWB5mfAQvO+Zo5/9rLjcDzeZCp/PSdARD41aycPMiei08K278tIN9T/fmfSoG6rV
 lVRdFxTHrQcqc9d+g+mGASQBezCF8pxonm9HYLBpNjyfYHnKV70SPXywO4oqAJ1I
 7dCm+uv3Y8KaJdVnPUWOHJjvQLx9NWK5/pXBYjsYnLR+69EVmGDgPZ+/ulQxkWBj
 DtrQgs+sHQ8gngNpAilxuu/lrUXzrC8N4mtxXKBFQoCPYQMFBkr9S+aAEHIgZT9H
 1dWwR1QxeR5uxt7U+3DmTyJ1XKfYjDyyScesILlLMLbdKgZtTS5wGaK4QdJ3QW2z
 z4zqPDccWDDZKZy9W4QBnFBx6Rn49C8xThy7f6Loc+2cKAT10hrEmRJsn79AOCDc
 6hV0S2U9a6ypQg==
 =OWY2
 -----END PGP SIGNATURE-----

Merge tag 'clocksource.2023.02.06b' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into timers/core

Pull clocksource watchdog changes from Paul McKenney:

     o	Improvements to clocksource-watchdog console messages.

     o	Loosening of the clocksource-watchdog skew criteria to match
     	those of NTP (500 parts per million, relaxed from 400 parts
     	per million).  If it is good enough for NTP, it is good enough
     	for the clocksource watchdog.

     o	Suspend clocksource-watchdog checking temporarily when high
     	memory latencies are detected.	This avoids the false-positive
     	clock-skew events that have been seen on production systems
     	running memory-intensive workloads.

     o	On systems where the TSC is deemed trustworthy, use it as the
     	watchdog timesource, but only when specifically requested using
     	the tsc=watchdog kernel boot parameter.  This permits clock-skew
     	events to be detected, but avoids forcing workloads to use the
     	slow HPET and ACPI PM timers.  On the one hand, those two timers
     	are slow enough to cause systems to be needlessly marked bad;
     	on the other, real skew does sometimes happen on production
     	systems running production workloads.  And sometimes the fault
     	lies with the TSC, or at least with the firmware that told the
     	kernel to program the TSC with the wrong frequency.

     o	Add a tsc=recalibrate kernel boot parameter to allow the kernel
     	to diagnose cases where the TSC hardware works fine, but was told
     	by firmware to tick at the wrong frequency.  Such cases are rare,
     	but they really have happened on production systems.  (A sample
     	command line exercising both new parameters follows this list.)
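
For concreteness, a hypothetical kernel command line exercising both new
parameters (each tsc= instance is parsed independently by tsc_setup(), as the
arch/x86/kernel/tsc.c hunk below shows):

    ... root=/dev/sda1 tsc=watchdog tsc=recalibrate ...

With this, a trustworthy TSC checks the HPET and ACPI PM timers rather than
being checked by them, and the firmware-supplied TSC frequency is re-checked
against one of those timers at boot, with a warning above roughly 500 ppm of
deviation.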

Link: https://lore.kernel.org/r/20230210193640.GA3325193@paulmck-ThinkPad-P17-Gen-1
Thomas Gleixner 2023-02-13 19:28:48 +01:00
commit ab407a1919
7 changed files with 123 additions and 29 deletions

Documentation/admin-guide/kernel-parameters.txt

@@ -6369,6 +6369,16 @@
in situations with strict latency requirements (where
interruptions from clocksource watchdog are not
acceptable).
[x86] recalibrate: force recalibration against a HW timer
(HPET or PM timer) on systems whose TSC frequency was
obtained from HW or FW using either an MSR or CPUID(0x15).
Warn if the difference is more than 500 ppm.
[x86] watchdog: Use TSC as the watchdog clocksource with
which to check other HW timers (HPET or PM timer), but
only on systems where TSC has been deemed trustworthy.
This will be suppressed by an earlier tsc=nowatchdog and
can be overridden by a later tsc=nowatchdog. A console
message will flag any such suppression or overriding.
tsc_early_khz= [X86] Skip early TSC calibration and use the given
value instead. Useful when the early TSC frequency discovery

arch/x86/include/asm/time.h

@@ -8,6 +8,7 @@
extern void hpet_time_init(void);
extern void time_init(void);
extern bool pit_timer_init(void);
extern bool tsc_clocksource_watchdog_disabled(void);
extern struct clock_event_device *global_clock_event;

arch/x86/kernel/hpet.c

@@ -1091,6 +1091,8 @@ int __init hpet_enable(void)
if (!hpet_counting())
goto out_nohpet;
if (tsc_clocksource_watchdog_disabled())
clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY;
clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
if (id & HPET_ID_LEGSUP) {
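
The effect of the two added lines: CLOCK_SOURCE_MUST_VERIFY marks a
clocksource as one the watchdog must check, which also disqualifies it from
serving as the watchdog itself, so a trusted TSC ends up checking the HPET
rather than the reverse. A minimal userspace sketch of the conditional; the
stub and the flag value are illustrative assumptions:

    #include <stdio.h>

    #define CLOCK_SOURCE_MUST_VERIFY 0x02  /* assumed to mirror include/linux/clocksource.h */

    /* stand-in for the real predicate: true when tsc=watchdog is in
     * effect and the TSC itself is exempt from verification */
    static int tsc_clocksource_watchdog_disabled(void)
    {
            return 1;
    }

    int main(void)
    {
            unsigned long hpet_flags = 0;

            if (tsc_clocksource_watchdog_disabled())
                    hpet_flags |= CLOCK_SOURCE_MUST_VERIFY;  /* HPET becomes a verified client */

            printf("HPET verified by watchdog: %s\n",
                   (hpet_flags & CLOCK_SOURCE_MUST_VERIFY) ? "yes" : "no");
            return 0;
    }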

arch/x86/kernel/tsc.c

@@ -48,6 +48,8 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc);
int tsc_clocksource_reliable;
static int __read_mostly tsc_force_recalibrate;
static u32 art_to_tsc_numerator;
static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
@@ -292,6 +294,7 @@ __setup("notsc", notsc_setup);
static int no_sched_irq_time;
static int no_tsc_watchdog;
static int tsc_as_watchdog;
static int __init tsc_setup(char *str)
{
@@ -301,8 +304,22 @@ static int __init tsc_setup(char *str)
no_sched_irq_time = 1;
if (!strcmp(str, "unstable"))
mark_tsc_unstable("boot parameter");
if (!strcmp(str, "nowatchdog"))
if (!strcmp(str, "nowatchdog")) {
no_tsc_watchdog = 1;
if (tsc_as_watchdog)
pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n",
__func__);
tsc_as_watchdog = 0;
}
if (!strcmp(str, "recalibrate"))
tsc_force_recalibrate = 1;
if (!strcmp(str, "watchdog")) {
if (no_tsc_watchdog)
pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n",
__func__);
else
tsc_as_watchdog = 1;
}
return 1;
}
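
A quick reference for the ordering rules encoded in tsc_setup() above (my
summary, not part of the commit); the pr_alert() calls flag both override
cases on the console:

    command line                    no_tsc_watchdog  tsc_as_watchdog
    tsc=watchdog                    0                1
    tsc=nowatchdog                  1                0
    tsc=watchdog tsc=nowatchdog     1                0  (later nowatchdog overrides)
    tsc=nowatchdog tsc=watchdog     1                0  (earlier nowatchdog suppresses)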
@@ -1186,6 +1203,12 @@ static void __init tsc_disable_clocksource_watchdog(void)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}
bool tsc_clocksource_watchdog_disabled(void)
{
return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) &&
tsc_as_watchdog && !no_tsc_watchdog;
}
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1374,6 +1397,25 @@ static void tsc_refine_calibration_work(struct work_struct *work)
else
freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
/* Will hit this only if tsc_force_recalibrate has been set */
if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
/* Warn if the deviation exceeds 500 ppm */
if (abs(tsc_khz - freq) > (tsc_khz >> 11)) {
pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n");
pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n",
(unsigned long)tsc_khz / 1000,
(unsigned long)tsc_khz % 1000);
}
pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n",
hpet ? "HPET" : "PM_TIMER",
(unsigned long)freq / 1000,
(unsigned long)freq % 1000);
return;
}
/* Make sure we're within 1% */
if (abs(tsc_khz - freq) > tsc_khz/100)
goto out;
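
One detail worth calling out in the recalibration check above: the comment
promises a 500 ppm threshold, but the test uses tsc_khz >> 11, i.e.
tsc_khz/2048, which is roughly 488 ppm; a cheap shift-based approximation.
A standalone sketch with hypothetical frequencies:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            long tsc_khz = 2995200;        /* hypothetical CPUID-reported frequency */
            long freq    = 2996800;        /* hypothetical HW-timer measurement */
            long limit   = tsc_khz >> 11;  /* /2048: ~488 ppm, approximating 500 ppm */

            printf("delta %ld kHz vs limit %ld kHz -> %s\n",
                   labs(tsc_khz - freq), limit,
                   labs(tsc_khz - freq) > limit ? "warn" : "ok");
            return 0;
    }

Here the 1600 kHz deviation exceeds the 1462 kHz limit, so the vendor warning
would fire.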
@@ -1407,8 +1449,10 @@ static int __init init_tsc_clocksource(void)
if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
return 0;
if (tsc_unstable)
goto unreg;
if (tsc_unstable) {
clocksource_unregister(&clocksource_tsc_early);
return 0;
}
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1421,9 +1465,10 @@ static int __init init_tsc_clocksource(void)
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
clocksource_register_khz(&clocksource_tsc, tsc_khz);
unreg:
clocksource_unregister(&clocksource_tsc_early);
return 0;
if (!tsc_force_recalibrate)
return 0;
}
schedule_delayed_work(&tsc_irqwork, 0);

drivers/clocksource/acpi_pm.c

@@ -23,6 +23,7 @@
#include <linux/pci.h>
#include <linux/delay.h>
#include <asm/io.h>
#include <asm/time.h>
/*
* The I/O port the PMTMR resides at.
@@ -210,8 +211,9 @@ static int __init init_acpi_pm_clocksource(void)
return -ENODEV;
}
return clocksource_register_hz(&clocksource_acpi_pm,
PMTMR_TICKS_PER_SEC);
if (tsc_clocksource_watchdog_disabled())
clocksource_acpi_pm.flags |= CLOCK_SOURCE_MUST_VERIFY;
return clocksource_register_hz(&clocksource_acpi_pm, PMTMR_TICKS_PER_SEC);
}
/* We use fs_initcall because we want the PCI fixups to have run

kernel/time/Kconfig

@@ -200,10 +200,14 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
int "Clocksource watchdog maximum allowable skew (in μs)"
depends on CLOCKSOURCE_WATCHDOG
range 50 1000
default 100
default 125
help
Specify the maximum amount of allowable watchdog skew in
microseconds before reporting the clocksource to be unstable.
The default is based on a half-second clocksource watchdog
interval and NTP's maximum frequency drift of 500 parts
per million. If the clocksource is good enough for NTP,
it is good enough for the clocksource watchdog!
endmenu
endif
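
The default of 125 falls out of simple arithmetic, assuming (as the
clocksource.c skew check below does, where md sums the clocksource's and the
watchdog's uncertainty margins) that two margins together cover one interval:

    #include <stdio.h>

    int main(void)
    {
            const long interval_us = 500000;  /* half-second watchdog interval */
            const long ntp_ppm     = 500;     /* NTP's maximum frequency drift */

            /* allowable skew per interval: 500 ppm of 0.5 s = 250 us */
            long total_us = interval_us * ntp_ppm / 1000000;

            /* the skew check sums two margins, so each defaults to half */
            printf("total %ld us -> per-margin default %ld us\n",
                   total_us, total_us / 2);
            return 0;
    }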

kernel/time/clocksource.c

@@ -95,6 +95,11 @@ static char override_name[CS_NAME_LEN];
static int finished_booting;
static u64 suspend_start;
/*
* Interval: 0.5sec.
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
/*
* Threshold: 0.0312s, when doubled: 0.0625s.
* Also a default for cs->uncertainty_margin when registering clocks.
@@ -106,11 +111,14 @@ static u64 suspend_start;
* clocksource surrounding a read of the clocksource being validated.
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
* a lower bound for cs->uncertainty_margin values when registering clocks.
*
* The default of 500 parts per million is based on NTP's limits.
* If a clocksource is good enough for NTP, it is good enough for us!
*/
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#else
#define MAX_SKEW_USEC 100
#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
#endif
#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
@@ -140,11 +148,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)
static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);
/*
* Interval: 0.5sec.
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
static void clocksource_watchdog_work(struct work_struct *work)
{
/*
@@ -257,8 +260,8 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
goto skip_test;
}
pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
smp_processor_id(), watchdog->name, wd_delay, nretries);
pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
return WD_READ_UNSTABLE;
skip_test:
@@ -384,6 +387,15 @@ void clocksource_verify_percpu(struct clocksource *cs)
}
EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
static inline void clocksource_reset_watchdog(void)
{
struct clocksource *cs;
list_for_each_entry(cs, &watchdog_list, wd_list)
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
}
static void clocksource_watchdog(struct timer_list *unused)
{
u64 csnow, wdnow, cslast, wdlast, delta;
@@ -391,6 +403,7 @@ static void clocksource_watchdog(struct timer_list *unused)
int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
enum wd_read_status read_ret;
unsigned long extra_wait = 0;
u32 md;
spin_lock(&watchdog_lock);
@@ -410,13 +423,30 @@ static void clocksource_watchdog(struct timer_list *unused)
read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
if (read_ret != WD_READ_SUCCESS) {
if (read_ret == WD_READ_UNSTABLE)
/* Clock readout unreliable, so give it up. */
__clocksource_unstable(cs);
if (read_ret == WD_READ_UNSTABLE) {
/* Clock readout unreliable, so give it up. */
__clocksource_unstable(cs);
continue;
}
/*
* When WD_READ_SKIP is returned, it means the system is likely
* under very heavy load, where the latency of reading
* watchdog/clocksource is very big, and affect the accuracy of
* watchdog check. So give system some space and suspend the
* watchdog check for 5 minutes.
*/
if (read_ret == WD_READ_SKIP) {
/*
* As the watchdog timer will be suspended, and
* cs->last could keep unchanged for 5 minutes, reset
* the counters.
*/
clocksource_reset_watchdog();
extra_wait = HZ * 300;
break;
}
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
atomic_read(&watchdog_reset_pending)) {
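
Putting numbers on the back-off introduced above (a userspace illustration
assuming HZ=1000; extra_wait feeds into the timer re-arm shown in the last
hunk of this file):

    #include <stdio.h>

    #define HZ                1000        /* assumed tick rate */
    #define WATCHDOG_INTERVAL (HZ >> 1)   /* 0.5 s, as in clocksource.c */

    int main(void)
    {
            unsigned long extra_wait = HZ * 300;  /* set on WD_READ_SKIP */
            unsigned long delay = WATCHDOG_INTERVAL + extra_wait;

            /* 500 + 300000 jiffies: one ordinary interval plus five minutes */
            printf("next check in %lu jiffies (~%lu s)\n", delay, delay / HZ);
            return 0;
    }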
@@ -443,12 +473,20 @@ static void clocksource_watchdog(struct timer_list *unused)
/* Check the deviation from the watchdog clocksource. */
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
if (abs(cs_nsec - wd_nsec) > md) {
u64 cs_wd_msec;
u64 wd_msec;
u32 wd_rem;
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
smp_processor_id(), cs->name);
pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
cs->name, cs_nsec, csnow, cslast, cs->mask);
cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem);
wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem);
pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
if (curr_clocksource == cs)
pr_warn(" '%s' is current clocksource.\n", cs->name);
else if (curr_clocksource)
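
The added one-line summary reports each delta in both ns and ms; a standalone
rendering with hypothetical deltas and clocksource names (plain division
standing in for div_u64_rem()):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            int64_t cs_nsec = 504200000;  /* hypothetical clocksource delta */
            int64_t wd_nsec = 500000000;  /* hypothetical watchdog delta */

            printf("Clocksource 'tsc' skewed %lld ns (%lld ms) over watchdog 'hpet' interval of %lld ns (%lld ms)\n",
                   (long long)(cs_nsec - wd_nsec),
                   (long long)((cs_nsec - wd_nsec) / 1000000),
                   (long long)wd_nsec,
                   (long long)(wd_nsec / 1000000));
            return 0;
    }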
@@ -512,7 +550,7 @@ static void clocksource_watchdog(struct timer_list *unused)
* pair clocksource_stop_watchdog() clocksource_start_watchdog().
*/
if (!timer_pending(&watchdog_timer)) {
watchdog_timer.expires += WATCHDOG_INTERVAL;
watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
add_timer_on(&watchdog_timer, next_cpu);
}
out:
@@ -537,14 +575,6 @@ static inline void clocksource_stop_watchdog(void)
watchdog_running = 0;
}
static inline void clocksource_reset_watchdog(void)
{
struct clocksource *cs;
list_for_each_entry(cs, &watchdog_list, wd_list)
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
}
static void clocksource_resume_watchdog(void)
{
atomic_inc(&watchdog_reset_pending);