linux-stable/include/linux/nmi.h
Thomas Gleixner 01f0a02701 watchdog/core: Remove the park_in_progress obfuscation
Commit:

  b94f51183b ("kernel/watchdog: prevent false hardlockup on overloaded system")

tries to fix the following issue:

proc_write()
   set_sample_period()    <--- New sample period becoms visible
			  <----- Broken starts
   proc_watchdog_update()
     watchdog_enable_all_cpus()		watchdog_hrtimer_fn()
     update_watchdog_all_cpus()		   restart_timer(sample_period)
        watchdog_park_threads()

					thread->park()
					  disable_nmi()
			  <----- Broken ends

The reason why this is broken is that the update of the watchdog threshold
becomes immediately effective and visible for the hrtimer function which
uses that value to rearm the timer. But the NMI/perf side still uses the
old value up to the point where it is disabled. If the rate has been
lowered then the NMI can run fast enough to 'detect' a hard lockup because
the timer has not fired due to the longer period.

The patch 'fixed' this by adding a variable:

proc_write()
   set_sample_period()
					<----- Broken starts
   proc_watchdog_update()
     watchdog_enable_all_cpus()		watchdog_hrtimer_fn()
     update_watchdog_all_cpus()		   restart_timer(sample_period)
         watchdog_park_threads()
	  park_in_progress = 1
					<----- Broken ends
				        nmi_watchdog()
					  if (park_in_progress)
					     return;

The only effect of this variable was to make the window where the breakage
can hit small enough that it was not longer observable in testing. From a
correctness point of view it is a pointless bandaid which merily papers
over the root cause: the unsychronized update of the variable.

Looking deeper into the related code pathes unearthed similar problems in
the watchdog_start()/stop() functions.

 watchdog_start()
	perf_nmi_event_start()
	hrtimer_start()

 watchdog_stop()
	hrtimer_cancel()
	perf_nmi_event_stop()

In both cases the call order is wrong because if the tasks gets preempted
or the VM gets scheduled out long enough after the first call, then there is
a chance that the next NMI will see a stale hrtimer interrupt count and
trigger a false positive hard lockup splat.

Get rid of park_in_progress so the code can be gradually deobfuscated and
pruned from several layers of duct tape papering over the root cause,
which has been either ignored or not understood at all.

Once this is removed the underlying problem will be fixed by rewriting the
proc interface to do a proper synchronized update.

Address the start/stop() ordering problem as well by reverting the call
order, so this part is at least correct now.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1709052038270.2393@nanos
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-09-14 11:41:05 +02:00

205 lines
5.8 KiB
C

/*
* linux/include/linux/nmi.h
*/
#ifndef LINUX_NMI_H
#define LINUX_NMI_H
#include <linux/sched.h>
#include <asm/irq.h>
#if defined(CONFIG_HAVE_NMI_WATCHDOG)
#include <asm/nmi.h>
#endif
#ifdef CONFIG_LOCKUP_DETECTOR
void lockup_detector_init(void);
void lockup_detector_soft_poweroff(void);
void lockup_detector_cleanup(void);
#else
static inline void lockup_detector_init(void) { }
static inline void lockup_detector_soft_poweroff(void) { }
static inline void lockup_detector_cleanup(void) { }
#endif
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
extern void touch_softlockup_watchdog_sched(void);
extern void touch_softlockup_watchdog(void);
extern void touch_softlockup_watchdog_sync(void);
extern void touch_all_softlockup_watchdogs(void);
extern unsigned int softlockup_panic;
extern int soft_watchdog_enabled;
#else
static inline void touch_softlockup_watchdog_sched(void)
{
}
static inline void touch_softlockup_watchdog(void)
{
}
static inline void touch_softlockup_watchdog_sync(void)
{
}
static inline void touch_all_softlockup_watchdogs(void)
{
}
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
void reset_hung_task_detector(void);
#else
static inline void reset_hung_task_detector(void)
{
}
#endif
/*
* The run state of the lockup detectors is controlled by the content of the
* 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
* bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
*
* 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
* are variables that are only used as an 'interface' between the parameters
* in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
* 'watchdog_thresh' variable is handled differently because its value is not
* boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
* is equal zero.
*/
#define NMI_WATCHDOG_ENABLED_BIT 0
#define SOFT_WATCHDOG_ENABLED_BIT 1
#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
#if defined(CONFIG_HARDLOCKUP_DETECTOR)
extern void hardlockup_detector_disable(void);
extern unsigned int hardlockup_panic;
#else
static inline void hardlockup_detector_disable(void) {}
#endif
#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
extern void arch_touch_nmi_watchdog(void);
extern void hardlockup_detector_perf_stop(void);
extern void hardlockup_detector_perf_restart(void);
extern void hardlockup_detector_perf_disable(void);
extern void hardlockup_detector_perf_cleanup(void);
#else
static inline void hardlockup_detector_perf_stop(void) { }
static inline void hardlockup_detector_perf_restart(void) { }
static inline void hardlockup_detector_perf_disable(void) { }
static inline void hardlockup_detector_perf_cleanup(void) { }
#if !defined(CONFIG_HAVE_NMI_WATCHDOG)
static inline void arch_touch_nmi_watchdog(void) {}
#endif
#endif
/**
* touch_nmi_watchdog - restart NMI watchdog timeout.
*
* If the architecture supports the NMI watchdog, touch_nmi_watchdog()
* may be used to reset the timeout - for code which intentionally
* disables interrupts for a long time. This call is stateless.
*/
static inline void touch_nmi_watchdog(void)
{
arch_touch_nmi_watchdog();
touch_softlockup_watchdog();
}
/*
* Create trigger_all_cpu_backtrace() out of the arch-provided
* base function. Return whether such support was available,
* to allow calling code to fall back to some other mechanism:
*/
#ifdef arch_trigger_cpumask_backtrace
static inline bool trigger_all_cpu_backtrace(void)
{
arch_trigger_cpumask_backtrace(cpu_online_mask, false);
return true;
}
static inline bool trigger_allbutself_cpu_backtrace(void)
{
arch_trigger_cpumask_backtrace(cpu_online_mask, true);
return true;
}
static inline bool trigger_cpumask_backtrace(struct cpumask *mask)
{
arch_trigger_cpumask_backtrace(mask, false);
return true;
}
static inline bool trigger_single_cpu_backtrace(int cpu)
{
arch_trigger_cpumask_backtrace(cpumask_of(cpu), false);
return true;
}
/* generic implementation */
void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
bool exclude_self,
void (*raise)(cpumask_t *mask));
bool nmi_cpu_backtrace(struct pt_regs *regs);
#else
static inline bool trigger_all_cpu_backtrace(void)
{
return false;
}
static inline bool trigger_allbutself_cpu_backtrace(void)
{
return false;
}
static inline bool trigger_cpumask_backtrace(struct cpumask *mask)
{
return false;
}
static inline bool trigger_single_cpu_backtrace(int cpu)
{
return false;
}
#endif
#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
u64 hw_nmi_get_sample_period(int watchdog_thresh);
#endif
#ifdef CONFIG_LOCKUP_DETECTOR
extern int nmi_watchdog_enabled;
extern int watchdog_user_enabled;
extern int watchdog_thresh;
extern unsigned long watchdog_enabled;
extern struct cpumask watchdog_cpumask;
extern unsigned long *watchdog_cpumask_bits;
#ifdef CONFIG_SMP
extern int sysctl_softlockup_all_cpu_backtrace;
extern int sysctl_hardlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#define sysctl_hardlockup_all_cpu_backtrace 0
#endif
#if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \
defined(CONFIG_HARDLOCKUP_DETECTOR)
void watchdog_update_hrtimer_threshold(u64 period);
#else
static inline void watchdog_update_hrtimer_threshold(u64 period) { }
#endif
extern bool is_hardlockup(void);
struct ctl_table;
extern int proc_watchdog(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
extern int proc_nmi_watchdog(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
extern int proc_soft_watchdog(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
extern int proc_watchdog_thresh(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
extern int proc_watchdog_cpumask(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
#endif
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
#include <asm/nmi.h>
#endif
#endif