The new nmi_watchdog (which uses the perf event subsystem) is very similar in structure to the softlockup detector. Using Ingo's suggestion, I combined the two functionalities into one file: kernel/watchdog.c. Now both the nmi_watchdog (or hardlockup detector) and softlockup detector sit on top of the perf event subsystem, which is run every 60 seconds or so to see if there are any lockups. To detect hardlockups, cpus not responding to interrupts, I implemented an hrtimer that runs 5 times for every perf event overflow event. If that stops counting on a cpu, then the cpu is most likely in trouble. To detect softlockups, tasks not yielding to the scheduler, I used the previous kthread idea that now gets kicked every time the hrtimer fires. If the kthread isn't being scheduled neither is anyone else and the warning is printed to the console. I tested this on x86_64 and both the softlockup and hardlockup paths work. V2: - cleaned up the Kconfig and softlockup combination - surrounded hardlockup cases with #ifdef CONFIG_PERF_EVENTS_NMI - seperated out the softlockup case from perf event subsystem - re-arranged the enabling/disabling nmi watchdog from proc space - added cpumasks for hardlockup failure cases - removed fallback to soft events if no PMU exists for hard events V3: - comment cleanups - drop support for older softlockup code - per_cpu cleanups - completely remove software clock base hardlockup detector - use per_cpu masking on hard/soft lockup detection - #ifdef cleanups - rename config option NMI_WATCHDOG to LOCKUP_DETECTOR - documentation additions V4: - documentation fixes - convert per_cpu to __get_cpu_var - powerpc compile fixes V5: - split apart warn flags for hard and soft lockups TODO: - figure out how to make an arch-agnostic clock2cycles call (if possible) to feed into perf events as a sample period [fweisbec: merged conflict patch] Signed-off-by: Don Zickus <dzickus@redhat.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Cyrill Gorcunov <gorcunov@gmail.com> Cc: Eric Paris <eparis@redhat.com> Cc: Randy Dunlap <randy.dunlap@oracle.com> LKML-Reference: <1273266711-18706-2-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
64 lines
1.5 KiB
C
64 lines
1.5 KiB
C
/*
|
|
* linux/include/linux/nmi.h
|
|
*/
|
|
#ifndef LINUX_NMI_H
|
|
#define LINUX_NMI_H
|
|
|
|
#include <linux/sched.h>
|
|
#include <asm/irq.h>
|
|
|
|
/**
|
|
* touch_nmi_watchdog - restart NMI watchdog timeout.
|
|
*
|
|
* If the architecture supports the NMI watchdog, touch_nmi_watchdog()
|
|
* may be used to reset the timeout - for code which intentionally
|
|
* disables interrupts for a long time. This call is stateless.
|
|
*/
|
|
#ifdef ARCH_HAS_NMI_WATCHDOG
|
|
#include <asm/nmi.h>
|
|
extern void touch_nmi_watchdog(void);
|
|
extern void acpi_nmi_disable(void);
|
|
extern void acpi_nmi_enable(void);
|
|
#else
|
|
#ifndef CONFIG_LOCKUP_DETECTOR
|
|
static inline void touch_nmi_watchdog(void)
|
|
{
|
|
touch_softlockup_watchdog();
|
|
}
|
|
#else
|
|
extern void touch_nmi_watchdog(void);
|
|
#endif
|
|
static inline void acpi_nmi_disable(void) { }
|
|
static inline void acpi_nmi_enable(void) { }
|
|
#endif
|
|
|
|
/*
|
|
* Create trigger_all_cpu_backtrace() out of the arch-provided
|
|
* base function. Return whether such support was available,
|
|
* to allow calling code to fall back to some other mechanism:
|
|
*/
|
|
#ifdef arch_trigger_all_cpu_backtrace
|
|
static inline bool trigger_all_cpu_backtrace(void)
|
|
{
|
|
arch_trigger_all_cpu_backtrace();
|
|
|
|
return true;
|
|
}
|
|
#else
|
|
static inline bool trigger_all_cpu_backtrace(void)
|
|
{
|
|
return false;
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_LOCKUP_DETECTOR
|
|
int hw_nmi_is_cpu_stuck(struct pt_regs *);
|
|
u64 hw_nmi_get_sample_period(void);
|
|
extern int watchdog_enabled;
|
|
struct ctl_table;
|
|
extern int proc_dowatchdog_enabled(struct ctl_table *, int ,
|
|
void __user *, size_t *, loff_t *);
|
|
#endif
|
|
|
|
#endif
|