diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 207156f2143c..bc1162895f35 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -21,6 +21,7 @@ void lockup_detector_soft_poweroff(void);
 extern int watchdog_user_enabled;
 extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
+extern int watchdog_hardlockup_miss_thresh;
 
 extern struct cpumask watchdog_cpumask;
 extern unsigned long *watchdog_cpumask_bits;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 431c540bd035..87dd5e0f6968 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -60,6 +60,12 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
 # endif /* CONFIG_SMP */
 
+/*
+ * Number of consecutive missed interrupts before declaring a lockup.
+ * Default to 1 (immediate) for NMI/Perf. Buddy will overwrite this to 3.
+ * No EXPORT needed: both the perf and buddy detectors are always built in.
+ */
+int __read_mostly watchdog_hardlockup_miss_thresh = 1;
+
 /*
  * Should we panic when a soft-lockup or hard-lockup occurs:
@@ -137,6 +143,7 @@ __setup("nmi_watchdog=", hardlockup_panic_setup);
 
 static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
 static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(int, hrtimer_interrupts_missed);
 static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
 static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
 static unsigned long hard_lockup_nmi_warn;
@@ -159,7 +166,7 @@ void watchdog_hardlockup_touch_cpu(unsigned int cpu)
 	per_cpu(watchdog_hardlockup_touched, cpu) = true;
 }
 
-static void watchdog_hardlockup_update(unsigned int cpu)
+static void watchdog_hardlockup_update_reset(unsigned int cpu)
 {
 	int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
 
@@ -169,6 +176,7 @@
 	 * written/read by a single CPU.
 	 */
 	per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+	per_cpu(hrtimer_interrupts_missed, cpu) = 0;
 }
 
 static bool is_hardlockup(unsigned int cpu)
@@ -176,10 +184,14 @@ static bool is_hardlockup(unsigned int cpu)
 	int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
 
 	if (per_cpu(hrtimer_interrupts_saved, cpu) != hrint) {
-		watchdog_hardlockup_update(cpu);
+		watchdog_hardlockup_update_reset(cpu);
 		return false;
 	}
 
+	per_cpu(hrtimer_interrupts_missed, cpu)++;
+	if (per_cpu(hrtimer_interrupts_missed, cpu) < watchdog_hardlockup_miss_thresh)
+		return false;
+
 	return true;
 }
 
@@ -198,7 +210,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 	unsigned long flags;
 
 	if (per_cpu(watchdog_hardlockup_touched, cpu)) {
-		watchdog_hardlockup_update(cpu);
+		watchdog_hardlockup_update_reset(cpu);
 		per_cpu(watchdog_hardlockup_touched, cpu) = false;
 		return;
 	}
diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c
index ee754d767c21..3a1e57080c1c 100644
--- a/kernel/watchdog_buddy.c
+++ b/kernel/watchdog_buddy.c
@@ -21,6 +21,7 @@ static unsigned int watchdog_next_cpu(unsigned int cpu)
 
 int __init watchdog_hardlockup_probe(void)
 {
+	watchdog_hardlockup_miss_thresh = 3;
 	return 0;
 }
 
@@ -86,14 +87,6 @@ void watchdog_buddy_check_hardlockup(int hrtimer_interrupts)
 {
 	unsigned int next_cpu;
 
-	/*
-	 * Test for hardlockups every 3 samples. The sample period is
-	 * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
-	 * watchdog_thresh (over by 20%).
-	 */
-	if (hrtimer_interrupts % 3 != 0)
-		return;
-
 	/* check for a hardlockup on the next CPU */
 	next_cpu = watchdog_next_cpu(smp_processor_id());
 	if (next_cpu >= nr_cpu_ids)