162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * hangcheck-timer.c 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * Driver for a little io fencing timer. 662306a36Sopenharmony_ci * 762306a36Sopenharmony_ci * Copyright (C) 2002, 2003 Oracle. All rights reserved. 862306a36Sopenharmony_ci * 962306a36Sopenharmony_ci * Author: Joel Becker <joel.becker@oracle.com> 1062306a36Sopenharmony_ci */ 1162306a36Sopenharmony_ci 1262306a36Sopenharmony_ci/* 1362306a36Sopenharmony_ci * The hangcheck-timer driver uses the TSC to catch delays that 1462306a36Sopenharmony_ci * jiffies does not notice. A timer is set. When the timer fires, it 1562306a36Sopenharmony_ci * checks whether it was delayed and if that delay exceeds a given 1662306a36Sopenharmony_ci * margin of error. The hangcheck_tick module parameter takes the timer 1762306a36Sopenharmony_ci * duration in seconds. The hangcheck_margin parameter defines the 1862306a36Sopenharmony_ci * margin of error, in seconds. The defaults are 60 seconds for the 1962306a36Sopenharmony_ci * timer and 180 seconds for the margin of error. IOW, a timer is set 2062306a36Sopenharmony_ci * for 60 seconds. When the timer fires, the callback checks the 2162306a36Sopenharmony_ci * actual duration that the timer waited. If the duration exceeds the 2262306a36Sopenharmony_ci * allotted time and margin (here 60 + 180, or 240 seconds), the machine 2362306a36Sopenharmony_ci * is restarted. A healthy machine will have the duration match the 2462306a36Sopenharmony_ci * expected timeout very closely. 2562306a36Sopenharmony_ci */ 2662306a36Sopenharmony_ci 2762306a36Sopenharmony_ci#include <linux/module.h> 2862306a36Sopenharmony_ci#include <linux/moduleparam.h> 2962306a36Sopenharmony_ci#include <linux/types.h> 3062306a36Sopenharmony_ci#include <linux/kernel.h> 3162306a36Sopenharmony_ci#include <linux/fs.h> 3262306a36Sopenharmony_ci#include <linux/mm.h> 3362306a36Sopenharmony_ci#include <linux/reboot.h> 3462306a36Sopenharmony_ci#include <linux/init.h> 3562306a36Sopenharmony_ci#include <linux/delay.h> 3662306a36Sopenharmony_ci#include <linux/uaccess.h> 3762306a36Sopenharmony_ci#include <linux/sysrq.h> 3862306a36Sopenharmony_ci#include <linux/timer.h> 3962306a36Sopenharmony_ci#include <linux/hrtimer.h> 4062306a36Sopenharmony_ci 4162306a36Sopenharmony_ci#define VERSION_STR "0.9.1" 4262306a36Sopenharmony_ci 4362306a36Sopenharmony_ci#define DEFAULT_IOFENCE_MARGIN 60 /* Default fudge factor, in seconds */ 4462306a36Sopenharmony_ci#define DEFAULT_IOFENCE_TICK 180 /* Default timer timeout, in seconds */ 4562306a36Sopenharmony_ci 4662306a36Sopenharmony_cistatic int hangcheck_tick = DEFAULT_IOFENCE_TICK; 4762306a36Sopenharmony_cistatic int hangcheck_margin = DEFAULT_IOFENCE_MARGIN; 4862306a36Sopenharmony_cistatic int hangcheck_reboot; /* Defaults to not reboot */ 4962306a36Sopenharmony_cistatic int hangcheck_dump_tasks; /* Defaults to not dumping SysRQ T */ 5062306a36Sopenharmony_ci 5162306a36Sopenharmony_ci/* options - modular */ 5262306a36Sopenharmony_cimodule_param(hangcheck_tick, int, 0); 5362306a36Sopenharmony_ciMODULE_PARM_DESC(hangcheck_tick, "Timer delay."); 5462306a36Sopenharmony_cimodule_param(hangcheck_margin, int, 0); 5562306a36Sopenharmony_ciMODULE_PARM_DESC(hangcheck_margin, "If the hangcheck timer has been delayed more than hangcheck_margin seconds, the driver will fire."); 5662306a36Sopenharmony_cimodule_param(hangcheck_reboot, int, 0); 5762306a36Sopenharmony_ciMODULE_PARM_DESC(hangcheck_reboot, "If nonzero, the machine will reboot when the timer margin is exceeded."); 5862306a36Sopenharmony_cimodule_param(hangcheck_dump_tasks, int, 0); 5962306a36Sopenharmony_ciMODULE_PARM_DESC(hangcheck_dump_tasks, "If nonzero, the machine will dump the system task state when the timer margin is exceeded."); 6062306a36Sopenharmony_ci 6162306a36Sopenharmony_ciMODULE_AUTHOR("Oracle"); 6262306a36Sopenharmony_ciMODULE_DESCRIPTION("Hangcheck-timer detects when the system has gone out to lunch past a certain margin."); 6362306a36Sopenharmony_ciMODULE_LICENSE("GPL"); 6462306a36Sopenharmony_ciMODULE_VERSION(VERSION_STR); 6562306a36Sopenharmony_ci 6662306a36Sopenharmony_ci/* options - nonmodular */ 6762306a36Sopenharmony_ci#ifndef MODULE 6862306a36Sopenharmony_ci 6962306a36Sopenharmony_cistatic int __init hangcheck_parse_tick(char *str) 7062306a36Sopenharmony_ci{ 7162306a36Sopenharmony_ci int par; 7262306a36Sopenharmony_ci if (get_option(&str,&par)) 7362306a36Sopenharmony_ci hangcheck_tick = par; 7462306a36Sopenharmony_ci return 1; 7562306a36Sopenharmony_ci} 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_cistatic int __init hangcheck_parse_margin(char *str) 7862306a36Sopenharmony_ci{ 7962306a36Sopenharmony_ci int par; 8062306a36Sopenharmony_ci if (get_option(&str,&par)) 8162306a36Sopenharmony_ci hangcheck_margin = par; 8262306a36Sopenharmony_ci return 1; 8362306a36Sopenharmony_ci} 8462306a36Sopenharmony_ci 8562306a36Sopenharmony_cistatic int __init hangcheck_parse_reboot(char *str) 8662306a36Sopenharmony_ci{ 8762306a36Sopenharmony_ci int par; 8862306a36Sopenharmony_ci if (get_option(&str,&par)) 8962306a36Sopenharmony_ci hangcheck_reboot = par; 9062306a36Sopenharmony_ci return 1; 9162306a36Sopenharmony_ci} 9262306a36Sopenharmony_ci 9362306a36Sopenharmony_cistatic int __init hangcheck_parse_dump_tasks(char *str) 9462306a36Sopenharmony_ci{ 9562306a36Sopenharmony_ci int par; 9662306a36Sopenharmony_ci if (get_option(&str,&par)) 9762306a36Sopenharmony_ci hangcheck_dump_tasks = par; 9862306a36Sopenharmony_ci return 1; 9962306a36Sopenharmony_ci} 10062306a36Sopenharmony_ci 10162306a36Sopenharmony_ci__setup("hcheck_tick", hangcheck_parse_tick); 10262306a36Sopenharmony_ci__setup("hcheck_margin", hangcheck_parse_margin); 10362306a36Sopenharmony_ci__setup("hcheck_reboot", hangcheck_parse_reboot); 10462306a36Sopenharmony_ci__setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); 10562306a36Sopenharmony_ci#endif /* not MODULE */ 10662306a36Sopenharmony_ci 10762306a36Sopenharmony_ci#define TIMER_FREQ 1000000000ULL 10862306a36Sopenharmony_ci 10962306a36Sopenharmony_ci/* Last time scheduled */ 11062306a36Sopenharmony_cistatic unsigned long long hangcheck_tsc, hangcheck_tsc_margin; 11162306a36Sopenharmony_ci 11262306a36Sopenharmony_cistatic void hangcheck_fire(struct timer_list *); 11362306a36Sopenharmony_ci 11462306a36Sopenharmony_cistatic DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire); 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_cistatic void hangcheck_fire(struct timer_list *unused) 11762306a36Sopenharmony_ci{ 11862306a36Sopenharmony_ci unsigned long long cur_tsc, tsc_diff; 11962306a36Sopenharmony_ci 12062306a36Sopenharmony_ci cur_tsc = ktime_get_ns(); 12162306a36Sopenharmony_ci 12262306a36Sopenharmony_ci if (cur_tsc > hangcheck_tsc) 12362306a36Sopenharmony_ci tsc_diff = cur_tsc - hangcheck_tsc; 12462306a36Sopenharmony_ci else 12562306a36Sopenharmony_ci tsc_diff = (cur_tsc + (~0ULL - hangcheck_tsc)); /* or something */ 12662306a36Sopenharmony_ci 12762306a36Sopenharmony_ci if (tsc_diff > hangcheck_tsc_margin) { 12862306a36Sopenharmony_ci if (hangcheck_dump_tasks) { 12962306a36Sopenharmony_ci printk(KERN_CRIT "Hangcheck: Task state:\n"); 13062306a36Sopenharmony_ci#ifdef CONFIG_MAGIC_SYSRQ 13162306a36Sopenharmony_ci handle_sysrq('t'); 13262306a36Sopenharmony_ci#endif /* CONFIG_MAGIC_SYSRQ */ 13362306a36Sopenharmony_ci } 13462306a36Sopenharmony_ci if (hangcheck_reboot) { 13562306a36Sopenharmony_ci printk(KERN_CRIT "Hangcheck: hangcheck is restarting the machine.\n"); 13662306a36Sopenharmony_ci emergency_restart(); 13762306a36Sopenharmony_ci } else { 13862306a36Sopenharmony_ci printk(KERN_CRIT "Hangcheck: hangcheck value past margin!\n"); 13962306a36Sopenharmony_ci } 14062306a36Sopenharmony_ci } 14162306a36Sopenharmony_ci#if 0 14262306a36Sopenharmony_ci /* 14362306a36Sopenharmony_ci * Enable to investigate delays in detail 14462306a36Sopenharmony_ci */ 14562306a36Sopenharmony_ci printk("Hangcheck: called %Ld ns since last time (%Ld ns overshoot)\n", 14662306a36Sopenharmony_ci tsc_diff, tsc_diff - hangcheck_tick*TIMER_FREQ); 14762306a36Sopenharmony_ci#endif 14862306a36Sopenharmony_ci mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 14962306a36Sopenharmony_ci hangcheck_tsc = ktime_get_ns(); 15062306a36Sopenharmony_ci} 15162306a36Sopenharmony_ci 15262306a36Sopenharmony_ci 15362306a36Sopenharmony_cistatic int __init hangcheck_init(void) 15462306a36Sopenharmony_ci{ 15562306a36Sopenharmony_ci printk("Hangcheck: starting hangcheck timer %s (tick is %d seconds, margin is %d seconds).\n", 15662306a36Sopenharmony_ci VERSION_STR, hangcheck_tick, hangcheck_margin); 15762306a36Sopenharmony_ci hangcheck_tsc_margin = 15862306a36Sopenharmony_ci (unsigned long long)hangcheck_margin + hangcheck_tick; 15962306a36Sopenharmony_ci hangcheck_tsc_margin *= TIMER_FREQ; 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_ci hangcheck_tsc = ktime_get_ns(); 16262306a36Sopenharmony_ci mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 16362306a36Sopenharmony_ci 16462306a36Sopenharmony_ci return 0; 16562306a36Sopenharmony_ci} 16662306a36Sopenharmony_ci 16762306a36Sopenharmony_ci 16862306a36Sopenharmony_cistatic void __exit hangcheck_exit(void) 16962306a36Sopenharmony_ci{ 17062306a36Sopenharmony_ci del_timer_sync(&hangcheck_ticktock); 17162306a36Sopenharmony_ci printk("Hangcheck: Stopped hangcheck timer.\n"); 17262306a36Sopenharmony_ci} 17362306a36Sopenharmony_ci 17462306a36Sopenharmony_cimodule_init(hangcheck_init); 17562306a36Sopenharmony_cimodule_exit(hangcheck_exit); 176