// SPDX-License-Identifier: GPL-2.0-only
/*
 * hangcheck-timer.c
 *
 * Driver for a little io fencing timer.
 *
 * Copyright (C) 2002, 2003 Oracle.  All rights reserved.
 *
 * Author: Joel Becker <joel.becker@oracle.com>
 */

/*
 * The hangcheck-timer driver uses the nanosecond clock returned by
 * ktime_get_ns() to catch delays that jiffies does not notice.  A timer
 * is set.  When the timer fires, it checks whether it was delayed and
 * whether that delay exceeds a given margin of error.  The hangcheck_tick
 * module parameter takes the timer duration in seconds.  The
 * hangcheck_margin parameter defines the margin of error, in seconds.
 * The defaults are 180 seconds for the timer and 60 seconds for the
 * margin of error.  IOW, a timer is set for 180 seconds.  When the timer
 * fires, the callback checks the actual duration that the timer waited.
 * If the duration exceeds the allotted time and margin (here 180 + 60,
 * or 240 seconds), a critical message is logged and, when
 * hangcheck_reboot is nonzero, the machine is restarted.  A healthy
 * machine will have the duration match the expected timeout very closely.
 */
258c2ecf20Sopenharmony_ci */ 268c2ecf20Sopenharmony_ci 278c2ecf20Sopenharmony_ci#include <linux/module.h> 288c2ecf20Sopenharmony_ci#include <linux/moduleparam.h> 298c2ecf20Sopenharmony_ci#include <linux/types.h> 308c2ecf20Sopenharmony_ci#include <linux/kernel.h> 318c2ecf20Sopenharmony_ci#include <linux/fs.h> 328c2ecf20Sopenharmony_ci#include <linux/mm.h> 338c2ecf20Sopenharmony_ci#include <linux/reboot.h> 348c2ecf20Sopenharmony_ci#include <linux/init.h> 358c2ecf20Sopenharmony_ci#include <linux/delay.h> 368c2ecf20Sopenharmony_ci#include <linux/uaccess.h> 378c2ecf20Sopenharmony_ci#include <linux/sysrq.h> 388c2ecf20Sopenharmony_ci#include <linux/timer.h> 398c2ecf20Sopenharmony_ci#include <linux/hrtimer.h> 408c2ecf20Sopenharmony_ci 418c2ecf20Sopenharmony_ci#define VERSION_STR "0.9.1" 428c2ecf20Sopenharmony_ci 438c2ecf20Sopenharmony_ci#define DEFAULT_IOFENCE_MARGIN 60 /* Default fudge factor, in seconds */ 448c2ecf20Sopenharmony_ci#define DEFAULT_IOFENCE_TICK 180 /* Default timer timeout, in seconds */ 458c2ecf20Sopenharmony_ci 468c2ecf20Sopenharmony_cistatic int hangcheck_tick = DEFAULT_IOFENCE_TICK; 478c2ecf20Sopenharmony_cistatic int hangcheck_margin = DEFAULT_IOFENCE_MARGIN; 488c2ecf20Sopenharmony_cistatic int hangcheck_reboot; /* Defaults to not reboot */ 498c2ecf20Sopenharmony_cistatic int hangcheck_dump_tasks; /* Defaults to not dumping SysRQ T */ 508c2ecf20Sopenharmony_ci 518c2ecf20Sopenharmony_ci/* options - modular */ 528c2ecf20Sopenharmony_cimodule_param(hangcheck_tick, int, 0); 538c2ecf20Sopenharmony_ciMODULE_PARM_DESC(hangcheck_tick, "Timer delay."); 548c2ecf20Sopenharmony_cimodule_param(hangcheck_margin, int, 0); 558c2ecf20Sopenharmony_ciMODULE_PARM_DESC(hangcheck_margin, "If the hangcheck timer has been delayed more than hangcheck_margin seconds, the driver will fire."); 568c2ecf20Sopenharmony_cimodule_param(hangcheck_reboot, int, 0); 578c2ecf20Sopenharmony_ciMODULE_PARM_DESC(hangcheck_reboot, "If nonzero, the machine will reboot when the timer margin is 
exceeded."); 588c2ecf20Sopenharmony_cimodule_param(hangcheck_dump_tasks, int, 0); 598c2ecf20Sopenharmony_ciMODULE_PARM_DESC(hangcheck_dump_tasks, "If nonzero, the machine will dump the system task state when the timer margin is exceeded."); 608c2ecf20Sopenharmony_ci 618c2ecf20Sopenharmony_ciMODULE_AUTHOR("Oracle"); 628c2ecf20Sopenharmony_ciMODULE_DESCRIPTION("Hangcheck-timer detects when the system has gone out to lunch past a certain margin."); 638c2ecf20Sopenharmony_ciMODULE_LICENSE("GPL"); 648c2ecf20Sopenharmony_ciMODULE_VERSION(VERSION_STR); 658c2ecf20Sopenharmony_ci 668c2ecf20Sopenharmony_ci/* options - nonmodular */ 678c2ecf20Sopenharmony_ci#ifndef MODULE 688c2ecf20Sopenharmony_ci 698c2ecf20Sopenharmony_cistatic int __init hangcheck_parse_tick(char *str) 708c2ecf20Sopenharmony_ci{ 718c2ecf20Sopenharmony_ci int par; 728c2ecf20Sopenharmony_ci if (get_option(&str,&par)) 738c2ecf20Sopenharmony_ci hangcheck_tick = par; 748c2ecf20Sopenharmony_ci return 1; 758c2ecf20Sopenharmony_ci} 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_cistatic int __init hangcheck_parse_margin(char *str) 788c2ecf20Sopenharmony_ci{ 798c2ecf20Sopenharmony_ci int par; 808c2ecf20Sopenharmony_ci if (get_option(&str,&par)) 818c2ecf20Sopenharmony_ci hangcheck_margin = par; 828c2ecf20Sopenharmony_ci return 1; 838c2ecf20Sopenharmony_ci} 848c2ecf20Sopenharmony_ci 858c2ecf20Sopenharmony_cistatic int __init hangcheck_parse_reboot(char *str) 868c2ecf20Sopenharmony_ci{ 878c2ecf20Sopenharmony_ci int par; 888c2ecf20Sopenharmony_ci if (get_option(&str,&par)) 898c2ecf20Sopenharmony_ci hangcheck_reboot = par; 908c2ecf20Sopenharmony_ci return 1; 918c2ecf20Sopenharmony_ci} 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_cistatic int __init hangcheck_parse_dump_tasks(char *str) 948c2ecf20Sopenharmony_ci{ 958c2ecf20Sopenharmony_ci int par; 968c2ecf20Sopenharmony_ci if (get_option(&str,&par)) 978c2ecf20Sopenharmony_ci hangcheck_dump_tasks = par; 988c2ecf20Sopenharmony_ci return 1; 998c2ecf20Sopenharmony_ci} 
1008c2ecf20Sopenharmony_ci 1018c2ecf20Sopenharmony_ci__setup("hcheck_tick", hangcheck_parse_tick); 1028c2ecf20Sopenharmony_ci__setup("hcheck_margin", hangcheck_parse_margin); 1038c2ecf20Sopenharmony_ci__setup("hcheck_reboot", hangcheck_parse_reboot); 1048c2ecf20Sopenharmony_ci__setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); 1058c2ecf20Sopenharmony_ci#endif /* not MODULE */ 1068c2ecf20Sopenharmony_ci 1078c2ecf20Sopenharmony_ci#define TIMER_FREQ 1000000000ULL 1088c2ecf20Sopenharmony_ci 1098c2ecf20Sopenharmony_ci/* Last time scheduled */ 1108c2ecf20Sopenharmony_cistatic unsigned long long hangcheck_tsc, hangcheck_tsc_margin; 1118c2ecf20Sopenharmony_ci 1128c2ecf20Sopenharmony_cistatic void hangcheck_fire(struct timer_list *); 1138c2ecf20Sopenharmony_ci 1148c2ecf20Sopenharmony_cistatic DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire); 1158c2ecf20Sopenharmony_ci 1168c2ecf20Sopenharmony_cistatic void hangcheck_fire(struct timer_list *unused) 1178c2ecf20Sopenharmony_ci{ 1188c2ecf20Sopenharmony_ci unsigned long long cur_tsc, tsc_diff; 1198c2ecf20Sopenharmony_ci 1208c2ecf20Sopenharmony_ci cur_tsc = ktime_get_ns(); 1218c2ecf20Sopenharmony_ci 1228c2ecf20Sopenharmony_ci if (cur_tsc > hangcheck_tsc) 1238c2ecf20Sopenharmony_ci tsc_diff = cur_tsc - hangcheck_tsc; 1248c2ecf20Sopenharmony_ci else 1258c2ecf20Sopenharmony_ci tsc_diff = (cur_tsc + (~0ULL - hangcheck_tsc)); /* or something */ 1268c2ecf20Sopenharmony_ci 1278c2ecf20Sopenharmony_ci if (tsc_diff > hangcheck_tsc_margin) { 1288c2ecf20Sopenharmony_ci if (hangcheck_dump_tasks) { 1298c2ecf20Sopenharmony_ci printk(KERN_CRIT "Hangcheck: Task state:\n"); 1308c2ecf20Sopenharmony_ci#ifdef CONFIG_MAGIC_SYSRQ 1318c2ecf20Sopenharmony_ci handle_sysrq('t'); 1328c2ecf20Sopenharmony_ci#endif /* CONFIG_MAGIC_SYSRQ */ 1338c2ecf20Sopenharmony_ci } 1348c2ecf20Sopenharmony_ci if (hangcheck_reboot) { 1358c2ecf20Sopenharmony_ci printk(KERN_CRIT "Hangcheck: hangcheck is restarting the machine.\n"); 1368c2ecf20Sopenharmony_ci 
emergency_restart(); 1378c2ecf20Sopenharmony_ci } else { 1388c2ecf20Sopenharmony_ci printk(KERN_CRIT "Hangcheck: hangcheck value past margin!\n"); 1398c2ecf20Sopenharmony_ci } 1408c2ecf20Sopenharmony_ci } 1418c2ecf20Sopenharmony_ci#if 0 1428c2ecf20Sopenharmony_ci /* 1438c2ecf20Sopenharmony_ci * Enable to investigate delays in detail 1448c2ecf20Sopenharmony_ci */ 1458c2ecf20Sopenharmony_ci printk("Hangcheck: called %Ld ns since last time (%Ld ns overshoot)\n", 1468c2ecf20Sopenharmony_ci tsc_diff, tsc_diff - hangcheck_tick*TIMER_FREQ); 1478c2ecf20Sopenharmony_ci#endif 1488c2ecf20Sopenharmony_ci mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 1498c2ecf20Sopenharmony_ci hangcheck_tsc = ktime_get_ns(); 1508c2ecf20Sopenharmony_ci} 1518c2ecf20Sopenharmony_ci 1528c2ecf20Sopenharmony_ci 1538c2ecf20Sopenharmony_cistatic int __init hangcheck_init(void) 1548c2ecf20Sopenharmony_ci{ 1558c2ecf20Sopenharmony_ci printk("Hangcheck: starting hangcheck timer %s (tick is %d seconds, margin is %d seconds).\n", 1568c2ecf20Sopenharmony_ci VERSION_STR, hangcheck_tick, hangcheck_margin); 1578c2ecf20Sopenharmony_ci hangcheck_tsc_margin = 1588c2ecf20Sopenharmony_ci (unsigned long long)hangcheck_margin + hangcheck_tick; 1598c2ecf20Sopenharmony_ci hangcheck_tsc_margin *= TIMER_FREQ; 1608c2ecf20Sopenharmony_ci 1618c2ecf20Sopenharmony_ci hangcheck_tsc = ktime_get_ns(); 1628c2ecf20Sopenharmony_ci mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 1638c2ecf20Sopenharmony_ci 1648c2ecf20Sopenharmony_ci return 0; 1658c2ecf20Sopenharmony_ci} 1668c2ecf20Sopenharmony_ci 1678c2ecf20Sopenharmony_ci 1688c2ecf20Sopenharmony_cistatic void __exit hangcheck_exit(void) 1698c2ecf20Sopenharmony_ci{ 1708c2ecf20Sopenharmony_ci del_timer_sync(&hangcheck_ticktock); 1718c2ecf20Sopenharmony_ci printk("Hangcheck: Stopped hangcheck timer.\n"); 1728c2ecf20Sopenharmony_ci} 1738c2ecf20Sopenharmony_ci 1748c2ecf20Sopenharmony_cimodule_init(hangcheck_init); 
1758c2ecf20Sopenharmony_cimodule_exit(hangcheck_exit); 176