162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * hangcheck-timer.c
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Driver for a little io fencing timer.
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci * Copyright (C) 2002, 2003 Oracle.  All rights reserved.
862306a36Sopenharmony_ci *
962306a36Sopenharmony_ci * Author: Joel Becker <joel.becker@oracle.com>
1062306a36Sopenharmony_ci */
1162306a36Sopenharmony_ci
/*
 * The hangcheck-timer driver uses a nanosecond clock (ktime_get_ns();
 * the variables are still named after the TSC) to catch delays that
 * jiffies does not notice.  A timer is set.  When the timer fires, it
 * checks whether it was delayed and if that delay exceeds a given
 * margin of error.  The hangcheck_tick module parameter takes the timer
 * duration in seconds.  The hangcheck_margin parameter defines the
 * margin of error, in seconds.  The defaults are 180 seconds for the
 * timer and 60 seconds for the margin of error.  IOW, a timer is set
 * for 180 seconds.  When the timer fires, the callback checks the
 * actual duration that the timer waited.  If the duration exceeds the
 * allotted time and margin (here 180 + 60, or 240 seconds), a critical
 * message is logged and, if hangcheck_reboot is nonzero, the machine
 * is restarted.  A healthy machine will have the duration match the
 * expected timeout very closely.
 */
2662306a36Sopenharmony_ci
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/sysrq.h>
#include <linux/timer.h>
#include <linux/hrtimer.h>
4062306a36Sopenharmony_ci
#define VERSION_STR "0.9.1"

/* Built-in defaults, both expressed in seconds. */
#define DEFAULT_IOFENCE_TICK 180	/* timer period */
#define DEFAULT_IOFENCE_MARGIN 60	/* fudge factor allowed on top of the period */

/* Tunables; each one is also exposed as a module parameter below. */
static int hangcheck_margin = DEFAULT_IOFENCE_MARGIN;
static int hangcheck_tick = DEFAULT_IOFENCE_TICK;
static int hangcheck_reboot;		/* zero (the default): do not restart */
static int hangcheck_dump_tasks;	/* zero (the default): no SysRq-T dump */
5062306a36Sopenharmony_ci
5162306a36Sopenharmony_ci/* options - modular */
5262306a36Sopenharmony_cimodule_param(hangcheck_tick, int, 0);
5362306a36Sopenharmony_ciMODULE_PARM_DESC(hangcheck_tick, "Timer delay.");
5462306a36Sopenharmony_cimodule_param(hangcheck_margin, int, 0);
5562306a36Sopenharmony_ciMODULE_PARM_DESC(hangcheck_margin, "If the hangcheck timer has been delayed more than hangcheck_margin seconds, the driver will fire.");
5662306a36Sopenharmony_cimodule_param(hangcheck_reboot, int, 0);
5762306a36Sopenharmony_ciMODULE_PARM_DESC(hangcheck_reboot, "If nonzero, the machine will reboot when the timer margin is exceeded.");
5862306a36Sopenharmony_cimodule_param(hangcheck_dump_tasks, int, 0);
5962306a36Sopenharmony_ciMODULE_PARM_DESC(hangcheck_dump_tasks, "If nonzero, the machine will dump the system task state when the timer margin is exceeded.");
6062306a36Sopenharmony_ci
6162306a36Sopenharmony_ciMODULE_AUTHOR("Oracle");
6262306a36Sopenharmony_ciMODULE_DESCRIPTION("Hangcheck-timer detects when the system has gone out to lunch past a certain margin.");
6362306a36Sopenharmony_ciMODULE_LICENSE("GPL");
6462306a36Sopenharmony_ciMODULE_VERSION(VERSION_STR);
6562306a36Sopenharmony_ci
6662306a36Sopenharmony_ci/* options - nonmodular */
6762306a36Sopenharmony_ci#ifndef MODULE
6862306a36Sopenharmony_ci
6962306a36Sopenharmony_cistatic int __init hangcheck_parse_tick(char *str)
7062306a36Sopenharmony_ci{
7162306a36Sopenharmony_ci	int par;
7262306a36Sopenharmony_ci	if (get_option(&str,&par))
7362306a36Sopenharmony_ci		hangcheck_tick = par;
7462306a36Sopenharmony_ci	return 1;
7562306a36Sopenharmony_ci}
7662306a36Sopenharmony_ci
7762306a36Sopenharmony_cistatic int __init hangcheck_parse_margin(char *str)
7862306a36Sopenharmony_ci{
7962306a36Sopenharmony_ci	int par;
8062306a36Sopenharmony_ci	if (get_option(&str,&par))
8162306a36Sopenharmony_ci		hangcheck_margin = par;
8262306a36Sopenharmony_ci	return 1;
8362306a36Sopenharmony_ci}
8462306a36Sopenharmony_ci
8562306a36Sopenharmony_cistatic int __init hangcheck_parse_reboot(char *str)
8662306a36Sopenharmony_ci{
8762306a36Sopenharmony_ci	int par;
8862306a36Sopenharmony_ci	if (get_option(&str,&par))
8962306a36Sopenharmony_ci		hangcheck_reboot = par;
9062306a36Sopenharmony_ci	return 1;
9162306a36Sopenharmony_ci}
9262306a36Sopenharmony_ci
9362306a36Sopenharmony_cistatic int __init hangcheck_parse_dump_tasks(char *str)
9462306a36Sopenharmony_ci{
9562306a36Sopenharmony_ci	int par;
9662306a36Sopenharmony_ci	if (get_option(&str,&par))
9762306a36Sopenharmony_ci		hangcheck_dump_tasks = par;
9862306a36Sopenharmony_ci	return 1;
9962306a36Sopenharmony_ci}
10062306a36Sopenharmony_ci
10162306a36Sopenharmony_ci__setup("hcheck_tick", hangcheck_parse_tick);
10262306a36Sopenharmony_ci__setup("hcheck_margin", hangcheck_parse_margin);
10362306a36Sopenharmony_ci__setup("hcheck_reboot", hangcheck_parse_reboot);
10462306a36Sopenharmony_ci__setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks);
10562306a36Sopenharmony_ci#endif /* not MODULE */
10662306a36Sopenharmony_ci
/* Multiplier used to turn the second-based tunables into nanoseconds. */
#define TIMER_FREQ 1000000000ULL

/* ktime_get_ns() reading taken when the timer was last armed. */
static unsigned long long hangcheck_tsc;

/* Largest tolerated gap between two callbacks: (margin + tick) * TIMER_FREQ. */
static unsigned long long hangcheck_tsc_margin;

/* Declared ahead of the timer definition, which needs the callback's name. */
static void hangcheck_fire(struct timer_list *);

static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire);
11562306a36Sopenharmony_ci
11662306a36Sopenharmony_cistatic void hangcheck_fire(struct timer_list *unused)
11762306a36Sopenharmony_ci{
11862306a36Sopenharmony_ci	unsigned long long cur_tsc, tsc_diff;
11962306a36Sopenharmony_ci
12062306a36Sopenharmony_ci	cur_tsc = ktime_get_ns();
12162306a36Sopenharmony_ci
12262306a36Sopenharmony_ci	if (cur_tsc > hangcheck_tsc)
12362306a36Sopenharmony_ci		tsc_diff = cur_tsc - hangcheck_tsc;
12462306a36Sopenharmony_ci	else
12562306a36Sopenharmony_ci		tsc_diff = (cur_tsc + (~0ULL - hangcheck_tsc)); /* or something */
12662306a36Sopenharmony_ci
12762306a36Sopenharmony_ci	if (tsc_diff > hangcheck_tsc_margin) {
12862306a36Sopenharmony_ci		if (hangcheck_dump_tasks) {
12962306a36Sopenharmony_ci			printk(KERN_CRIT "Hangcheck: Task state:\n");
13062306a36Sopenharmony_ci#ifdef CONFIG_MAGIC_SYSRQ
13162306a36Sopenharmony_ci			handle_sysrq('t');
13262306a36Sopenharmony_ci#endif  /* CONFIG_MAGIC_SYSRQ */
13362306a36Sopenharmony_ci		}
13462306a36Sopenharmony_ci		if (hangcheck_reboot) {
13562306a36Sopenharmony_ci			printk(KERN_CRIT "Hangcheck: hangcheck is restarting the machine.\n");
13662306a36Sopenharmony_ci			emergency_restart();
13762306a36Sopenharmony_ci		} else {
13862306a36Sopenharmony_ci			printk(KERN_CRIT "Hangcheck: hangcheck value past margin!\n");
13962306a36Sopenharmony_ci		}
14062306a36Sopenharmony_ci	}
14162306a36Sopenharmony_ci#if 0
14262306a36Sopenharmony_ci	/*
14362306a36Sopenharmony_ci	 * Enable to investigate delays in detail
14462306a36Sopenharmony_ci	 */
14562306a36Sopenharmony_ci	printk("Hangcheck: called %Ld ns since last time (%Ld ns overshoot)\n",
14662306a36Sopenharmony_ci			tsc_diff, tsc_diff - hangcheck_tick*TIMER_FREQ);
14762306a36Sopenharmony_ci#endif
14862306a36Sopenharmony_ci	mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));
14962306a36Sopenharmony_ci	hangcheck_tsc = ktime_get_ns();
15062306a36Sopenharmony_ci}
15162306a36Sopenharmony_ci
15262306a36Sopenharmony_ci
15362306a36Sopenharmony_cistatic int __init hangcheck_init(void)
15462306a36Sopenharmony_ci{
15562306a36Sopenharmony_ci	printk("Hangcheck: starting hangcheck timer %s (tick is %d seconds, margin is %d seconds).\n",
15662306a36Sopenharmony_ci	       VERSION_STR, hangcheck_tick, hangcheck_margin);
15762306a36Sopenharmony_ci	hangcheck_tsc_margin =
15862306a36Sopenharmony_ci		(unsigned long long)hangcheck_margin + hangcheck_tick;
15962306a36Sopenharmony_ci	hangcheck_tsc_margin *= TIMER_FREQ;
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci	hangcheck_tsc = ktime_get_ns();
16262306a36Sopenharmony_ci	mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));
16362306a36Sopenharmony_ci
16462306a36Sopenharmony_ci	return 0;
16562306a36Sopenharmony_ci}
16662306a36Sopenharmony_ci
16762306a36Sopenharmony_ci
16862306a36Sopenharmony_cistatic void __exit hangcheck_exit(void)
16962306a36Sopenharmony_ci{
17062306a36Sopenharmony_ci	del_timer_sync(&hangcheck_ticktock);
17162306a36Sopenharmony_ci        printk("Hangcheck: Stopped hangcheck timer.\n");
17262306a36Sopenharmony_ci}
17362306a36Sopenharmony_ci
/* Register the init and exit functions defined above. */
module_init(hangcheck_init);
module_exit(hangcheck_exit);
176