18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci *	Precise Delay Loops for i386
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci *	Copyright (C) 1993 Linus Torvalds
68c2ecf20Sopenharmony_ci *	Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
78c2ecf20Sopenharmony_ci *	Copyright (C) 2008 Jiri Hladky <hladky _dot_ jiri _at_ gmail _dot_ com>
88c2ecf20Sopenharmony_ci *
98c2ecf20Sopenharmony_ci *	The __delay function must _NOT_ be inlined as its execution time
108c2ecf20Sopenharmony_ci *	depends wildly on alignment on many x86 processors. The additional
118c2ecf20Sopenharmony_ci *	jump magic is needed to get the timing stable on all the CPU's
128c2ecf20Sopenharmony_ci *	we have to worry about.
138c2ecf20Sopenharmony_ci */
148c2ecf20Sopenharmony_ci
158c2ecf20Sopenharmony_ci#include <linux/export.h>
168c2ecf20Sopenharmony_ci#include <linux/sched.h>
178c2ecf20Sopenharmony_ci#include <linux/timex.h>
188c2ecf20Sopenharmony_ci#include <linux/preempt.h>
198c2ecf20Sopenharmony_ci#include <linux/delay.h>
208c2ecf20Sopenharmony_ci
218c2ecf20Sopenharmony_ci#include <asm/processor.h>
228c2ecf20Sopenharmony_ci#include <asm/delay.h>
238c2ecf20Sopenharmony_ci#include <asm/timer.h>
248c2ecf20Sopenharmony_ci#include <asm/mwait.h>
258c2ecf20Sopenharmony_ci
268c2ecf20Sopenharmony_ci#ifdef CONFIG_SMP
278c2ecf20Sopenharmony_ci# include <asm/smp.h>
288c2ecf20Sopenharmony_ci#endif
298c2ecf20Sopenharmony_ci
/* Out of line on purpose: timing is alignment-sensitive (see file header). */
static void delay_loop(u64 __loops);

/*
 * Calibration and selection of the delay mechanism happens only once
 * during boot.
 */
/* Active delay implementation; starts as the calibration-safe busy loop. */
static void (*delay_fn)(u64) __ro_after_init = delay_loop;
/* Vendor halt primitive invoked by delay_halt() (TPAUSE or MWAITX based). */
static void (*delay_halt_fn)(u64 start, u64 cycles) __ro_after_init;
388c2ecf20Sopenharmony_ci
/*
 * Simple loop based delay.
 *
 * @__loops: number of loop iterations to burn; deliberately truncated to
 *           unsigned long (32 bits on 32-bit kernels).
 *
 * The asm is written so that the loop body's placement and encoding are
 * identical on every build: explicit .align 16 on both jump targets, and
 * the counter pinned to a fixed register via the "a" constraint, so the
 * per-iteration cost stays stable (see the file header comment).
 */
static void delay_loop(u64 __loops)
{
	unsigned long loops = (unsigned long)__loops;

	asm volatile(
		/* Zero count: skip the loop entirely. */
		"	test %0,%0	\n"
		"	jz 3f		\n"
		"	jmp 1f		\n"

		/* Aligned trampoline into the aligned loop. */
		".align 16		\n"
		"1:	jmp 2f		\n"

		".align 16		\n"
		"2:	dec %0		\n"
		"	jnz 2b		\n"
		/* Final extra decrement; result is discarded. */
		"3:	dec %0		\n"

		: "+a" (loops)	/* read-write, pinned to EAX */
		:
	);
}
618c2ecf20Sopenharmony_ci
628c2ecf20Sopenharmony_ci/* TSC based delay: */
/*
 * TSC based delay: busy-wait until at least @cycles TSC cycles have
 * elapsed.  Periodically re-enables preemption so RT tasks can run, and
 * rebalances the deadline if the task migrated to another CPU (TSCs are
 * per-CPU and need not be synchronized).
 */
static void delay_tsc(u64 cycles)
{
	u64 bclock, now;
	int cpu;

	/* Pin to this CPU while sampling, so cpu and bclock stay coherent. */
	preempt_disable();
	cpu = smp_processor_id();
	bclock = rdtsc_ordered();
	for (;;) {
		now = rdtsc_ordered();
		if ((now - bclock) >= cycles)
			break;

		/* Allow RT tasks to run */
		preempt_enable();
		rep_nop();
		preempt_disable();

		/*
		 * It is possible that we moved to another CPU, and
		 * since TSC's are per-cpu we need to calculate
		 * that. The delay must guarantee that we wait "at
		 * least" the amount of time. Being moved to another
		 * CPU could make the wait longer but we just need to
		 * make sure we waited long enough. Rebalance the
		 * counter for this CPU.
		 */
		if (unlikely(cpu != smp_processor_id())) {
			/* Credit time already served on the previous CPU. */
			cycles -= (now - bclock);
			cpu = smp_processor_id();
			bclock = rdtsc_ordered();
		}
	}
	preempt_enable();
}
988c2ecf20Sopenharmony_ci
998c2ecf20Sopenharmony_ci/*
1008c2ecf20Sopenharmony_ci * On Intel the TPAUSE instruction waits until any of:
1018c2ecf20Sopenharmony_ci * 1) the TSC counter exceeds the value provided in EDX:EAX
1028c2ecf20Sopenharmony_ci * 2) global timeout in IA32_UMWAIT_CONTROL is exceeded
1038c2ecf20Sopenharmony_ci * 3) an external interrupt occurs
1048c2ecf20Sopenharmony_ci */
1058c2ecf20Sopenharmony_cistatic void delay_halt_tpause(u64 start, u64 cycles)
1068c2ecf20Sopenharmony_ci{
1078c2ecf20Sopenharmony_ci	u64 until = start + cycles;
1088c2ecf20Sopenharmony_ci	u32 eax, edx;
1098c2ecf20Sopenharmony_ci
1108c2ecf20Sopenharmony_ci	eax = lower_32_bits(until);
1118c2ecf20Sopenharmony_ci	edx = upper_32_bits(until);
1128c2ecf20Sopenharmony_ci
1138c2ecf20Sopenharmony_ci	/*
1148c2ecf20Sopenharmony_ci	 * Hard code the deeper (C0.2) sleep state because exit latency is
1158c2ecf20Sopenharmony_ci	 * small compared to the "microseconds" that usleep() will delay.
1168c2ecf20Sopenharmony_ci	 */
1178c2ecf20Sopenharmony_ci	__tpause(TPAUSE_C02_STATE, edx, eax);
1188c2ecf20Sopenharmony_ci}
1198c2ecf20Sopenharmony_ci
1208c2ecf20Sopenharmony_ci/*
1218c2ecf20Sopenharmony_ci * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
1228c2ecf20Sopenharmony_ci * counts with TSC frequency. The input value is the number of TSC cycles
1238c2ecf20Sopenharmony_ci * to wait. MWAITX will also exit when the timer expires.
1248c2ecf20Sopenharmony_ci */
/*
 * MWAITX-based halt for AMD.
 *
 * @unused:  start TSC value (ignored; MWAITX takes a relative count)
 * @cycles:  TSC cycles to wait
 *
 * The timer register is 32 bits wide, so the wait is clamped to
 * MWAITX_MAX_WAIT_CYCLES; delay_halt() loops and calls us again for the
 * remainder.  MONITORX must be armed before MWAITX is executed.
 */
static void delay_halt_mwaitx(u64 unused, u64 cycles)
{
	u64 delay;

	delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles);
	/*
	 * Use cpu_tss_rw as a cacheline-aligned, seldomly accessed per-cpu
	 * variable as the monitor target.
	 */
	 __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);

	/*
	 * AMD, like Intel, supports the EAX hint and EAX=0xf means, do not
	 * enter any deep C-state and we use it here in delay() to minimize
	 * wakeup latency.
	 */
	__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
}
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_ci/*
1458c2ecf20Sopenharmony_ci * Call a vendor specific function to delay for a given amount of time. Because
1468c2ecf20Sopenharmony_ci * these functions may return earlier than requested, check for actual elapsed
1478c2ecf20Sopenharmony_ci * time and call again until done.
1488c2ecf20Sopenharmony_ci */
1498c2ecf20Sopenharmony_cistatic void delay_halt(u64 __cycles)
1508c2ecf20Sopenharmony_ci{
1518c2ecf20Sopenharmony_ci	u64 start, end, cycles = __cycles;
1528c2ecf20Sopenharmony_ci
1538c2ecf20Sopenharmony_ci	/*
1548c2ecf20Sopenharmony_ci	 * Timer value of 0 causes MWAITX to wait indefinitely, unless there
1558c2ecf20Sopenharmony_ci	 * is a store on the memory monitored by MONITORX.
1568c2ecf20Sopenharmony_ci	 */
1578c2ecf20Sopenharmony_ci	if (!cycles)
1588c2ecf20Sopenharmony_ci		return;
1598c2ecf20Sopenharmony_ci
1608c2ecf20Sopenharmony_ci	start = rdtsc_ordered();
1618c2ecf20Sopenharmony_ci
1628c2ecf20Sopenharmony_ci	for (;;) {
1638c2ecf20Sopenharmony_ci		delay_halt_fn(start, cycles);
1648c2ecf20Sopenharmony_ci		end = rdtsc_ordered();
1658c2ecf20Sopenharmony_ci
1668c2ecf20Sopenharmony_ci		if (cycles <= end - start)
1678c2ecf20Sopenharmony_ci			break;
1688c2ecf20Sopenharmony_ci
1698c2ecf20Sopenharmony_ci		cycles -= end - start;
1708c2ecf20Sopenharmony_ci		start = end;
1718c2ecf20Sopenharmony_ci	}
1728c2ecf20Sopenharmony_ci}
1738c2ecf20Sopenharmony_ci
1748c2ecf20Sopenharmony_civoid __init use_tsc_delay(void)
1758c2ecf20Sopenharmony_ci{
1768c2ecf20Sopenharmony_ci	if (delay_fn == delay_loop)
1778c2ecf20Sopenharmony_ci		delay_fn = delay_tsc;
1788c2ecf20Sopenharmony_ci}
1798c2ecf20Sopenharmony_ci
/* Select TPAUSE-based delays (Intel); boot-time only (__init). */
void __init use_tpause_delay(void)
{
	/* Install the halt callback before publishing delay_halt itself. */
	delay_halt_fn = delay_halt_tpause;
	delay_fn = delay_halt;
}
1858c2ecf20Sopenharmony_ci
/* Select MWAITX-based delays (AMD).  Not __init — presumably callable after boot; confirm against callers. */
void use_mwaitx_delay(void)
{
	/* Install the halt callback before publishing delay_halt itself. */
	delay_halt_fn = delay_halt_mwaitx;
	delay_fn = delay_halt;
}
1918c2ecf20Sopenharmony_ci
1928c2ecf20Sopenharmony_ciint read_current_timer(unsigned long *timer_val)
1938c2ecf20Sopenharmony_ci{
1948c2ecf20Sopenharmony_ci	if (delay_fn == delay_tsc) {
1958c2ecf20Sopenharmony_ci		*timer_val = rdtsc();
1968c2ecf20Sopenharmony_ci		return 0;
1978c2ecf20Sopenharmony_ci	}
1988c2ecf20Sopenharmony_ci	return -1;
1998c2ecf20Sopenharmony_ci}
2008c2ecf20Sopenharmony_ci
/* Delay for @loops units, dispatching to the boot-selected implementation. */
void __delay(unsigned long loops)
{
	delay_fn(loops);
}
EXPORT_SYMBOL(__delay);
2068c2ecf20Sopenharmony_ci
/*
 * Convert a fixed-point delay value into loop/cycle units and delay.
 *
 * @xloops: delay in 2^32 fixed-point units (see __udelay/__ndelay for the
 *          per-usec/per-nsec scale factors).
 *
 * The 32x32->64 widening multiply below computes
 * (xloops * lpj * HZ) >> 32 by keeping only EDX, the high half of the
 * product.  The *4 / (HZ/4) split presumably keeps lpj * (HZ / 4) within
 * 32 bits for large lpj — TODO confirm.
 */
noinline void __const_udelay(unsigned long xloops)
{
	/* Prefer this CPU's calibrated value; fall back to the global one. */
	unsigned long lpj = this_cpu_read(cpu_info.loops_per_jiffy) ? : loops_per_jiffy;
	int d0;

	xloops *= 4;
	/* EDX:EAX = xloops * (lpj * HZ/4); ">> 32" by taking EDX only. */
	asm("mull %%edx"
		:"=d" (xloops), "=&a" (d0)
		:"1" (xloops), "0" (lpj * (HZ / 4)));

	/* Round up by one so we never delay for less than requested. */
	__delay(++xloops);
}
EXPORT_SYMBOL(__const_udelay);
2208c2ecf20Sopenharmony_ci
/* Delay for @usecs microseconds via the fixed-point __const_udelay(). */
void __udelay(unsigned long usecs)
{
	__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);
2268c2ecf20Sopenharmony_ci
/* Delay for @nsecs nanoseconds via the fixed-point __const_udelay(). */
void __ndelay(unsigned long nsecs)
{
	__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);
232