// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci#include <linux/kernel.h>
1162306a36Sopenharmony_ci#include <linux/sched.h>
1262306a36Sopenharmony_ci#include <linux/sched/task_stack.h>
1362306a36Sopenharmony_ci#include <linux/entry-common.h>
1462306a36Sopenharmony_ci#include <linux/mm.h>
1562306a36Sopenharmony_ci#include <linux/smp.h>
1662306a36Sopenharmony_ci#include <linux/errno.h>
1762306a36Sopenharmony_ci#include <linux/ptrace.h>
1862306a36Sopenharmony_ci#include <linux/export.h>
1962306a36Sopenharmony_ci#include <linux/nospec.h>
2062306a36Sopenharmony_ci#include <linux/syscalls.h>
2162306a36Sopenharmony_ci#include <linux/uaccess.h>
2262306a36Sopenharmony_ci
2362306a36Sopenharmony_ci#ifdef CONFIG_XEN_PV
2462306a36Sopenharmony_ci#include <xen/xen-ops.h>
2562306a36Sopenharmony_ci#include <xen/events.h>
2662306a36Sopenharmony_ci#endif
2762306a36Sopenharmony_ci
2862306a36Sopenharmony_ci#include <asm/apic.h>
2962306a36Sopenharmony_ci#include <asm/desc.h>
3062306a36Sopenharmony_ci#include <asm/traps.h>
3162306a36Sopenharmony_ci#include <asm/vdso.h>
3262306a36Sopenharmony_ci#include <asm/cpufeature.h>
3362306a36Sopenharmony_ci#include <asm/fpu/api.h>
3462306a36Sopenharmony_ci#include <asm/nospec-branch.h>
3562306a36Sopenharmony_ci#include <asm/io_bitmap.h>
3662306a36Sopenharmony_ci#include <asm/syscall.h>
3762306a36Sopenharmony_ci#include <asm/irq_stack.h>
3862306a36Sopenharmony_ci
3962306a36Sopenharmony_ci#ifdef CONFIG_X86_64
4062306a36Sopenharmony_ci
4162306a36Sopenharmony_cistatic __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
4262306a36Sopenharmony_ci{
4362306a36Sopenharmony_ci	/*
4462306a36Sopenharmony_ci	 * Convert negative numbers to very high and thus out of range
4562306a36Sopenharmony_ci	 * numbers for comparisons.
4662306a36Sopenharmony_ci	 */
4762306a36Sopenharmony_ci	unsigned int unr = nr;
4862306a36Sopenharmony_ci
4962306a36Sopenharmony_ci	if (likely(unr < NR_syscalls)) {
5062306a36Sopenharmony_ci		unr = array_index_nospec(unr, NR_syscalls);
5162306a36Sopenharmony_ci		regs->ax = sys_call_table[unr](regs);
5262306a36Sopenharmony_ci		return true;
5362306a36Sopenharmony_ci	}
5462306a36Sopenharmony_ci	return false;
5562306a36Sopenharmony_ci}
5662306a36Sopenharmony_ci
5762306a36Sopenharmony_cistatic __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
5862306a36Sopenharmony_ci{
5962306a36Sopenharmony_ci	/*
6062306a36Sopenharmony_ci	 * Adjust the starting offset of the table, and convert numbers
6162306a36Sopenharmony_ci	 * < __X32_SYSCALL_BIT to very high and thus out of range
6262306a36Sopenharmony_ci	 * numbers for comparisons.
6362306a36Sopenharmony_ci	 */
6462306a36Sopenharmony_ci	unsigned int xnr = nr - __X32_SYSCALL_BIT;
6562306a36Sopenharmony_ci
6662306a36Sopenharmony_ci	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
6762306a36Sopenharmony_ci		xnr = array_index_nospec(xnr, X32_NR_syscalls);
6862306a36Sopenharmony_ci		regs->ax = x32_sys_call_table[xnr](regs);
6962306a36Sopenharmony_ci		return true;
7062306a36Sopenharmony_ci	}
7162306a36Sopenharmony_ci	return false;
7262306a36Sopenharmony_ci}
7362306a36Sopenharmony_ci
7462306a36Sopenharmony_ci__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
7562306a36Sopenharmony_ci{
7662306a36Sopenharmony_ci	add_random_kstack_offset();
7762306a36Sopenharmony_ci	nr = syscall_enter_from_user_mode(regs, nr);
7862306a36Sopenharmony_ci
7962306a36Sopenharmony_ci	instrumentation_begin();
8062306a36Sopenharmony_ci
8162306a36Sopenharmony_ci	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
8262306a36Sopenharmony_ci		/* Invalid system call, but still a system call. */
8362306a36Sopenharmony_ci		regs->ax = __x64_sys_ni_syscall(regs);
8462306a36Sopenharmony_ci	}
8562306a36Sopenharmony_ci
8662306a36Sopenharmony_ci	instrumentation_end();
8762306a36Sopenharmony_ci	syscall_exit_to_user_mode(regs);
8862306a36Sopenharmony_ci}
8962306a36Sopenharmony_ci#endif
9062306a36Sopenharmony_ci
9162306a36Sopenharmony_ci#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
9262306a36Sopenharmony_cistatic __always_inline int syscall_32_enter(struct pt_regs *regs)
9362306a36Sopenharmony_ci{
9462306a36Sopenharmony_ci	if (IS_ENABLED(CONFIG_IA32_EMULATION))
9562306a36Sopenharmony_ci		current_thread_info()->status |= TS_COMPAT;
9662306a36Sopenharmony_ci
9762306a36Sopenharmony_ci	return (int)regs->orig_ax;
9862306a36Sopenharmony_ci}
9962306a36Sopenharmony_ci
#if defined(CONFIG_IA32_EMULATION)
/*
 * Starts out true and is placed in __ro_after_init data.
 * NOTE(review): presumably the global on/off state of IA32 emulation;
 * nothing in this file reads or writes it - confirm against its users.
 */
bool __ia32_enabled __ro_after_init = true;
#endif /* CONFIG_IA32_EMULATION */
10362306a36Sopenharmony_ci
10462306a36Sopenharmony_ci/*
10562306a36Sopenharmony_ci * Invoke a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.
10662306a36Sopenharmony_ci */
10762306a36Sopenharmony_cistatic __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
10862306a36Sopenharmony_ci{
10962306a36Sopenharmony_ci	/*
11062306a36Sopenharmony_ci	 * Convert negative numbers to very high and thus out of range
11162306a36Sopenharmony_ci	 * numbers for comparisons.
11262306a36Sopenharmony_ci	 */
11362306a36Sopenharmony_ci	unsigned int unr = nr;
11462306a36Sopenharmony_ci
11562306a36Sopenharmony_ci	if (likely(unr < IA32_NR_syscalls)) {
11662306a36Sopenharmony_ci		unr = array_index_nospec(unr, IA32_NR_syscalls);
11762306a36Sopenharmony_ci		regs->ax = ia32_sys_call_table[unr](regs);
11862306a36Sopenharmony_ci	} else if (nr != -1) {
11962306a36Sopenharmony_ci		regs->ax = __ia32_sys_ni_syscall(regs);
12062306a36Sopenharmony_ci	}
12162306a36Sopenharmony_ci}
12262306a36Sopenharmony_ci
12362306a36Sopenharmony_ci#ifdef CONFIG_IA32_EMULATION
12462306a36Sopenharmony_cistatic __always_inline bool int80_is_external(void)
12562306a36Sopenharmony_ci{
12662306a36Sopenharmony_ci	const unsigned int offs = (0x80 / 32) * 0x10;
12762306a36Sopenharmony_ci	const u32 bit = BIT(0x80 % 32);
12862306a36Sopenharmony_ci
12962306a36Sopenharmony_ci	/* The local APIC on XENPV guests is fake */
13062306a36Sopenharmony_ci	if (cpu_feature_enabled(X86_FEATURE_XENPV))
13162306a36Sopenharmony_ci		return false;
13262306a36Sopenharmony_ci
13362306a36Sopenharmony_ci	/*
13462306a36Sopenharmony_ci	 * If vector 0x80 is set in the APIC ISR then this is an external
13562306a36Sopenharmony_ci	 * interrupt. Either from broken hardware or injected by a VMM.
13662306a36Sopenharmony_ci	 *
13762306a36Sopenharmony_ci	 * Note: In guest mode this is only valid for secure guests where
13862306a36Sopenharmony_ci	 * the secure module fully controls the vAPIC exposed to the guest.
13962306a36Sopenharmony_ci	 */
14062306a36Sopenharmony_ci	return apic_read(APIC_ISR + offs) & bit;
14162306a36Sopenharmony_ci}
14262306a36Sopenharmony_ci
14362306a36Sopenharmony_ci/**
14462306a36Sopenharmony_ci * int80_emulation - 32-bit legacy syscall entry
14562306a36Sopenharmony_ci *
14662306a36Sopenharmony_ci * This entry point can be used by 32-bit and 64-bit programs to perform
14762306a36Sopenharmony_ci * 32-bit system calls.  Instances of INT $0x80 can be found inline in
14862306a36Sopenharmony_ci * various programs and libraries.  It is also used by the vDSO's
14962306a36Sopenharmony_ci * __kernel_vsyscall fallback for hardware that doesn't support a faster
15062306a36Sopenharmony_ci * entry method.  Restarted 32-bit system calls also fall back to INT
15162306a36Sopenharmony_ci * $0x80 regardless of what instruction was originally used to do the
15262306a36Sopenharmony_ci * system call.
15362306a36Sopenharmony_ci *
15462306a36Sopenharmony_ci * This is considered a slow path.  It is not used by most libc
15562306a36Sopenharmony_ci * implementations on modern hardware except during process startup.
15662306a36Sopenharmony_ci *
15762306a36Sopenharmony_ci * The arguments for the INT $0x80 based syscall are on stack in the
15862306a36Sopenharmony_ci * pt_regs structure:
15962306a36Sopenharmony_ci *   eax:				system call number
16062306a36Sopenharmony_ci *   ebx, ecx, edx, esi, edi, ebp:	arg1 - arg 6
16162306a36Sopenharmony_ci */
16262306a36Sopenharmony_ciDEFINE_IDTENTRY_RAW(int80_emulation)
16362306a36Sopenharmony_ci{
16462306a36Sopenharmony_ci	int nr;
16562306a36Sopenharmony_ci
16662306a36Sopenharmony_ci	/* Kernel does not use INT $0x80! */
16762306a36Sopenharmony_ci	if (unlikely(!user_mode(regs))) {
16862306a36Sopenharmony_ci		irqentry_enter(regs);
16962306a36Sopenharmony_ci		instrumentation_begin();
17062306a36Sopenharmony_ci		panic("Unexpected external interrupt 0x80\n");
17162306a36Sopenharmony_ci	}
17262306a36Sopenharmony_ci
17362306a36Sopenharmony_ci	/*
17462306a36Sopenharmony_ci	 * Establish kernel context for instrumentation, including for
17562306a36Sopenharmony_ci	 * int80_is_external() below which calls into the APIC driver.
17662306a36Sopenharmony_ci	 * Identical for soft and external interrupts.
17762306a36Sopenharmony_ci	 */
17862306a36Sopenharmony_ci	enter_from_user_mode(regs);
17962306a36Sopenharmony_ci
18062306a36Sopenharmony_ci	instrumentation_begin();
18162306a36Sopenharmony_ci	add_random_kstack_offset();
18262306a36Sopenharmony_ci
18362306a36Sopenharmony_ci	/* Validate that this is a soft interrupt to the extent possible */
18462306a36Sopenharmony_ci	if (unlikely(int80_is_external()))
18562306a36Sopenharmony_ci		panic("Unexpected external interrupt 0x80\n");
18662306a36Sopenharmony_ci
18762306a36Sopenharmony_ci	/*
18862306a36Sopenharmony_ci	 * The low level idtentry code pushed -1 into regs::orig_ax
18962306a36Sopenharmony_ci	 * and regs::ax contains the syscall number.
19062306a36Sopenharmony_ci	 *
19162306a36Sopenharmony_ci	 * User tracing code (ptrace or signal handlers) might assume
19262306a36Sopenharmony_ci	 * that the regs::orig_ax contains a 32-bit number on invoking
19362306a36Sopenharmony_ci	 * a 32-bit syscall.
19462306a36Sopenharmony_ci	 *
19562306a36Sopenharmony_ci	 * Establish the syscall convention by saving the 32bit truncated
19662306a36Sopenharmony_ci	 * syscall number in regs::orig_ax and by invalidating regs::ax.
19762306a36Sopenharmony_ci	 */
19862306a36Sopenharmony_ci	regs->orig_ax = regs->ax & GENMASK(31, 0);
19962306a36Sopenharmony_ci	regs->ax = -ENOSYS;
20062306a36Sopenharmony_ci
20162306a36Sopenharmony_ci	nr = syscall_32_enter(regs);
20262306a36Sopenharmony_ci
20362306a36Sopenharmony_ci	local_irq_enable();
20462306a36Sopenharmony_ci	nr = syscall_enter_from_user_mode_work(regs, nr);
20562306a36Sopenharmony_ci	do_syscall_32_irqs_on(regs, nr);
20662306a36Sopenharmony_ci
20762306a36Sopenharmony_ci	instrumentation_end();
20862306a36Sopenharmony_ci	syscall_exit_to_user_mode(regs);
20962306a36Sopenharmony_ci}
21062306a36Sopenharmony_ci#else /* CONFIG_IA32_EMULATION */
21162306a36Sopenharmony_ci
21262306a36Sopenharmony_ci/* Handles int $0x80 on a 32bit kernel */
21362306a36Sopenharmony_ci__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
21462306a36Sopenharmony_ci{
21562306a36Sopenharmony_ci	int nr = syscall_32_enter(regs);
21662306a36Sopenharmony_ci
21762306a36Sopenharmony_ci	add_random_kstack_offset();
21862306a36Sopenharmony_ci	/*
21962306a36Sopenharmony_ci	 * Subtlety here: if ptrace pokes something larger than 2^31-1 into
22062306a36Sopenharmony_ci	 * orig_ax, the int return value truncates it. This matches
22162306a36Sopenharmony_ci	 * the semantics of syscall_get_nr().
22262306a36Sopenharmony_ci	 */
22362306a36Sopenharmony_ci	nr = syscall_enter_from_user_mode(regs, nr);
22462306a36Sopenharmony_ci	instrumentation_begin();
22562306a36Sopenharmony_ci
22662306a36Sopenharmony_ci	do_syscall_32_irqs_on(regs, nr);
22762306a36Sopenharmony_ci
22862306a36Sopenharmony_ci	instrumentation_end();
22962306a36Sopenharmony_ci	syscall_exit_to_user_mode(regs);
23062306a36Sopenharmony_ci}
23162306a36Sopenharmony_ci#endif /* !CONFIG_IA32_EMULATION */
23262306a36Sopenharmony_ci
23362306a36Sopenharmony_cistatic noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
23462306a36Sopenharmony_ci{
23562306a36Sopenharmony_ci	int nr = syscall_32_enter(regs);
23662306a36Sopenharmony_ci	int res;
23762306a36Sopenharmony_ci
23862306a36Sopenharmony_ci	add_random_kstack_offset();
23962306a36Sopenharmony_ci	/*
24062306a36Sopenharmony_ci	 * This cannot use syscall_enter_from_user_mode() as it has to
24162306a36Sopenharmony_ci	 * fetch EBP before invoking any of the syscall entry work
24262306a36Sopenharmony_ci	 * functions.
24362306a36Sopenharmony_ci	 */
24462306a36Sopenharmony_ci	syscall_enter_from_user_mode_prepare(regs);
24562306a36Sopenharmony_ci
24662306a36Sopenharmony_ci	instrumentation_begin();
24762306a36Sopenharmony_ci	/* Fetch EBP from where the vDSO stashed it. */
24862306a36Sopenharmony_ci	if (IS_ENABLED(CONFIG_X86_64)) {
24962306a36Sopenharmony_ci		/*
25062306a36Sopenharmony_ci		 * Micro-optimization: the pointer we're following is
25162306a36Sopenharmony_ci		 * explicitly 32 bits, so it can't be out of range.
25262306a36Sopenharmony_ci		 */
25362306a36Sopenharmony_ci		res = __get_user(*(u32 *)&regs->bp,
25462306a36Sopenharmony_ci			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
25562306a36Sopenharmony_ci	} else {
25662306a36Sopenharmony_ci		res = get_user(*(u32 *)&regs->bp,
25762306a36Sopenharmony_ci		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
25862306a36Sopenharmony_ci	}
25962306a36Sopenharmony_ci
26062306a36Sopenharmony_ci	if (res) {
26162306a36Sopenharmony_ci		/* User code screwed up. */
26262306a36Sopenharmony_ci		regs->ax = -EFAULT;
26362306a36Sopenharmony_ci
26462306a36Sopenharmony_ci		local_irq_disable();
26562306a36Sopenharmony_ci		instrumentation_end();
26662306a36Sopenharmony_ci		irqentry_exit_to_user_mode(regs);
26762306a36Sopenharmony_ci		return false;
26862306a36Sopenharmony_ci	}
26962306a36Sopenharmony_ci
27062306a36Sopenharmony_ci	nr = syscall_enter_from_user_mode_work(regs, nr);
27162306a36Sopenharmony_ci
27262306a36Sopenharmony_ci	/* Now this is just like a normal syscall. */
27362306a36Sopenharmony_ci	do_syscall_32_irqs_on(regs, nr);
27462306a36Sopenharmony_ci
27562306a36Sopenharmony_ci	instrumentation_end();
27662306a36Sopenharmony_ci	syscall_exit_to_user_mode(regs);
27762306a36Sopenharmony_ci	return true;
27862306a36Sopenharmony_ci}
27962306a36Sopenharmony_ci
28062306a36Sopenharmony_ci/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
28162306a36Sopenharmony_ci__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
28262306a36Sopenharmony_ci{
28362306a36Sopenharmony_ci	/*
28462306a36Sopenharmony_ci	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
28562306a36Sopenharmony_ci	 * convention.  Adjust regs so it looks like we entered using int80.
28662306a36Sopenharmony_ci	 */
28762306a36Sopenharmony_ci	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
28862306a36Sopenharmony_ci					vdso_image_32.sym_int80_landing_pad;
28962306a36Sopenharmony_ci
29062306a36Sopenharmony_ci	/*
29162306a36Sopenharmony_ci	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
29262306a36Sopenharmony_ci	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
29362306a36Sopenharmony_ci	 * Fix it up.
29462306a36Sopenharmony_ci	 */
29562306a36Sopenharmony_ci	regs->ip = landing_pad;
29662306a36Sopenharmony_ci
29762306a36Sopenharmony_ci	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
29862306a36Sopenharmony_ci	if (!__do_fast_syscall_32(regs))
29962306a36Sopenharmony_ci		return 0;
30062306a36Sopenharmony_ci
30162306a36Sopenharmony_ci#ifdef CONFIG_X86_64
30262306a36Sopenharmony_ci	/*
30362306a36Sopenharmony_ci	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
30462306a36Sopenharmony_ci	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
30562306a36Sopenharmony_ci	 * bother with SYSEXIT.
30662306a36Sopenharmony_ci	 *
30762306a36Sopenharmony_ci	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
30862306a36Sopenharmony_ci	 * because the ECX fixup above will ensure that this is essentially
30962306a36Sopenharmony_ci	 * never the case.
31062306a36Sopenharmony_ci	 */
31162306a36Sopenharmony_ci	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
31262306a36Sopenharmony_ci		regs->ip == landing_pad &&
31362306a36Sopenharmony_ci		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
31462306a36Sopenharmony_ci#else
31562306a36Sopenharmony_ci	/*
31662306a36Sopenharmony_ci	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
31762306a36Sopenharmony_ci	 *
31862306a36Sopenharmony_ci	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
31962306a36Sopenharmony_ci	 * because the ECX fixup above will ensure that this is essentially
32062306a36Sopenharmony_ci	 * never the case.
32162306a36Sopenharmony_ci	 *
32262306a36Sopenharmony_ci	 * We don't allow syscalls at all from VM86 mode, but we still
32362306a36Sopenharmony_ci	 * need to check VM, because we might be returning from sys_vm86.
32462306a36Sopenharmony_ci	 */
32562306a36Sopenharmony_ci	return static_cpu_has(X86_FEATURE_SEP) &&
32662306a36Sopenharmony_ci		regs->cs == __USER_CS && regs->ss == __USER_DS &&
32762306a36Sopenharmony_ci		regs->ip == landing_pad &&
32862306a36Sopenharmony_ci		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
32962306a36Sopenharmony_ci#endif
33062306a36Sopenharmony_ci}
33162306a36Sopenharmony_ci
33262306a36Sopenharmony_ci/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
33362306a36Sopenharmony_ci__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
33462306a36Sopenharmony_ci{
33562306a36Sopenharmony_ci	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
33662306a36Sopenharmony_ci	regs->sp = regs->bp;
33762306a36Sopenharmony_ci
33862306a36Sopenharmony_ci	/* SYSENTER clobbers EFLAGS.IF.  Assume it was set in usermode. */
33962306a36Sopenharmony_ci	regs->flags |= X86_EFLAGS_IF;
34062306a36Sopenharmony_ci
34162306a36Sopenharmony_ci	return do_fast_syscall_32(regs);
34262306a36Sopenharmony_ci}
34362306a36Sopenharmony_ci#endif
34462306a36Sopenharmony_ci
34562306a36Sopenharmony_ciSYSCALL_DEFINE0(ni_syscall)
34662306a36Sopenharmony_ci{
34762306a36Sopenharmony_ci	return -ENOSYS;
34862306a36Sopenharmony_ci}
34962306a36Sopenharmony_ci
35062306a36Sopenharmony_ci#ifdef CONFIG_XEN_PV
35162306a36Sopenharmony_ci#ifndef CONFIG_PREEMPTION
/*
 * Some of the hypercalls issued by the toolstack can run for many tens of
 * seconds.  Tasks executing hypercalls through the privcmd driver are
 * therefore allowed to be preempted voluntarily, even when full kernel
 * preemption is disabled.
 *
 * A preemptible hypercall is bracketed by xen_preemptible_hcall_begin()
 * and xen_preemptible_hcall_end() calls.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
36462306a36Sopenharmony_ci
36562306a36Sopenharmony_ci/*
36662306a36Sopenharmony_ci * In case of scheduling the flag must be cleared and restored after
36762306a36Sopenharmony_ci * returning from schedule as the task might move to a different CPU.
36862306a36Sopenharmony_ci */
36962306a36Sopenharmony_cistatic __always_inline bool get_and_clear_inhcall(void)
37062306a36Sopenharmony_ci{
37162306a36Sopenharmony_ci	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
37262306a36Sopenharmony_ci
37362306a36Sopenharmony_ci	__this_cpu_write(xen_in_preemptible_hcall, false);
37462306a36Sopenharmony_ci	return inhcall;
37562306a36Sopenharmony_ci}
37662306a36Sopenharmony_ci
37762306a36Sopenharmony_cistatic __always_inline void restore_inhcall(bool inhcall)
37862306a36Sopenharmony_ci{
37962306a36Sopenharmony_ci	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
38062306a36Sopenharmony_ci}
38162306a36Sopenharmony_ci#else
38262306a36Sopenharmony_cistatic __always_inline bool get_and_clear_inhcall(void) { return false; }
38362306a36Sopenharmony_cistatic __always_inline void restore_inhcall(bool inhcall) { }
38462306a36Sopenharmony_ci#endif
38562306a36Sopenharmony_ci
38662306a36Sopenharmony_cistatic void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
38762306a36Sopenharmony_ci{
38862306a36Sopenharmony_ci	struct pt_regs *old_regs = set_irq_regs(regs);
38962306a36Sopenharmony_ci
39062306a36Sopenharmony_ci	inc_irq_stat(irq_hv_callback_count);
39162306a36Sopenharmony_ci
39262306a36Sopenharmony_ci	xen_evtchn_do_upcall();
39362306a36Sopenharmony_ci
39462306a36Sopenharmony_ci	set_irq_regs(old_regs);
39562306a36Sopenharmony_ci}
39662306a36Sopenharmony_ci
39762306a36Sopenharmony_ci__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
39862306a36Sopenharmony_ci{
39962306a36Sopenharmony_ci	irqentry_state_t state = irqentry_enter(regs);
40062306a36Sopenharmony_ci	bool inhcall;
40162306a36Sopenharmony_ci
40262306a36Sopenharmony_ci	instrumentation_begin();
40362306a36Sopenharmony_ci	run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
40462306a36Sopenharmony_ci
40562306a36Sopenharmony_ci	inhcall = get_and_clear_inhcall();
40662306a36Sopenharmony_ci	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
40762306a36Sopenharmony_ci		irqentry_exit_cond_resched();
40862306a36Sopenharmony_ci		instrumentation_end();
40962306a36Sopenharmony_ci		restore_inhcall(inhcall);
41062306a36Sopenharmony_ci	} else {
41162306a36Sopenharmony_ci		instrumentation_end();
41262306a36Sopenharmony_ci		irqentry_exit(regs, state);
41362306a36Sopenharmony_ci	}
41462306a36Sopenharmony_ci}
41562306a36Sopenharmony_ci#endif /* CONFIG_XEN_PV */
416