162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-only */
262306a36Sopenharmony_ci/* Copyright 2002 Andi Kleen */
362306a36Sopenharmony_ci
462306a36Sopenharmony_ci#include <linux/linkage.h>
562306a36Sopenharmony_ci#include <linux/cfi_types.h>
662306a36Sopenharmony_ci#include <asm/errno.h>
762306a36Sopenharmony_ci#include <asm/cpufeatures.h>
862306a36Sopenharmony_ci#include <asm/alternative.h>
962306a36Sopenharmony_ci#include <asm/export.h>
1062306a36Sopenharmony_ci
1162306a36Sopenharmony_ci.section .noinstr.text, "ax"
1262306a36Sopenharmony_ci
/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the destination input,
 * which the compiler could/should do much better anyway.
 */
SYM_TYPED_FUNC_START(__memcpy)
	/*
	 * Patched by the alternatives mechanism: the jump to the unrolled
	 * copy in memcpy_orig is the default instruction. On CPUs that
	 * advertise X86_FEATURE_FSRM it is replaced by the empty
	 * alternative, so execution falls through to the string copy below.
	 */
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq	%rdx, %rcx		/* rcx = byte count for the rep prefix */
	movq	%rdi, %rax		/* rax = original destination (return value) */
	rep movsb			/* copy rcx bytes from (%rsi) to (%rdi) */
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy is provided as an alias of __memcpy; the exact linkage of the
 * alias is decided by the SYM_FUNC_ALIAS_MEMFUNC macro.
 */
SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)
4562306a36Sopenharmony_ci
/*
 * memcpy_orig - unrolled copy, reached from __memcpy through its
 * "jmp memcpy_orig" alternative.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * Registers written: rax, rcx, rdx, rsi, rdi, r8-r11 and the flags.
 */
SYM_FUNC_START_LOCAL(memcpy_orig)
	movq	%rdi, %rax		/* return value is set up front */

	/* Fewer than 32 bytes: skip the bulk loops, only the tail runs. */
	cmpq	$32, %rdx
	jb	.Lhandle_tail

	/*
	 * Pick the copy direction from a signed compare of the low address
	 * bytes (taken backward when %sil < %dil). This is the check for a
	 * possible false memory dependence between loads and stores.
	 */
	cmpb	%dil, %sil
	jl	.Lcopy_backward

	/*
	 * Forward bulk copy. The count is biased by -32, so at the top of
	 * the loop rdx == bytes remaining - 32 (>= 0).
	 */
	subq	$32, %rdx
.Lcopy_forward_loop:
	/*
	 * The carry produced here is consumed by the jae at the bottom:
	 * the mov and lea instructions in between leave the flags alone.
	 * A borrow means fewer than 32 bytes remain after this block.
	 */
	subq	$32, %rdx

	/* Load one 32-byte block (4 x 8 bytes), then advance the source. */
	movq	(%rsi), %r8
	movq	8(%rsi), %r9
	movq	16(%rsi), %r10
	movq	24(%rsi), %r11
	leaq	32(%rsi), %rsi

	/* Store the block, then advance the destination. */
	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	leaq	32(%rdi), %rdi
	jae	.Lcopy_forward_loop

	/*
	 * Undo the bias: edx becomes the 0..31 bytes still to copy. The
	 * 32-bit add also clears the upper half of rdx.
	 */
	addl	$32, %edx
	jmp	.Lhandle_tail

.Lcopy_backward:
	/*
	 * Backward bulk copy: point both cursors one past the end of the
	 * buffers and apply the same -32 bias to the count.
	 */
	addq	%rdx, %rsi
	addq	%rdx, %rdi
	subq	$32, %rdx
	/*
	 * At most 3 ALU operations issue in one cycle, so pad with NOPs to
	 * start the loop on a 16-byte boundary.
	 */
	.p2align 4
.Lcopy_backward_loop:
	/* Carry from this subtraction is tested by the jae below. */
	subq	$32, %rdx
	movq	-8(%rsi), %r8
	movq	-16(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-32(%rsi), %r11
	leaq	-32(%rsi), %rsi
	movq	%r8, -8(%rdi)
	movq	%r9, -16(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -32(%rdi)
	leaq	-32(%rdi), %rdi
	jae	.Lcopy_backward_loop

	/*
	 * Undo the bias (edx = 0..31 bytes left, at the head of the
	 * buffers) and move both cursors back to the start of that head
	 * so the tail code can be shared with the forward path.
	 */
	addl	$32, %edx
	subq	%rdx, %rsi
	subq	%rdx, %rdi

.Lhandle_tail:
	/*
	 * Here edx holds 0..31 remaining bytes and rsi/rdi point at their
	 * start. Each size class below loads from the front and from the
	 * back of the remainder (the two ranges may overlap) and performs
	 * all loads before any store.
	 */
	cmpl	$16, %edx
	jb	.Lless_16bytes

	/* 16..31 bytes: first 16 bytes plus last 16 bytes. */
	movq	(%rsi), %r8
	movq	8(%rsi), %r9
	movq	-16(%rsi, %rdx), %r10
	movq	-8(%rsi, %rdx), %r11
	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, -16(%rdi, %rdx)
	movq	%r11, -8(%rdi, %rdx)
	RET

	.p2align 4
.Lless_16bytes:
	cmpl	$8, %edx
	jb	.Lless_8bytes

	/* 8..15 bytes: first 8 bytes plus last 8 bytes. */
	movq	(%rsi), %r8
	movq	-8(%rsi, %rdx), %r9
	movq	%r8, (%rdi)
	movq	%r9, -8(%rdi, %rdx)
	RET

	.p2align 4
.Lless_8bytes:
	cmpl	$4, %edx
	jb	.Lless_4bytes

	/* 4..7 bytes: first 4 bytes plus last 4 bytes. */
	movl	(%rsi), %ecx
	movl	-4(%rsi, %rdx), %r8d
	movl	%ecx, (%rdi)
	movl	%r8d, -4(%rdi, %rdx)
	RET

	.p2align 4
.Lless_4bytes:
	/*
	 * 0..3 bytes. After the decrement edx is the index of the last
	 * byte; a borrow means the count was zero.
	 */
	subl	$1, %edx
	jb	.Ldone

	/*
	 * 1..3 bytes. The load does not touch the flags, so the jz still
	 * sees the result of the subl: ZF set means exactly one byte.
	 */
	movzbl	(%rsi), %ecx
	jz	.Lstore_1byte

	/* 2 or 3 bytes: byte 1 and the last byte (the same byte for 2). */
	movzbq	1(%rsi), %r8
	movzbq	(%rsi, %rdx), %r9
	movb	%r8b, 1(%rdi)
	movb	%r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb	%cl, (%rdi)		/* byte 0, common to the 1..3 byte cases */

.Ldone:
	RET
SYM_FUNC_END(memcpy_orig)
17262306a36Sopenharmony_ci
173