/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

.section .noinstr.text, "ax"

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the destination input,
 * which the compiler could/should do much better anyway.
 */
SYM_TYPED_FUNC_START(__memcpy)
	/*
	 * Without X86_FEATURE_FSRM the jmp stays in place and the copy is
	 * done by memcpy_orig. With FSRM the jmp is replaced by nothing
	 * and execution falls through to the 'rep movsb' below.
	 */
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq %rdi, %rax		/* return value: original destination */
	movq %rdx, %rcx		/* 'rep' takes its byte count from rcx */
	rep movsb		/* copy rcx bytes from (%rsi) to (%rdi) */
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

/* memcpy is an alias of __memcpy; both names are exported. */
SYM_FUNC_ALIAS(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

/*
 * memcpy_orig - Copy a memory block with plain 8/4/1-byte moves.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * Modifies rcx, rdx, rsi, rdi, r8-r11 and the flags.
 *
 * Counts of 32 bytes or more are first copied in 32-byte blocks
 * (forward or backward); the remaining 0..31 bytes are handled by
 * .Lhandle_tail, which always loads everything before storing anything.
 */
SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax		/* return value: original destination */

	cmpq $0x20, %rdx
	jb .Lhandle_tail	/* fewer than 32 bytes: no block loop */

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 * Only the low bytes of the two pointers are compared (signed);
	 * when the source's low byte is the smaller one, copy backward.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
	/*
	 * Bias the count by -0x20 so that a borrow from the subq at the
	 * top of the loop marks the block being copied as the last full one.
	 */
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes.
	 * movq and leaq do not touch the flags, so the carry produced by
	 * the subq above is still valid at the jae below.
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	addl $0x20,	%edx	/* undo the bias: edx = 0..31 bytes left */
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx	/* same -0x20 bias as the forward loop */
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	/* As above: movq/leaq keep the carry from subq alive for the jae. */
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 * After undoing the bias edx holds the 0..31 bytes not yet copied;
	 * they sit at the start of the buffers, so step both pointers
	 * back by that amount.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	/* Here edx = bytes left to copy, always below 32. */
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 * The first 16 and the last 16 bytes are copied; the two halves
	 * may overlap in the middle.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 * First 8 and last 8 bytes, possibly overlapping.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_4bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 * First 4 and last 4 bytes, possibly overlapping.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_4bytes:
	/* 0..3 bytes left; edx becomes count - 1. */
	subl $1, %edx
	jb .Lend		/* borrow: count was 0, nothing to copy */
	/*
	 * Move data from 1 byte to 3 bytes.
	 * movzbl leaves the flags alone, so the jz still tests the result
	 * of the subl above (zero means exactly one byte).
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	/* 2 or 3 bytes: copy byte 1 and the last byte (index count - 1). */
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)
