/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Normally compiler builtins are used, but sometimes the compiler calls
 * out-of-line code. Based on asm-i386/string.h.
 *
 * This assembly file is rewritten from the memmove_64.c file.
 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 *
 * Origin: /kernel/linux/linux-6.6/arch/x86/lib/memmove_64.S
 *	   (revision 62306a36)
 */
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

/*
 * Drop any preprocessor definition of memmove so that the plain name can be
 * used as a symbol further down in this file.
 * NOTE(review): presumably a header may define memmove as a macro; confirm.
 */
#undef memmove

/*
 * Everything below is emitted into .noinstr.text, flagged allocatable ("a")
 * and executable ("x").
 */
.section .noinstr.text, "ax"

/*
 * __memmove - copy count bytes from src to dest; the regions may overlap.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count (bytes)
 *
 * Output:
 * rax: dest
 *
 * Registers written: rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11 and the
 * flags. The direction flag is set only around the backward "rep movsq"
 * and is cleared again before returning.
 *
 * Strategy:
 *  - dest <= src, or src + count <= dest: copy forward.
 *  - otherwise (src < dest < src + count): copy backward.
 *  - fewer than 32 bytes are handled at label 1 with loads of the head and
 *    the tail of the region that all complete before the first store, so
 *    that path is overlap-safe in both directions.
 */
SYM_FUNC_START(__memmove)

	mov %rdi, %rax			/* return value is the original dest */

	/*
	 * Decide forward/backward copy mode.
	 *
	 * Addresses are unsigned quantities, so the unsigned conditions
	 * (jae/ja) are used. For buffers that do not wrap around the address
	 * space this picks the same path as a signed comparison would.
	 */
	cmp %rdi, %rsi
	jae .Lmemmove_begin_forward	/* src >= dest: forward is safe */
	mov %rsi, %r8
	add %rdx, %r8			/* r8 = src + count */
	cmp %rdi, %r8
	ja 2f				/* src < dest < src + count: backward */

/*
 * CHECK_LEN:     go to the small-copy code at label 1 when count < 32.
 * MEMMOVE_BYTES: copy rdx bytes with "rep movsb" and return.
 */
#define CHECK_LEN	cmp $0x20, %rdx; jb 1f
#define MEMMOVE_BYTES	movq %rdx, %rcx; rep movsb; RET
.Lmemmove_begin_forward:
	/*
	 * Three variants of the forward entry, selected through ALTERNATIVE_2:
	 *  - default:          length check only, then fall through to the
	 *                      register/movsq code below;
	 *  - X86_FEATURE_ERMS: length check, then "rep movsb" for >= 32 bytes;
	 *  - X86_FEATURE_FSRM: "rep movsb" for every length, no check.
	 */
	ALTERNATIVE_2 __stringify(CHECK_LEN), \
		      __stringify(CHECK_LEN; MEMMOVE_BYTES), X86_FEATURE_ERMS, \
		      __stringify(MEMMOVE_BYTES), X86_FEATURE_FSRM

	/*
	 * The movsq instruction has a high startup latency, so copies
	 * below 680 bytes are done with general purpose registers.
	 */
	cmp $680, %rdx
	jb 3f
	/*
	 * movsq is only worthwhile for the aligned case: use it when the
	 * low 8 bits of src and dest are equal.
	 */

	cmpb %dil, %sil
	je 4f
3:
	/*
	 * Bias the counter by -32: inside the loop the borrow of the
	 * "sub" then tells whether another full 32-byte chunk remains.
	 */
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop.
	 * The movq/leaq instructions do not touch the flags, so the "jae"
	 * at the bottom still tests the carry produced by the "sub".
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae 5b
	addq $0x20, %rdx		/* undo the bias: rdx = bytes left (< 32) */
	jmp 1f
	/*
	 * Handle data forward by movsq.
	 *
	 * The last 8 source bytes are loaded before the copy starts and are
	 * stored afterwards; this covers the count % 8 bytes that the
	 * quadword copy does not reach.
	 */
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11	/* r11 = last 8 bytes of src */
	lea -8(%rdi, %rdx), %r10	/* r10 = address of last 8 bytes of dest */
	shrq $3, %rcx			/* rcx = count / 8 quadwords */
	rep movsq
	movq %r11, (%r10)
	jmp 13f
.Lmemmove_end_forward:

	/*
	 * Handle data backward by movsq.
	 *
	 * Mirror image of the forward case: the first 8 source bytes are
	 * saved up front and stored last, rsi/rdi start at the final
	 * quadword and the copy runs with the direction flag set.
	 */
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11		/* r11 = first 8 bytes of src */
	movq %rdi, %r10			/* r10 = dest */
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx			/* rcx = count / 8 quadwords */
	std
	rep movsq
	cld				/* restore the direction flag */
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Start to prepare for backward copy.
	 * Same size/alignment policy as the forward path.
	 */
	.p2align 4
2:
	cmp $0x20, %rdx
	jb 1f
	cmp $680, %rdx
	jb 6f
	cmpb %dil, %sil
	je 7b
6:
	/*
	 * Calculate copy position to tail: rsi/rdi point one past the end
	 * of their regions, and the counter is biased by -32 as in the
	 * forward loop.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop.
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae 8b
	/*
	 * Calculate copy position to head: undo the bias so that rdx is the
	 * number of bytes left (< 32), then move rsi/rdi back to the start
	 * of that remainder.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
1:
	/*
	 * Fewer than 32 bytes remain; rsi/rdi point at the start of them.
	 * Each case below loads the head and the tail of the remainder
	 * (the two may overlap) before storing either.
	 */
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move data from 2 bytes to 3 bytes.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f				/* nothing left to copy */
	/*
	 * Move data for 1 byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	RET
SYM_FUNC_END(__memmove)
EXPORT_SYMBOL(__memmove)

/*
 * Provide memmove as an alias of __memmove and export it as well.
 * NOTE(review): the exact linkage that SYM_FUNC_ALIAS_MEMFUNC gives the alias
 * is defined outside this file; confirm before relying on it.
 */
SYM_FUNC_ALIAS_MEMFUNC(memmove, __memmove)
EXPORT_SYMBOL(memmove)