/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Normally compiler builtins are used, but sometimes the compiler calls out
 * of line code. Based on asm-i386/string.h.
 *
 * This assembly file is re-written from memmove_64.c file.
 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
98c2ecf20Sopenharmony_ci#include <linux/linkage.h>
108c2ecf20Sopenharmony_ci#include <asm/cpufeatures.h>
118c2ecf20Sopenharmony_ci#include <asm/alternative.h>
128c2ecf20Sopenharmony_ci#include <asm/export.h>
138c2ecf20Sopenharmony_ci
#undef memmove

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 *
 * Clobbers: rcx, rdx, rsi, rdi, r8-r11 and the arithmetic flags.
 * The direction flag is set only around the backward "rep movsq" and is
 * cleared again before returning.
 */
SYM_FUNC_START_WEAK(memmove)
SYM_FUNC_START(__memmove)

	/* The return value is the original dest pointer. */
	mov %rdi, %rax

	/*
	 * Decide forward/backward copy mode:
	 * copy forward when src >= dest, or when src + count <= dest (the
	 * regions do not overlap in the harmful direction); otherwise copy
	 * backward starting at label 2.
	 */
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	mov %rsi, %r8
	add %rdx, %r8
	cmp %rdi, %r8
	jg 2f

.Lmemmove_begin_forward:
	/*
	 * FSRM: "rep movsb" is used for every length, so go straight to it
	 * without a length check.
	 *
	 * This must not rely on ERMS being set as well. If the length check
	 * were dropped for FSRM while the jump stayed conditional on ERMS, a
	 * CPU reporting FSRM without ERMS would reach the 32-byte loop below
	 * with count < 32, the count would wrap around and the loop would
	 * run far past the end of the buffers.
	 * NOTE(review): FSRM without ERMS presumably happens when firmware or
	 * a hypervisor disables fast-string operation - confirm against the
	 * CPU feature setup code.
	 */
	ALTERNATIVE "", "jmp .Lmemmove_erms", X86_FEATURE_FSRM

	/* Fewer than 32 bytes: handled by the small-size tail code at 1. */
	cmp $0x20, %rdx
	jb 1f

	/* ERMS without FSRM: "rep movsb" for 32 bytes and more. */
	ALTERNATIVE "", "jmp .Lmemmove_erms", X86_FEATURE_ERMS

	/*
	 * The movsq instruction has a high startup latency,
	 * so smaller sizes are handled with general registers.
	 */
	cmp  $680, %rdx
	jb	3f
	/*
	 * The movsq instruction is only good for the aligned case:
	 * use it only when the low bytes of src and dest are equal.
	 */

	cmpb %dil, %sil
	je 4f
3:
	/*
	 * Bias the count by -32 so that the "sub" at the top of the loop
	 * borrows (sets CF) exactly when fewer than 32 bytes would remain
	 * after the current iteration.
	 */
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop.
	 * All four loads are done before the first store. Only mov and lea
	 * follow the "sub", so CF is still valid at the "jae".
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae 5b
	/* Undo the bias: rdx = remaining bytes (0..31). */
	addq $0x20, %rdx
	jmp 1f
	/*
	 * Handle data forward by movsq.
	 * The last 8 source bytes are loaded up front and stored last, which
	 * covers a count that is not a multiple of 8.
	 */
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Handle data backward by movsq.
	 * The first 8 source bytes are loaded up front and stored last; the
	 * qword copy runs from the last qword down with the direction flag
	 * set, and the flag is cleared again right after the copy.
	 */
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Start to prepare for backward copy.
	 * Same size/alignment dispatch as the forward case: tail code below
	 * 32 bytes, register loop below 680 bytes or when the low bytes of
	 * src and dest differ, movsq otherwise.
	 */
	.p2align 4
2:
	cmp $0x20, %rdx
	jb 1f
	cmp $680, %rdx
	jb 6f
	cmpb %dil, %sil
	je 7b
6:
	/*
	 * Calculate copy position to tail.
	 * The count is biased by -32 exactly as in the forward loop.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop.
	 * Only mov and lea follow the "subq", so CF is still valid at the
	 * "jae".
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae 8b
	/*
	 * Calculate copy position to head:
	 * undo the bias and step both pointers back to the start of the
	 * remaining (0..31 byte) head region.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
	/*
	 * Tail code for fewer than 32 bytes. Each size class loads the head
	 * and tail pieces (which may overlap each other) before storing
	 * anything, so it works for either copy direction.
	 */
1:
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move data from 2 bytes to 3 bytes.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Move data for 1 byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	RET

	/*
	 * Forward byte copy with "rep movsb"; reached only through the
	 * FSRM/ERMS alternatives above.
	 */
.Lmemmove_erms:
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memmove)
SYM_FUNC_END_ALIAS(memmove)
EXPORT_SYMBOL(__memmove)
EXPORT_SYMBOL(memmove)
218