xref: /kernel/linux/linux-5.10/arch/x86/lib/memcpy_64.S (revision 8c2ecf20)
18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-only */
28c2ecf20Sopenharmony_ci/* Copyright 2002 Andi Kleen */
38c2ecf20Sopenharmony_ci
48c2ecf20Sopenharmony_ci#include <linux/linkage.h>
58c2ecf20Sopenharmony_ci#include <asm/errno.h>
68c2ecf20Sopenharmony_ci#include <asm/cpufeatures.h>
78c2ecf20Sopenharmony_ci#include <asm/alternative.h>
88c2ecf20Sopenharmony_ci#include <asm/export.h>
98c2ecf20Sopenharmony_ci
108c2ecf20Sopenharmony_ci.pushsection .noinstr.text, "ax"
118c2ecf20Sopenharmony_ci
/*
 * By default memcpy starts with a jump to memcpy_orig. On the majority
 * of x86 CPUs, which set REP_GOOD, that jump is NOPped out so that the
 * REP MOVSQ/MOVSB code following it runs instead. On CPUs that also have
 * the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are in turn
 * replaced by a jmp to memcpy_erms, which does the copy with a single
 * REP MOVSB.
 */
188c2ecf20Sopenharmony_ci
/*
 * memcpy, __memcpy - copy a block of memory.
 *
 * memcpy is emitted as a weak symbol and __memcpy as an alias of it;
 * both names are exported.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count (bytes)
 *
 * Output:
 *  rax original destination
 *
 * The first instruction is a patch site: "jmp memcpy_orig" by default,
 * nothing for X86_FEATURE_REP_GOOD (execution falls into the code
 * below), "jmp memcpy_erms" for X86_FEATURE_ERMS.
 *
 * The code below copies count / 8 quadwords with REP MOVSQ and the
 * remaining count % 8 bytes with REP MOVSB. It modifies rcx, rdx, rsi
 * and rdi.
 * NOTE(review): the REP MOVS instructions copy upwards only if DF is
 * clear on entry; nothing in this file sets it - confirm with callers.
 */
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_WEAK(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq	%rdx, %rcx		/* rcx = count */
	movq	%rdi, %rax		/* return value: original destination */
	andl	$7, %edx		/* rdx = count % 8, upper half zeroed */
	shrq	$3, %rcx		/* rcx = count / 8 quadwords */
	rep movsq
	movl	%edx, %ecx		/* rcx = trailing bytes (0..7) */
	rep movsb
	RET
SYM_FUNC_END(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)
478c2ecf20Sopenharmony_ci
/*
 * memcpy_erms() - memcpy variant built on enhanced fast strings.
 *
 * Reached through the X86_FEATURE_ERMS alternative at the top of memcpy.
 * Takes the same arguments as memcpy (rdi destination, rsi source,
 * rdx count), returns the original destination in rax and moves the
 * whole block with one REP MOVSB. Simpler than memcpy, and faster where
 * ERMS is available, so it is the preferred path on such CPUs.
 */
SYM_FUNC_START_LOCAL(memcpy_erms)
	movq	%rdx, %rcx		/* rcx = number of bytes for REP MOVSB */
	movq	%rdi, %rax		/* return value: original destination */
	rep movsb
	RET
SYM_FUNC_END(memcpy_erms)
588c2ecf20Sopenharmony_ci
/*
 * memcpy_orig - copy a memory block with plain loads and stores.
 *
 * Jump target of the default (unpatched) alternative at the top of memcpy.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * Modifies rcx, rdx, rsi, rdi, r8-r11 and the flags.
 *
 * Counts of 32 and above go through a loop that moves 32 bytes per pass,
 * walking either up from the start or down from the end of the buffers.
 * Whatever is left (0..31 bytes) is finished at .Ltail with loads and
 * stores anchored at both ends of the remainder; they may overlap in the
 * middle, and all loads of a case are issued before its stores.
 */
SYM_FUNC_START_LOCAL(memcpy_orig)
	movq	%rdi, %rax

	cmpq	$32, %rdx
	jb	.Ltail

	/*
	 * Choose the copy direction from the low address bytes, to steer
	 * clear of a possible memory false dependence: a source low byte
	 * that is (signed) below the destination low byte selects the
	 * descending loop.
	 */
	cmpb	%dil, %sil
	jl	.Lbackward

	/*
	 * From here rdx runs 32 below the number of bytes not yet copied,
	 * so the subq at the loop head borrows on the last full pass.
	 */
	subq	$32, %rdx
.Lforward_loop:
	subq	$32, %rdx

	/*
	 * One pass moves 4 x 8 bytes. movq and leaq leave the flags alone,
	 * so the jae below still tests the carry of the subq above.
	 */
	movq	(%rsi), %r8
	movq	8(%rsi), %r9
	movq	16(%rsi), %r10
	movq	24(%rsi), %r11
	leaq	32(%rsi), %rsi

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	leaq	32(%rdi), %rdi
	jae	.Lforward_loop

	/* Undo the bias: edx = bytes still to copy (0..31). */
	addl	$32, %edx
	jmp	.Ltail

.Lbackward:
	/*
	 * Point both cursors just past the end of their buffers and apply
	 * the same -32 bias to the count as on the ascending path.
	 */
	addq	%rdx, %rsi
	addq	%rdx, %rdi
	subq	$32, %rdx
	/*
	 * Keep the loop head 16-byte aligned. Rationale carried over with
	 * this code: at most 3 ALU operations issue in one cycle, so NOPs
	 * are appended within the same 16-byte chunk.
	 */
	.p2align 4
.Lbackward_loop:
	subq	$32, %rdx
	movq	-8(%rsi), %r8
	movq	-16(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-32(%rsi), %r11
	leaq	-32(%rsi), %rsi
	movq	%r8, -8(%rdi)
	movq	%r9, -16(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -32(%rdi)
	leaq	-32(%rdi), %rdi
	jae	.Lbackward_loop

	/*
	 * Undo the bias, then step both cursors back to the head of the
	 * buffers, where the edx bytes that are still uncopied live.
	 */
	addl	$32, %edx
	subq	%rdx, %rsi
	subq	%rdx, %rdi
.Ltail:
	cmpl	$16, %edx
	jb	.Ltail_lt16

	/*
	 * 16..31 bytes left: first 16 plus last 16.
	 */
	movq	(%rsi), %r8
	movq	8(%rsi), %r9
	movq	-16(%rsi, %rdx), %r10
	movq	-8(%rsi, %rdx), %r11
	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, -16(%rdi, %rdx)
	movq	%r11, -8(%rdi, %rdx)
	RET
	.p2align 4
.Ltail_lt16:
	cmpl	$8, %edx
	jb	.Ltail_lt8
	/*
	 * 8..15 bytes left: first 8 plus last 8.
	 */
	movq	(%rsi), %r8
	movq	-8(%rsi, %rdx), %r9
	movq	%r8, (%rdi)
	movq	%r9, -8(%rdi, %rdx)
	RET
	.p2align 4
.Ltail_lt8:
	cmpl	$4, %edx
	jb	.Ltail_lt4

	/*
	 * 4..7 bytes left: first 4 plus last 4.
	 */
	movl	(%rsi), %ecx
	movl	-4(%rsi, %rdx), %r8d
	movl	%ecx, (%rdi)
	movl	%r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Ltail_lt4:
	/*
	 * 0..3 bytes left. The subl borrows when nothing is left and sets
	 * ZF when exactly one byte is; afterwards edx is the offset of the
	 * last byte.
	 */
	subl	$1, %edx
	jb	.Ldone
	/*
	 * 1..3 bytes left. The load below does not touch the flags, so the
	 * jz still sees the ZF produced by the subl.
	 */
	movzbl	(%rsi), %ecx
	jz	.Lstore_first_byte
	movzbq	1(%rsi), %r8
	movzbq	(%rsi, %rdx), %r9
	movb	%r8b, 1(%rdi)
	movb	%r9b, (%rdi, %rdx)
.Lstore_first_byte:
	movb	%cl, (%rdi)

.Ldone:
	RET
SYM_FUNC_END(memcpy_orig)
1858c2ecf20Sopenharmony_ci
1868c2ecf20Sopenharmony_ci.popsection
187