/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

.pushsection .noinstr.text, "ax"

/*
 * By default we build a jump to memcpy_orig. On the majority of x86
 * CPUs, which set REP_GOOD, that jump gets NOPped out so execution
 * falls through to the REP MOVSQ body below. CPUs which additionally
 * have the Enhanced REP MOVSB/STOSB feature (ERMS) get those NOPs
 * patched into a jmp to memcpy_erms, which does the whole copy with a
 * single REP MOVSB.
 */
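/*
 * Illustrative sketch of the three patched outcomes (not assembled
 * code; the patching is done at boot by apply_alternatives()):
 *
 *	no REP_GOOD, no ERMS:	jmp memcpy_orig		# unrolled copy loop
 *	REP_GOOD:		<NOPs>			# fall through to REP MOVSQ
 *	ERMS:			jmp memcpy_erms		# single REP MOVSB
 */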

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
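/*
 * Equivalent C prototype (the register assignment above is simply the
 * System V AMD64 calling convention):
 *
 *	void *memcpy(void *dest, const void *src, size_t count);
 */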
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_WEAK(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax		/* return value: original destination */
	movq %rdx, %rcx
	shrq $3, %rcx		/* rcx = count / 8 (whole qwords) */
	andl $7, %edx		/* edx = count % 8 (tail bytes) */
	rep movsq		/* copy rcx qwords */
	movl %edx, %ecx
	rep movsb		/* copy the 0..7 leftover bytes */
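	/*
	 * Worked example: for count = 29, REP MOVSQ copies
	 * 29 >> 3 = 3 qwords (24 bytes) and REP MOVSB copies the
	 * 29 & 7 = 5 remaining bytes.
	 */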
	RET
SYM_FUNC_END(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
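/*
 * Background note: ERMS (CPUID.(EAX=7,ECX=0):EBX bit 9) advertises that
 * the microcoded REP MOVSB/STOSB handles short and unaligned copies
 * efficiently, so no manual qword/tail split is needed here.
 */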
SYM_FUNC_START_LOCAL(memcpy_erms)
	movq %rdi, %rax		/* return value: original destination */
	movq %rdx, %rcx		/* rcx = full byte count */
	rep movsb
	RET
SYM_FUNC_END(memcpy_erms)

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx	/* copies under 32 bytes need only tail code */
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur, then
	 * jump to the corresponding copy mode.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
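	/*
	 * Note: only the low 8 bits of the two pointers are compared
	 * (%sil vs. %dil), and with a signed compare at that. This is a
	 * cheap heuristic for "source lies below destination", aimed at
	 * store-to-load false dependences from aliasing on the low
	 * address bits. A wrong guess only costs performance, never
	 * correctness: memcpy requires non-overlapping buffers and both
	 * loops copy the full range.
	 */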
	subq $0x20, %rdx
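	/*
	 * The count is pre-decremented by one 32-byte block so that the
	 * SUBQ at the top of the loop tests whether a full block still
	 * remains after the one being copied. MOVQ and LEAQ leave the
	 * flags untouched, so the JAE at the bottom still sees that
	 * SUBQ's carry flag; the ADDL $0x20 after the loop undoes the
	 * bias before the tail is handled.
	 */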
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
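	/*
	 * Same bias-and-flags trick as the forward loop; the pointers
	 * were advanced past the end above so the copy walks backward
	 * from the tail.
	 */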
	/*
	 * At most 3 ALU operations issue in one cycle, so append NOPs
	 * within the same 16-byte chunk to align the loop entry.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Copy 16 to 31 bytes: two 16-byte moves, one anchored at the
	 * head and one at the tail, which may overlap in the middle.
	 */
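	/*
	 * Worked example: for edx = 20, the head pair writes bytes
	 * 0..15 and the tail pair writes bytes 4..19; bytes 4..15 are
	 * simply stored twice with identical data, which is cheaper
	 * than branching on the exact length.
	 */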
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Copy 8 to 15 bytes: the same overlapping head/tail trick with
	 * a single qword pair.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Copy 4 to 7 bytes: overlapping head/tail dword pair.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
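	/*
	 * The SUBL above sets CF when the count was 0 (JB taken, nothing
	 * to copy) and ZF when it was 1. MOVZBL does not modify the
	 * flags, so the JZ below still tests that SUBL result.
	 */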
	/*
	 * Copy 1 to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)

.popsection