/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
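
/* Illustrative C-level sketch of the structure described above (not part of
   the build; the helper names are hypothetical):

     void *copy (void *dst, const void *src, size_t count)
     {
       if (count <= 32)
	 copy_0_32 (dst, src, count);
       else if (count <= 128)
	 copy_33_128 (dst, src, count);
       else if ((uintptr_t) dst - (uintptr_t) src < count)
	 copy_large_backwards (dst, src, count);  /* dst overlaps src from above  */
       else
	 copy_large_forwards (dst, src, count);
       return dst;
     }
*/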

ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
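	/* Copy 16..32 bytes: store the first and last 16 bytes; the two
	   blocks overlap when count < 32, which is harmless.  */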
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
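	/* count is 1..3: copy the first byte, the byte at count/2 and the
	   last byte.  For count 1 or 2 some of these positions coincide,
	   which is harmless; for count 3 they cover all three bytes.  */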
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
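	/* Copy 33..64 bytes: store the first and last 32 bytes loaded above;
	   the two halves overlap when count < 64.  */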
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
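	/* Copy 97..128 bytes: the extra pairs loaded from srcend-64 and
	   srcend-48 cover the middle that the head and tail stores below
	   would otherwise miss.  */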
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)
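	/* tmp1 = dstin - src; the unsigned comparison tmp1 < count holds
	   only when dstin lies inside the source buffer, i.e. a forward copy
	   would overwrite source bytes before they have been read.  In all
	   other cases (including dstin < src) the forward copy is safe.  */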

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
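	/* src has been moved down by the same amount as dst, so [src + o]
	   still corresponds to [dst + o].  The loop starts storing at
	   dst + 16, which is never below dstin; any overlap with the
	   initial 16-byte store is harmless.  */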
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
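	/* Software pipelined loop: each iteration stores the 64 bytes loaded
	   by the previous one and issues the loads for the next, so loads
	   run well ahead of the stores that depend on them.  */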

L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

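	/* At this point at most 64 bytes beyond the data already held in
	   A_l..D_h remain; they are covered by copying the last 64 bytes
	   from srcend, which may rewrite bytes the loop has already stored.  */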
	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)
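	/* Mirror of the forward loop: copy 64 bytes per iteration working
	   down from dstend, with the tail handled by copying the first 64
	   bytes of the buffer at the end.  */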

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)