/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
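
/* As a rough orientation, the dispatch below corresponds to the following C
   sketch.  This is illustrative only and not part of the build; copy_small,
   copy_medium, copy_large_forward and copy_large_backward are hypothetical
   helpers standing in for the labelled blocks below.

     void *memcpy_sketch (void *dstin, const void *src, size_t count)
     {
       if (count <= 32)
	 copy_small (dstin, src, count);		// L(copy16), L(copy8), L(copy4)
       else if (count <= 128)
	 copy_medium (dstin, src, count);		// L(copy32_128), L(copy128)
       else if ((uintptr_t) dstin - (uintptr_t) src >= count)
	 copy_large_forward (dstin, src, count);	// L(copy_long)
       else
	 copy_large_backward (dstin, src, count);	// L(copy_long_backwards)
       return dstin;
     }
*/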

ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret
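
	/* The 0..3 byte case above writes the first byte, the byte at offset
	   count/2 and the last byte; together these cover every length from
	   1 to 3 without a loop or extra branches, and all loads are issued
	   before any store so overlapping buffers are handled.  A rough C
	   equivalent, treating src and dstin as unsigned char pointers
	   (illustrative only, not part of the build):

	     if (count != 0)
	       {
		 size_t mid = count >> 1;
		 unsigned char a = src[0], b = src[mid], c = src[count - 1];
		 dstin[0] = a;
		 dstin[mid] = b;
		 dstin[count - 1] = c;
	       }
	*/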

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret
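
	/* For 33..64 bytes the four registers loaded above already hold the
	   whole buffer: 32 bytes from the start and 32 bytes from the end,
	   and the two stores simply overlap in the middle when count < 64.
	   Since all loads precede the stores, this is also overlap-safe.
	   Roughly, in C (illustrative only, not part of the build):

	     unsigned char head[32], tail[32];
	     memcpy (head, src, 32);			// A_q, B_q
	     memcpy (tail, srcend - 32, 32);		// C_q, D_q
	     memcpy (dstin, head, 32);
	     memcpy (dstend - 32, tail, 32);
	*/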

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)
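
	/* A single unsigned compare decides the direction: (dstin - src)
	   wraps for dstin < src, so it is below count exactly when dstin
	   lies inside [src, src + count), i.e. when a forward copy would
	   overwrite source bytes before reading them.  In C, roughly
	   (illustrative only, not part of the build):

	     if ((uintptr_t) dstin - (uintptr_t) src < count)
	       goto copy_long_backwards;	// copy from the end instead
	*/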

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
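
	/* The prologue above copies the first 16 bytes unaligned, then rounds
	   src down to a 16-byte boundary and moves dst and count by the same
	   offset so the loop sees only aligned loads; any bytes re-copied
	   because of the rounding were already written by the initial
	   16-byte store.  Rough bookkeeping in C (illustrative only, not
	   part of the build):

	     uintptr_t skew = (uintptr_t) src & 15;		// 0..15
	     src = (const unsigned char *) src - skew;		// now 16-byte aligned
	     dst = (unsigned char *) dstin - skew;
	     count += skew;		// 16 too large; adjusted by the subs above
	*/
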
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret
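
	/* Because count > 128 on entry to L(copy_long), the 64 bytes held in
	   A_q..D_q plus the last 64 bytes of the buffer always cover
	   whatever the loop left over, so the tail needs no variable-length
	   copy; any bytes written twice receive the same values.  Roughly,
	   in C (illustrative only, not part of the build):

	     unsigned char last64[64];
	     memcpy (last64, srcend - 64, 64);	// E_q, F_q and reloaded A_q, B_q
	     memcpy (dstend - 64, last64, 64);
	*/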

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	cbz	tmp1, L(copy0)	/* Return if src == dstin (copy is a no-op).  */
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
	ret

END (__memcpy_aarch64_simd)