1bbbf1280Sopenharmony_ci/*
2bbbf1280Sopenharmony_ci * memcpy - copy memory area
3bbbf1280Sopenharmony_ci *
4bbbf1280Sopenharmony_ci * Copyright (c) 2019-2020, Arm Limited.
5bbbf1280Sopenharmony_ci * SPDX-License-Identifier: MIT
6bbbf1280Sopenharmony_ci */
7bbbf1280Sopenharmony_ci
8bbbf1280Sopenharmony_ci/* Assumptions:
9bbbf1280Sopenharmony_ci *
10bbbf1280Sopenharmony_ci * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
11bbbf1280Sopenharmony_ci *
12bbbf1280Sopenharmony_ci */
13bbbf1280Sopenharmony_ci
14bbbf1280Sopenharmony_ci#include "../asmdefs.h"
15bbbf1280Sopenharmony_ci
/* Register aliases used by the copy routine below.  */

/* Arguments as received on entry.  The routine below never writes dstin.
   It rounds src down on the forward long path and reuses count as the
   remaining-bytes counter.  */
16bbbf1280Sopenharmony_ci#define dstin	x0
17bbbf1280Sopenharmony_ci#define src	x1
18bbbf1280Sopenharmony_ci#define count	x2
/* dst is the destination cursor of the forward long loop.  srcend and
   dstend are set to src + count and dstin + count at entry.  */
19bbbf1280Sopenharmony_ci#define dst	x3
20bbbf1280Sopenharmony_ci#define srcend	x4
21bbbf1280Sopenharmony_ci#define dstend	x5
/* General-purpose temporaries for copies shorter than 16 bytes.  The _l and
   _h names are 64-bit x registers, the _lw names are 32-bit w views.
   B_l and B_h are defined but not referenced by the routine below.  */
22bbbf1280Sopenharmony_ci#define A_l	x6
23bbbf1280Sopenharmony_ci#define A_lw	w6
24bbbf1280Sopenharmony_ci#define A_h	x7
25bbbf1280Sopenharmony_ci#define B_l	x8
26bbbf1280Sopenharmony_ci#define B_lw	w8
27bbbf1280Sopenharmony_ci#define B_h	x9
28bbbf1280Sopenharmony_ci#define C_lw	w10
/* Scratch: holds count / 2, the overlap test value dstin - src, and the
   misalignment of src or srcend at different points of the routine.  */
29bbbf1280Sopenharmony_ci#define tmp1	x14
30bbbf1280Sopenharmony_ci
/* SIMD temporaries, each a 16-byte q register.  */
31bbbf1280Sopenharmony_ci#define A_q	q0
32bbbf1280Sopenharmony_ci#define B_q	q1
33bbbf1280Sopenharmony_ci#define C_q	q2
34bbbf1280Sopenharmony_ci#define D_q	q3
35bbbf1280Sopenharmony_ci#define E_q	q4
36bbbf1280Sopenharmony_ci#define F_q	q5
37bbbf1280Sopenharmony_ci#define G_q	q6
38bbbf1280Sopenharmony_ci#define H_q	q7
39bbbf1280Sopenharmony_ci
40bbbf1280Sopenharmony_ci/* This implementation handles overlaps and supports both memcpy and memmove
41bbbf1280Sopenharmony_ci   from a single entry point.  It uses unaligned accesses and branchless
42bbbf1280Sopenharmony_ci   sequences to keep the code small, simple and improve performance.
43bbbf1280Sopenharmony_ci
44bbbf1280Sopenharmony_ci   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
45bbbf1280Sopenharmony_ci   copies of up to 128 bytes, and large copies.  The overhead of the overlap
46bbbf1280Sopenharmony_ci   check is negligible since it is only required for large copies.
47bbbf1280Sopenharmony_ci
48bbbf1280Sopenharmony_ci   Large copies use a software pipelined loop processing 64 bytes per iteration.
49bbbf1280Sopenharmony_ci   The source pointer is 16-byte aligned to minimize unaligned accesses.
50bbbf1280Sopenharmony_ci   The loop tail is handled by always copying 64 bytes from the end.
51bbbf1280Sopenharmony_ci*/
52bbbf1280Sopenharmony_ci
/* __memcpy_aarch64_simd, also entered as __memmove_aarch64_simd.
 *
 * Inputs:   dstin (x0) destination, src (x1) source, count (x2) byte count.
 * Result:   x0 is never written below, so it still holds dstin at every ret.
 * Scratch:  x1-x8, w10, x14, q0-q7 and the condition flags are overwritten.
 *           No instruction references sp.
 * Overlap:  for count <= 128 every load is issued before the first store of
 *           that path.  For larger counts the copy direction is chosen in
 *           L(copy_long).
 *
 * NOTE(review): ENTRY_ALIAS, ENTRY, END, PTR_ARG and SIZE_ARG are macros from
 * asmdefs.h, which is not visible here.  Presumably the first three emit the
 * symbol, alignment and size directives and the last two normalise pointer
 * and size arguments on ILP32 builds - confirm against asmdefs.h.
 */
53bbbf1280Sopenharmony_ciENTRY_ALIAS (__memmove_aarch64_simd)
54bbbf1280Sopenharmony_ciENTRY (__memcpy_aarch64_simd)
55bbbf1280Sopenharmony_ci	PTR_ARG (0)
56bbbf1280Sopenharmony_ci	PTR_ARG (1)
57bbbf1280Sopenharmony_ci	SIZE_ARG (2)
58bbbf1280Sopenharmony_ci	add	srcend, src, count	/* One past the last source byte.  */
59bbbf1280Sopenharmony_ci	add	dstend, dstin, count	/* One past the last destination byte.  */
60bbbf1280Sopenharmony_ci	cmp	count, 128
61bbbf1280Sopenharmony_ci	b.hi	L(copy_long)	/* count > 128.  */
62bbbf1280Sopenharmony_ci	cmp	count, 32
63bbbf1280Sopenharmony_ci	b.hi	L(copy32_128)	/* 33 <= count <= 128.  */
64bbbf1280Sopenharmony_ci
65bbbf1280Sopenharmony_ci	/* Small copies: 0..32 bytes.
	   For 16..32 bytes copy the first and the last 16 bytes.  The two
	   chunks overlap each other when count < 32.  */
66bbbf1280Sopenharmony_ci	cmp	count, 16
67bbbf1280Sopenharmony_ci	b.lo	L(copy16)	/* count < 16.  */
68bbbf1280Sopenharmony_ci	ldr	A_q, [src]
69bbbf1280Sopenharmony_ci	ldr	B_q, [srcend, -16]
70bbbf1280Sopenharmony_ci	str	A_q, [dstin]
71bbbf1280Sopenharmony_ci	str	B_q, [dstend, -16]
72bbbf1280Sopenharmony_ci	ret
73bbbf1280Sopenharmony_ci
74bbbf1280Sopenharmony_ci	/* Copy 8-15 bytes.  count < 16 here, so bit 3 set means 8..15:
	   copy the first and the last 8 bytes.  */
75bbbf1280Sopenharmony_ciL(copy16):
76bbbf1280Sopenharmony_ci	tbz	count, 3, L(copy8)	/* Bit 3 clear: count is 0..7.  */
77bbbf1280Sopenharmony_ci	ldr	A_l, [src]
78bbbf1280Sopenharmony_ci	ldr	A_h, [srcend, -8]
79bbbf1280Sopenharmony_ci	str	A_l, [dstin]
80bbbf1280Sopenharmony_ci	str	A_h, [dstend, -8]
81bbbf1280Sopenharmony_ci	ret
82bbbf1280Sopenharmony_ci
83bbbf1280Sopenharmony_ci	.p2align 3
84bbbf1280Sopenharmony_ci	/* Copy 4-7 bytes.  count < 8 here, so bit 2 set means 4..7:
	   copy the first and the last 4 bytes.  */
85bbbf1280Sopenharmony_ciL(copy8):
86bbbf1280Sopenharmony_ci	tbz	count, 2, L(copy4)	/* Bit 2 clear: count is 0..3.  */
87bbbf1280Sopenharmony_ci	ldr	A_lw, [src]
88bbbf1280Sopenharmony_ci	ldr	B_lw, [srcend, -4]
89bbbf1280Sopenharmony_ci	str	A_lw, [dstin]
90bbbf1280Sopenharmony_ci	str	B_lw, [dstend, -4]
91bbbf1280Sopenharmony_ci	ret
92bbbf1280Sopenharmony_ci
93bbbf1280Sopenharmony_ci	/* Copy 0..3 bytes using a branchless sequence.
	   After the zero check the bytes at offsets 0, count / 2 and
	   count - 1 are copied.  For count 1 all three are offset 0, for
	   count 2 they are 0, 1, 1 and for count 3 they are 0, 1, 2.  */
94bbbf1280Sopenharmony_ciL(copy4):
95bbbf1280Sopenharmony_ci	cbz	count, L(copy0)
96bbbf1280Sopenharmony_ci	lsr	tmp1, count, 1	/* tmp1 = count / 2.  */
97bbbf1280Sopenharmony_ci	ldrb	A_lw, [src]
98bbbf1280Sopenharmony_ci	ldrb	C_lw, [srcend, -1]
99bbbf1280Sopenharmony_ci	ldrb	B_lw, [src, tmp1]
100bbbf1280Sopenharmony_ci	strb	A_lw, [dstin]
101bbbf1280Sopenharmony_ci	strb	B_lw, [dstin, tmp1]
102bbbf1280Sopenharmony_ci	strb	C_lw, [dstend, -1]
	/* Shared return, also reached from L(copy_long_backwards) when the
	   source and destination addresses are equal.  */
103bbbf1280Sopenharmony_ciL(copy0):
104bbbf1280Sopenharmony_ci	ret
105bbbf1280Sopenharmony_ci
106bbbf1280Sopenharmony_ci	.p2align 4
107bbbf1280Sopenharmony_ci	/* Medium copies: 33..128 bytes.
	   The first and the last 32 bytes are loaded up front.  They cover
	   the whole range when count <= 64.  */
108bbbf1280Sopenharmony_ciL(copy32_128):
109bbbf1280Sopenharmony_ci	ldp	A_q, B_q, [src]
110bbbf1280Sopenharmony_ci	ldp	C_q, D_q, [srcend, -32]
111bbbf1280Sopenharmony_ci	cmp	count, 64
112bbbf1280Sopenharmony_ci	b.hi	L(copy128)	/* 65 <= count <= 128.  */
113bbbf1280Sopenharmony_ci	stp	A_q, B_q, [dstin]
114bbbf1280Sopenharmony_ci	stp	C_q, D_q, [dstend, -32]
115bbbf1280Sopenharmony_ci	ret
116bbbf1280Sopenharmony_ci
117bbbf1280Sopenharmony_ci	.p2align 4
118bbbf1280Sopenharmony_ci	/* Copy 65..128 bytes.  A_q..D_q already hold the first and the
	   last 32 bytes.  */
119bbbf1280Sopenharmony_ciL(copy128):
120bbbf1280Sopenharmony_ci	ldp	E_q, F_q, [src, 32]	/* Source bytes 32..63.  */
121bbbf1280Sopenharmony_ci	cmp	count, 96
122bbbf1280Sopenharmony_ci	b.ls	L(copy96)	/* count <= 96: three 32-byte pairs suffice.  */
123bbbf1280Sopenharmony_ci	ldp	G_q, H_q, [srcend, -64]	/* 97..128: a fourth pair from the end.  */
124bbbf1280Sopenharmony_ci	stp	G_q, H_q, [dstend, -64]	/* First store, after all loads.  */
125bbbf1280Sopenharmony_ciL(copy96):
126bbbf1280Sopenharmony_ci	stp	A_q, B_q, [dstin]
127bbbf1280Sopenharmony_ci	stp	E_q, F_q, [dstin, 32]
128bbbf1280Sopenharmony_ci	stp	C_q, D_q, [dstend, -32]
129bbbf1280Sopenharmony_ci	ret
130bbbf1280Sopenharmony_ci
131bbbf1280Sopenharmony_ci	/* Copy more than 128 bytes.  */
132bbbf1280Sopenharmony_ciL(copy_long):
133bbbf1280Sopenharmony_ci	/* Use backwards copy if there is an overlap.
	   tmp1 = dstin - src.  The unsigned compare is true only when dstin
	   lies in [src, src + count), where a forward copy could overwrite
	   source bytes before they are read.  */
134bbbf1280Sopenharmony_ci	sub	tmp1, dstin, src
135bbbf1280Sopenharmony_ci	cmp	tmp1, count
136bbbf1280Sopenharmony_ci	b.lo	L(copy_long_backwards)
137bbbf1280Sopenharmony_ci
138bbbf1280Sopenharmony_ci	/* Copy 16 bytes and then align src to 16-byte alignment.
	   dst is moved back by the same amount as src, so from here on
	   dst + k is the destination of the byte at src + k.  */
139bbbf1280Sopenharmony_ci	ldr	D_q, [src]	/* First 16 bytes, src not yet aligned.  */
140bbbf1280Sopenharmony_ci	and	tmp1, src, 15	/* tmp1 = misalignment of src.  */
141bbbf1280Sopenharmony_ci	bic	src, src, 15	/* Round src down to 16 bytes.  */
142bbbf1280Sopenharmony_ci	sub	dst, dstin, tmp1
143bbbf1280Sopenharmony_ci	add	count, count, tmp1	/* Count is now 16 too large.  */
144bbbf1280Sopenharmony_ci	ldp	A_q, B_q, [src, 16]
145bbbf1280Sopenharmony_ci	str	D_q, [dstin]
146bbbf1280Sopenharmony_ci	ldp	C_q, D_q, [src, 48]
	/* Besides the 16-byte bias, 128 is removed: 64 bytes are already
	   held in A_q..D_q and the final 64 bytes are always copied by
	   L(copy64_from_end).  */
147bbbf1280Sopenharmony_ci	subs	count, count, 128 + 16	/* Test and readjust count.  */
148bbbf1280Sopenharmony_ci	b.ls	L(copy64_from_end)
	/* Each pass stores the 64 bytes held in A_q..D_q, which were loaded
	   from src + 16 onwards, and loads the next 64 bytes before both
	   pointers advance.  */
149bbbf1280Sopenharmony_ciL(loop64):
150bbbf1280Sopenharmony_ci	stp	A_q, B_q, [dst, 16]
151bbbf1280Sopenharmony_ci	ldp	A_q, B_q, [src, 80]
152bbbf1280Sopenharmony_ci	stp	C_q, D_q, [dst, 48]
153bbbf1280Sopenharmony_ci	ldp	C_q, D_q, [src, 112]
154bbbf1280Sopenharmony_ci	add	src, src, 64
155bbbf1280Sopenharmony_ci	add	dst, dst, 64
156bbbf1280Sopenharmony_ci	subs	count, count, 64
157bbbf1280Sopenharmony_ci	b.hi	L(loop64)	/* Repeat while the result is above zero.  */
158bbbf1280Sopenharmony_ci
159bbbf1280Sopenharmony_ci	/* Write the last iteration and copy 64 bytes from the end.
	   The final 64 bytes are addressed from srcend and dstend, so they
	   may overlap bytes that were already written above.  */
160bbbf1280Sopenharmony_ciL(copy64_from_end):
161bbbf1280Sopenharmony_ci	ldp	E_q, F_q, [srcend, -64]
162bbbf1280Sopenharmony_ci	stp	A_q, B_q, [dst, 16]
163bbbf1280Sopenharmony_ci	ldp	A_q, B_q, [srcend, -32]
164bbbf1280Sopenharmony_ci	stp	C_q, D_q, [dst, 48]
165bbbf1280Sopenharmony_ci	stp	E_q, F_q, [dstend, -64]
166bbbf1280Sopenharmony_ci	stp	A_q, B_q, [dstend, -32]
167bbbf1280Sopenharmony_ci	ret
168bbbf1280Sopenharmony_ci
169bbbf1280Sopenharmony_ci	/* Large backwards copy for overlapping copies.
170bbbf1280Sopenharmony_ci	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
171bbbf1280Sopenharmony_ciL(copy_long_backwards):
172bbbf1280Sopenharmony_ci	cbz	tmp1, L(copy0)	/* tmp1 is still dstin - src: same address, nothing to do.  */
173bbbf1280Sopenharmony_ci	ldr	D_q, [srcend, -16]	/* Last 16 bytes, srcend not yet aligned.  */
174bbbf1280Sopenharmony_ci	and	tmp1, srcend, 15	/* tmp1 = misalignment of srcend.  */
175bbbf1280Sopenharmony_ci	bic	srcend, srcend, 15	/* Round srcend down to 16 bytes.  */
176bbbf1280Sopenharmony_ci	sub	count, count, tmp1	/* Those tmp1 bytes are covered by the D_q store.  */
177bbbf1280Sopenharmony_ci	ldp	A_q, B_q, [srcend, -32]
178bbbf1280Sopenharmony_ci	str	D_q, [dstend, -16]	/* Uses dstend before it is adjusted.  */
179bbbf1280Sopenharmony_ci	ldp	C_q, D_q, [srcend, -64]
180bbbf1280Sopenharmony_ci	sub	dstend, dstend, tmp1	/* Keep dstend paired with the aligned srcend.  */
	/* 128 is removed: 64 bytes are already held in A_q..D_q and the first
	   64 bytes are always copied by L(copy64_from_start).  */
181bbbf1280Sopenharmony_ci	subs	count, count, 128
182bbbf1280Sopenharmony_ci	b.ls	L(copy64_from_start)
183bbbf1280Sopenharmony_ci
	/* Each pass stores the 64 bytes held in A_q..D_q below dstend and
	   loads the 64 bytes below them before both pointers step down.  */
184bbbf1280Sopenharmony_ciL(loop64_backwards):
185bbbf1280Sopenharmony_ci	str	B_q, [dstend, -16]
186bbbf1280Sopenharmony_ci	str	A_q, [dstend, -32]
187bbbf1280Sopenharmony_ci	ldp	A_q, B_q, [srcend, -96]
188bbbf1280Sopenharmony_ci	str	D_q, [dstend, -48]
189bbbf1280Sopenharmony_ci	str	C_q, [dstend, -64]!	/* Pre-index writeback: dstend is lowered by 64, then used.  */
190bbbf1280Sopenharmony_ci	ldp	C_q, D_q, [srcend, -128]
191bbbf1280Sopenharmony_ci	sub	srcend, srcend, 64
192bbbf1280Sopenharmony_ci	subs	count, count, 64
193bbbf1280Sopenharmony_ci	b.hi	L(loop64_backwards)	/* Repeat while the result is above zero.  */
194bbbf1280Sopenharmony_ci
195bbbf1280Sopenharmony_ci	/* Write the last iteration and copy 64 bytes from the start.
	   src and dstin are not modified on the backwards path, so they
	   still address the first byte of each buffer.  */
196bbbf1280Sopenharmony_ciL(copy64_from_start):
197bbbf1280Sopenharmony_ci	ldp	E_q, F_q, [src, 32]
198bbbf1280Sopenharmony_ci	stp	A_q, B_q, [dstend, -32]
199bbbf1280Sopenharmony_ci	ldp	A_q, B_q, [src]
200bbbf1280Sopenharmony_ci	stp	C_q, D_q, [dstend, -64]
201bbbf1280Sopenharmony_ci	stp	E_q, F_q, [dstin, 32]
202bbbf1280Sopenharmony_ci	stp	A_q, B_q, [dstin]
203bbbf1280Sopenharmony_ci	ret
204bbbf1280Sopenharmony_ci
205bbbf1280Sopenharmony_ciEND (__memcpy_aarch64_simd)
206bbbf1280Sopenharmony_ci
207