/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
/*
 * Register aliases used by the copy code in this file.
 *
 * dstin, src and count name the incoming argument registers (x0-x2).
 * The code in this file only reads dstin; all destination writes go
 * through dst, a working copy of the destination pointer.
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3	/* scratch, 64-bit view */
tmp1w	.req	w3	/* scratch, 32-bit view of tmp1 */
tmp2	.req	x4	/* scratch, 64-bit view */
tmp2w	.req	w4	/* scratch, 32-bit view of tmp2 */
dst	.req	x6

/* Four low/high register pairs (A-D); each pair carries 16 bytes. */
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

428c2ecf20Sopenharmony_ci	mov	dst, dstin
438c2ecf20Sopenharmony_ci	cmp	count, #16
448c2ecf20Sopenharmony_ci	/*When memory length is less than 16, the accessed are not aligned.*/
458c2ecf20Sopenharmony_ci	b.lo	.Ltiny15
468c2ecf20Sopenharmony_ci
478c2ecf20Sopenharmony_ci	neg	tmp2, src
488c2ecf20Sopenharmony_ci	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
498c2ecf20Sopenharmony_ci	b.eq	.LSrcAligned
508c2ecf20Sopenharmony_ci	sub	count, count, tmp2
518c2ecf20Sopenharmony_ci	/*
528c2ecf20Sopenharmony_ci	* Copy the leading memory data from src to dst in an increasing
538c2ecf20Sopenharmony_ci	* address order.By this way,the risk of overwriting the source
548c2ecf20Sopenharmony_ci	* memory data is eliminated when the distance between src and
558c2ecf20Sopenharmony_ci	* dst is less than 16. The memory accesses here are alignment.
568c2ecf20Sopenharmony_ci	*/
578c2ecf20Sopenharmony_ci	tbz	tmp2, #0, 1f
588c2ecf20Sopenharmony_ci	ldrb1	tmp1w, src, #1
598c2ecf20Sopenharmony_ci	strb1	tmp1w, dst, #1
608c2ecf20Sopenharmony_ci1:
618c2ecf20Sopenharmony_ci	tbz	tmp2, #1, 2f
628c2ecf20Sopenharmony_ci	ldrh1	tmp1w, src, #2
638c2ecf20Sopenharmony_ci	strh1	tmp1w, dst, #2
648c2ecf20Sopenharmony_ci2:
658c2ecf20Sopenharmony_ci	tbz	tmp2, #2, 3f
668c2ecf20Sopenharmony_ci	ldr1	tmp1w, src, #4
678c2ecf20Sopenharmony_ci	str1	tmp1w, dst, #4
688c2ecf20Sopenharmony_ci3:
698c2ecf20Sopenharmony_ci	tbz	tmp2, #3, .LSrcAligned
708c2ecf20Sopenharmony_ci	ldr1	tmp1, src, #8
718c2ecf20Sopenharmony_ci	str1	tmp1, dst, #8
728c2ecf20Sopenharmony_ci
.LSrcAligned:
	/* 64 bytes or more remaining: use the block-copy paths. */
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 *
	 * tmp1 = count & 0x30 is 0, 0x10, 0x20 or 0x30:
	 *   0    -> nothing to do here, go straight to .Ltiny15
	 *   0x30 -> fall through all three 16-byte copies
	 *   0x20 -> enter at 1: (two copies)
	 *   0x10 -> enter at 2: (one copy)
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	/* Falls through to .Ltiny15 for the remaining 0-15 bytes. */
.Ltiny15:
	/*
	 * Copy the last 0-15 bytes; only bits 3..0 of count are examined.
	 *
	 * Prefer to break one ldp/stp into several loads/stores that access
	 * memory in increasing address order, rather than loading/storing 16
	 * bytes from (src-16) to (dst-16) after stepping src back to an
	 * aligned address, which is what the original cortex memcpy does.
	 * If the original process were kept here, memmove would need to
	 * satisfy the precondition that the src address is at least 16 bytes
	 * higher than the dst address; otherwise some source data would be
	 * overwritten when memmove calls memcpy directly. To make memmove
	 * simpler and to decouple memcpy from memmove, the original process
	 * was dropped.
	 */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	/*
	 * Reached with count >= 64. Bias count by -128: a non-negative
	 * result means at least 128 bytes remain, so take the loop.
	 */
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail. count is negative from here on, but because 128 is
	 * a multiple of 64 its bottom 6 bits still equal those of the
	 * number of bytes left after this 64-byte block, which is all that
	 * .Ltail63 looks at.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	/* Any of the bottom 6 bits set: 1-63 bytes are still outstanding. */
	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

1488c2ecf20Sopenharmony_ci	/*
1498c2ecf20Sopenharmony_ci	* Critical loop.  Start at a new cache line boundary.  Assuming
1508c2ecf20Sopenharmony_ci	* 64 bytes per line this ensures the entire loop is in one line.
1518c2ecf20Sopenharmony_ci	*/
1528c2ecf20Sopenharmony_ci	.p2align	L1_CACHE_SHIFT
1538c2ecf20Sopenharmony_ci.Lcpy_body_large:
1548c2ecf20Sopenharmony_ci	/* pre-get 64 bytes data. */
1558c2ecf20Sopenharmony_ci	ldp1	A_l, A_h, src, #16
1568c2ecf20Sopenharmony_ci	ldp1	B_l, B_h, src, #16
1578c2ecf20Sopenharmony_ci	ldp1	C_l, C_h, src, #16
1588c2ecf20Sopenharmony_ci	ldp1	D_l, D_h, src, #16
1598c2ecf20Sopenharmony_ci1:
1608c2ecf20Sopenharmony_ci	/*
1618c2ecf20Sopenharmony_ci	* interlace the load of next 64 bytes data block with store of the last
1628c2ecf20Sopenharmony_ci	* loaded 64 bytes data.
1638c2ecf20Sopenharmony_ci	*/
1648c2ecf20Sopenharmony_ci	stp1	A_l, A_h, dst, #16
1658c2ecf20Sopenharmony_ci	ldp1	A_l, A_h, src, #16
1668c2ecf20Sopenharmony_ci	stp1	B_l, B_h, dst, #16
1678c2ecf20Sopenharmony_ci	ldp1	B_l, B_h, src, #16
1688c2ecf20Sopenharmony_ci	stp1	C_l, C_h, dst, #16
1698c2ecf20Sopenharmony_ci	ldp1	C_l, C_h, src, #16
1708c2ecf20Sopenharmony_ci	stp1	D_l, D_h, dst, #16
1718c2ecf20Sopenharmony_ci	ldp1	D_l, D_h, src, #16
1728c2ecf20Sopenharmony_ci	subs	count, count, #64
1738c2ecf20Sopenharmony_ci	b.ge	1b
1748c2ecf20Sopenharmony_ci	stp1	A_l, A_h, dst, #16
1758c2ecf20Sopenharmony_ci	stp1	B_l, B_h, dst, #16
1768c2ecf20Sopenharmony_ci	stp1	C_l, C_h, dst, #16
1778c2ecf20Sopenharmony_ci	stp1	D_l, D_h, dst, #16
1788c2ecf20Sopenharmony_ci
1798c2ecf20Sopenharmony_ci	tst	count, #0x3f
1808c2ecf20Sopenharmony_ci	b.ne	.Ltail63
1818c2ecf20Sopenharmony_ci.Lexitfunc:
182