18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-only */
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * Copyright (C) 2013 ARM Ltd.
48c2ecf20Sopenharmony_ci * Copyright (C) 2013 Linaro.
58c2ecf20Sopenharmony_ci *
68c2ecf20Sopenharmony_ci * This code is based on glibc cortex strings work originally authored by
78c2ecf20Sopenharmony_ci * Linaro. The original code can be found @
88c2ecf20Sopenharmony_ci *
98c2ecf20Sopenharmony_ci * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
108c2ecf20Sopenharmony_ci * files/head:/src/aarch64/
118c2ecf20Sopenharmony_ci */
128c2ecf20Sopenharmony_ci
138c2ecf20Sopenharmony_ci#include <linux/linkage.h>
148c2ecf20Sopenharmony_ci#include <asm/assembler.h>
158c2ecf20Sopenharmony_ci#include <asm/cache.h>
168c2ecf20Sopenharmony_ci
178c2ecf20Sopenharmony_ci/*
188c2ecf20Sopenharmony_ci * Move a buffer from src to dest (alignment handled by the hardware).
198c2ecf20Sopenharmony_ci * If dest < src or nothing overlaps, use memcpy, otherwise copy in reverse order.
208c2ecf20Sopenharmony_ci *
218c2ecf20Sopenharmony_ci * Parameters:
228c2ecf20Sopenharmony_ci *	x0 - dest
238c2ecf20Sopenharmony_ci *	x1 - src
248c2ecf20Sopenharmony_ci *	x2 - n
258c2ecf20Sopenharmony_ci * Returns:
268c2ecf20Sopenharmony_ci *	x0 - dest
278c2ecf20Sopenharmony_ci */
288c2ecf20Sopenharmony_cidstin	.req	x0	/* dest as passed in; only read, so x0 is returned unchanged */
298c2ecf20Sopenharmony_cisrc	.req	x1	/* source pointer; moved to the buffer end, then walked downwards */
308c2ecf20Sopenharmony_cicount	.req	x2	/* number of bytes still to copy */
318c2ecf20Sopenharmony_citmp1	.req	x3	/* scratch: src + n bound, copied data, chunk-select bits */
328c2ecf20Sopenharmony_citmp1w	.req	w3	/* 32-bit view of tmp1, used for 1/2/4-byte copies */
338c2ecf20Sopenharmony_citmp2	.req	x4	/* src & 15: bytes copied to bring src to 16-byte alignment */
348c2ecf20Sopenharmony_citmp2w	.req	w4	/* 32-bit view of tmp2; not referenced by the code below */
358c2ecf20Sopenharmony_citmp3	.req	x5	/* not referenced by the code below */
368c2ecf20Sopenharmony_citmp3w	.req	w5	/* not referenced by the code below */
378c2ecf20Sopenharmony_cidst	.req	x6	/* working destination pointer, walked downwards */
388c2ecf20Sopenharmony_ci
398c2ecf20Sopenharmony_ciA_l	.req	x7	/* A..D: four register pairs, 16 bytes each, */
408c2ecf20Sopenharmony_ciA_h	.req	x8	/* together holding one 64-byte block. */
418c2ecf20Sopenharmony_ciB_l	.req	x9	/* B pair: bytes [-32, -16) relative to the pointer */
428c2ecf20Sopenharmony_ciB_h	.req	x10
438c2ecf20Sopenharmony_ciC_l	.req	x11	/* C pair: bytes [-48, -32) relative to the pointer */
448c2ecf20Sopenharmony_ciC_h	.req	x12
458c2ecf20Sopenharmony_ciD_l	.req	x13	/* D pair: bytes [-64, -48); its access does the writeback */
468c2ecf20Sopenharmony_ciD_h	.req	x14
478c2ecf20Sopenharmony_ci
488c2ecf20Sopenharmony_ciSYM_FUNC_START_ALIAS(__memmove)	/* opened back to back with memmove: both names share this code */
498c2ecf20Sopenharmony_ciSYM_FUNC_START_WEAK_PI(memmove)	/* memmove(dest=x0, src=x1, n=x2): overlap-safe copy, x0 returned untouched */
508c2ecf20Sopenharmony_ci	cmp	dstin, src
518c2ecf20Sopenharmony_ci	b.lo	__memcpy		/* dest below src (unsigned): branch, without link, to __memcpy */
528c2ecf20Sopenharmony_ci	add	tmp1, src, count	/* tmp1 = src + n, one past the source end */
538c2ecf20Sopenharmony_ci	cmp	dstin, tmp1
548c2ecf20Sopenharmony_ci	b.hs	__memcpy		/* dest >= src + n: no overlap. */
558c2ecf20Sopenharmony_ci
568c2ecf20Sopenharmony_ci	add	dst, dstin, count	/* dst = one past the destination end */
578c2ecf20Sopenharmony_ci	add	src, src, count		/* src = one past the source end; the copy runs downwards */
588c2ecf20Sopenharmony_ci	cmp	count, #16
598c2ecf20Sopenharmony_ci	b.lo	.Ltail15  /* n < 16: skip the src alignment step; accesses may be unaligned. */
608c2ecf20Sopenharmony_ci
618c2ecf20Sopenharmony_ci	ands	tmp2, src, #15     /* tmp2 = bytes by which src exceeds a 16-byte boundary. */
628c2ecf20Sopenharmony_ci	b.eq	.LSrcAligned
638c2ecf20Sopenharmony_ci	sub	count, count, tmp2	/* n >= 16 > tmp2 here, so this cannot underflow */
648c2ecf20Sopenharmony_ci	/*
658c2ecf20Sopenharmony_ci	* Copy the tmp2 (1..15) topmost bytes first so that src becomes 16-byte
668c2ecf20Sopenharmony_ci	* aligned: each set bit of tmp2 selects one 1/2/4/8-byte copy. The cost
678c2ecf20Sopenharmony_ci	* is acceptable, and the following src accesses start from an aligned address.
688c2ecf20Sopenharmony_ci	*/
698c2ecf20Sopenharmony_ci	tbz	tmp2, #0, 1f		/* bit 0 clear: no single byte to copy */
708c2ecf20Sopenharmony_ci	ldrb	tmp1w, [src, #-1]!	/* pre-index: src -= 1, then load */
718c2ecf20Sopenharmony_ci	strb	tmp1w, [dst, #-1]!	/* pre-index: dst -= 1, then store */
728c2ecf20Sopenharmony_ci1:
738c2ecf20Sopenharmony_ci	tbz	tmp2, #1, 2f		/* bit 1 clear: no halfword to copy */
748c2ecf20Sopenharmony_ci	ldrh	tmp1w, [src, #-2]!
758c2ecf20Sopenharmony_ci	strh	tmp1w, [dst, #-2]!
768c2ecf20Sopenharmony_ci2:
778c2ecf20Sopenharmony_ci	tbz	tmp2, #2, 3f		/* bit 2 clear: no word to copy */
788c2ecf20Sopenharmony_ci	ldr	tmp1w, [src, #-4]!
798c2ecf20Sopenharmony_ci	str	tmp1w, [dst, #-4]!
808c2ecf20Sopenharmony_ci3:
818c2ecf20Sopenharmony_ci	tbz	tmp2, #3, .LSrcAligned	/* bit 3 clear: no doubleword to copy */
828c2ecf20Sopenharmony_ci	ldr	tmp1, [src, #-8]!
838c2ecf20Sopenharmony_ci	str	tmp1, [dst, #-8]!
848c2ecf20Sopenharmony_ci
858c2ecf20Sopenharmony_ci.LSrcAligned:
868c2ecf20Sopenharmony_ci	cmp	count, #64
878c2ecf20Sopenharmony_ci	b.ge	.Lcpy_over64		/* signed compare: at least 64 bytes left */
888c2ecf20Sopenharmony_ci
898c2ecf20Sopenharmony_ci	/*
908c2ecf20Sopenharmony_ci	* Fewer than 64 bytes left: deal with small copies quickly by
918c2ecf20Sopenharmony_ci	* dropping straight into the exit block.
928c2ecf20Sopenharmony_ci	*/
938c2ecf20Sopenharmony_ci.Ltail63:
948c2ecf20Sopenharmony_ci	/*
958c2ecf20Sopenharmony_ci	* Copy 0, 16, 32 or 48 bytes as selected by bits 5:4 of count. From
968c2ecf20Sopenharmony_ci	* here on only the bottom 6 bits of count are examined.
978c2ecf20Sopenharmony_ci	*/
988c2ecf20Sopenharmony_ci	ands	tmp1, count, #0x30	/* tmp1 = 16-byte chunk selector (0x00..0x30) */
998c2ecf20Sopenharmony_ci	b.eq	.Ltail15		/* no whole 16-byte chunk left */
1008c2ecf20Sopenharmony_ci	cmp	tmp1w, #0x20
1018c2ecf20Sopenharmony_ci	b.eq	1f			/* 0x20: two chunks */
1028c2ecf20Sopenharmony_ci	b.lt	2f			/* 0x10: one chunk */
1038c2ecf20Sopenharmony_ci	ldp	A_l, A_h, [src, #-16]!	/* 0x30: three chunks, fall through all copies */
1048c2ecf20Sopenharmony_ci	stp	A_l, A_h, [dst, #-16]!
1058c2ecf20Sopenharmony_ci1:
1068c2ecf20Sopenharmony_ci	ldp	A_l, A_h, [src, #-16]!
1078c2ecf20Sopenharmony_ci	stp	A_l, A_h, [dst, #-16]!
1088c2ecf20Sopenharmony_ci2:
1098c2ecf20Sopenharmony_ci	ldp	A_l, A_h, [src, #-16]!
1108c2ecf20Sopenharmony_ci	stp	A_l, A_h, [dst, #-16]!
1118c2ecf20Sopenharmony_ci
1128c2ecf20Sopenharmony_ci.Ltail15:
1138c2ecf20Sopenharmony_ci	tbz	count, #3, 1f		/* bits 3..0 of count select 8/4/2/1-byte copies */
1148c2ecf20Sopenharmony_ci	ldr	tmp1, [src, #-8]!
1158c2ecf20Sopenharmony_ci	str	tmp1, [dst, #-8]!
1168c2ecf20Sopenharmony_ci1:
1178c2ecf20Sopenharmony_ci	tbz	count, #2, 2f
1188c2ecf20Sopenharmony_ci	ldr	tmp1w, [src, #-4]!
1198c2ecf20Sopenharmony_ci	str	tmp1w, [dst, #-4]!
1208c2ecf20Sopenharmony_ci2:
1218c2ecf20Sopenharmony_ci	tbz	count, #1, 3f
1228c2ecf20Sopenharmony_ci	ldrh	tmp1w, [src, #-2]!
1238c2ecf20Sopenharmony_ci	strh	tmp1w, [dst, #-2]!
1248c2ecf20Sopenharmony_ci3:
1258c2ecf20Sopenharmony_ci	tbz	count, #0, .Lexitfunc
1268c2ecf20Sopenharmony_ci	ldrb	tmp1w, [src, #-1]	/* final access: no writeback, pointers are not used again */
1278c2ecf20Sopenharmony_ci	strb	tmp1w, [dst, #-1]
1288c2ecf20Sopenharmony_ci
1298c2ecf20Sopenharmony_ci.Lexitfunc:
1308c2ecf20Sopenharmony_ci	ret				/* x0 still holds dest */
1318c2ecf20Sopenharmony_ci
1328c2ecf20Sopenharmony_ci.Lcpy_over64:
1338c2ecf20Sopenharmony_ci	subs	count, count, #128	/* bias count by -128; its low 6 bits are unchanged */
1348c2ecf20Sopenharmony_ci	b.ge	.Lcpy_body_large	/* at least 128 bytes left: use the pipelined loop */
1358c2ecf20Sopenharmony_ci	/*
1368c2ecf20Sopenharmony_ci	* Less than 128 bytes to copy, so handle 64 bytes here and then jump
1378c2ecf20Sopenharmony_ci	* to the tail for whatever the low 6 bits of count still describe.
1388c2ecf20Sopenharmony_ci	*/
1398c2ecf20Sopenharmony_ci	ldp	A_l, A_h, [src, #-16]
1408c2ecf20Sopenharmony_ci	stp	A_l, A_h, [dst, #-16]
1418c2ecf20Sopenharmony_ci	ldp	B_l, B_h, [src, #-32]
1428c2ecf20Sopenharmony_ci	ldp	C_l, C_h, [src, #-48]
1438c2ecf20Sopenharmony_ci	stp	B_l, B_h, [dst, #-32]
1448c2ecf20Sopenharmony_ci	stp	C_l, C_h, [dst, #-48]
1458c2ecf20Sopenharmony_ci	ldp	D_l, D_h, [src, #-64]!	/* writeback: src -= 64 */
1468c2ecf20Sopenharmony_ci	stp	D_l, D_h, [dst, #-64]!	/* writeback: dst -= 64 */
1478c2ecf20Sopenharmony_ci
1488c2ecf20Sopenharmony_ci	tst	count, #0x3f		/* any remainder below 64 bytes? */
1498c2ecf20Sopenharmony_ci	b.ne	.Ltail63
1508c2ecf20Sopenharmony_ci	ret
1518c2ecf20Sopenharmony_ci
1528c2ecf20Sopenharmony_ci	/*
1538c2ecf20Sopenharmony_ci	* Critical loop. Start at a new cache line boundary. Assuming
1548c2ecf20Sopenharmony_ci	* 64 bytes per line this ensures the entire loop is in one line.
1558c2ecf20Sopenharmony_ci	*/
1568c2ecf20Sopenharmony_ci	.p2align	L1_CACHE_SHIFT
1578c2ecf20Sopenharmony_ci.Lcpy_body_large:
1588c2ecf20Sopenharmony_ci	/* Pre-load the first 64-byte block; only src is moved down here. */
1598c2ecf20Sopenharmony_ci	ldp	A_l, A_h, [src, #-16]
1608c2ecf20Sopenharmony_ci	ldp	B_l, B_h, [src, #-32]
1618c2ecf20Sopenharmony_ci	ldp	C_l, C_h, [src, #-48]
1628c2ecf20Sopenharmony_ci	ldp	D_l, D_h, [src, #-64]!
1638c2ecf20Sopenharmony_ci1:
1648c2ecf20Sopenharmony_ci	/*
1658c2ecf20Sopenharmony_ci	* Interleave the stores of the previously loaded 64-byte block with
1668c2ecf20Sopenharmony_ci	* the loads of the next 64-byte block.
1678c2ecf20Sopenharmony_ci	*/
1688c2ecf20Sopenharmony_ci	stp	A_l, A_h, [dst, #-16]
1698c2ecf20Sopenharmony_ci	ldp	A_l, A_h, [src, #-16]
1708c2ecf20Sopenharmony_ci	stp	B_l, B_h, [dst, #-32]
1718c2ecf20Sopenharmony_ci	ldp	B_l, B_h, [src, #-32]
1728c2ecf20Sopenharmony_ci	stp	C_l, C_h, [dst, #-48]
1738c2ecf20Sopenharmony_ci	ldp	C_l, C_h, [src, #-48]
1748c2ecf20Sopenharmony_ci	stp	D_l, D_h, [dst, #-64]!	/* writeback: dst -= 64 */
1758c2ecf20Sopenharmony_ci	ldp	D_l, D_h, [src, #-64]!	/* writeback: src -= 64 */
1768c2ecf20Sopenharmony_ci	subs	count, count, #64
1778c2ecf20Sopenharmony_ci	b.ge	1b			/* loop while the biased count stays >= 0 */
1788c2ecf20Sopenharmony_ci	stp	A_l, A_h, [dst, #-16]	/* drain: store the last block that was loaded */
1798c2ecf20Sopenharmony_ci	stp	B_l, B_h, [dst, #-32]
1808c2ecf20Sopenharmony_ci	stp	C_l, C_h, [dst, #-48]
1818c2ecf20Sopenharmony_ci	stp	D_l, D_h, [dst, #-64]!
1828c2ecf20Sopenharmony_ci
1838c2ecf20Sopenharmony_ci	tst	count, #0x3f		/* any remainder below 64 bytes? */
1848c2ecf20Sopenharmony_ci	b.ne	.Ltail63
1858c2ecf20Sopenharmony_ci	ret
1868c2ecf20Sopenharmony_ciSYM_FUNC_END_PI(memmove)
1878c2ecf20Sopenharmony_ciEXPORT_SYMBOL(memmove)
1888c2ecf20Sopenharmony_ciSYM_FUNC_END_ALIAS(__memmove)
1898c2ecf20Sopenharmony_ciEXPORT_SYMBOL(__memmove)
190