162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-only */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright (C) 2013 ARM Ltd.
462306a36Sopenharmony_ci * Copyright (C) 2013 Linaro.
562306a36Sopenharmony_ci *
662306a36Sopenharmony_ci * This code is based on glibc cortex strings work originally authored by
762306a36Sopenharmony_ci * Linaro, which can be found at:
862306a36Sopenharmony_ci *
962306a36Sopenharmony_ci * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
1062306a36Sopenharmony_ci * files/head:/src/aarch64/
1162306a36Sopenharmony_ci */
1262306a36Sopenharmony_ci
1362306a36Sopenharmony_ci
1462306a36Sopenharmony_ci/*
1562306a36Sopenharmony_ci * Copy a buffer from src to dest (alignment handled by the hardware)
1662306a36Sopenharmony_ci *
1762306a36Sopenharmony_ci * Parameters:
1862306a36Sopenharmony_ci *	x0 - dest
1962306a36Sopenharmony_ci *	x1 - src
2062306a36Sopenharmony_ci *	x2 - n
2162306a36Sopenharmony_ci * Returns:
2262306a36Sopenharmony_ci *	x0 - dest
2362306a36Sopenharmony_ci */
/*
 * Register aliases for the copy code that follows.
 * dstin, src and count name the three argument registers (x0, x1, x2).
 * No instruction in this section names dstin as a destination; every store
 * is issued through the working pointer dst.
 */
2462306a36Sopenharmony_cidstin	.req	x0	/* destination as passed in */
2562306a36Sopenharmony_cisrc	.req	x1	/* source pointer */
2662306a36Sopenharmony_cicount	.req	x2	/* number of bytes still to copy */
2762306a36Sopenharmony_citmp1	.req	x3	/* scratch, 64-bit view */
2862306a36Sopenharmony_citmp1w	.req	w3	/* scratch, 32-bit view of the same register as tmp1 */
2962306a36Sopenharmony_citmp2	.req	x4	/* second scratch, 64-bit view */
3062306a36Sopenharmony_citmp2w	.req	w4	/* 32-bit view of the same register as tmp2 */
3162306a36Sopenharmony_cidst	.req	x6	/* working destination, initialised from dstin */
3262306a36Sopenharmony_ci
/* Four low/high register pairs; each pair is moved with a #16 step below. */
3362306a36Sopenharmony_ciA_l	.req	x7
3462306a36Sopenharmony_ciA_h	.req	x8
3562306a36Sopenharmony_ciB_l	.req	x9
3662306a36Sopenharmony_ciB_h	.req	x10
3762306a36Sopenharmony_ciC_l	.req	x11
3862306a36Sopenharmony_ciC_h	.req	x12
3962306a36Sopenharmony_ciD_l	.req	x13
4062306a36Sopenharmony_ciD_h	.req	x14
4162306a36Sopenharmony_ci
/*
 * Entry: set up the working destination and, for copies of 16 bytes or
 * more, copy the 0-15 leading bytes selected by the low four bits of -src.
 * NOTE(review): ldrb1/strb1/ldrh1/strh1/ldr1/str1 are not defined in this
 * section; presumably the including file supplies them as macros that
 * access memory and then advance the pointer by the immediate - confirm.
 */
4262306a36Sopenharmony_ci	mov	dst, dstin
4362306a36Sopenharmony_ci	cmp	count, #16
4462306a36Sopenharmony_ci	/* When the length is less than 16, the accesses are not aligned. */
4562306a36Sopenharmony_ci	b.lo	.Ltiny15
4662306a36Sopenharmony_ci
4762306a36Sopenharmony_ci	neg	tmp2, src
4862306a36Sopenharmony_ci	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
4962306a36Sopenharmony_ci	b.eq	.LSrcAligned	/* (-src & 15) == 0: src is already 16-byte aligned */
5062306a36Sopenharmony_ci	sub	count, count, tmp2	/* cannot underflow: count >= 16 > tmp2 */
5162306a36Sopenharmony_ci	/*
5262306a36Sopenharmony_ci	* Copy the leading bytes from src to dst in increasing address
5362306a36Sopenharmony_ci	* order. This eliminates the risk of overwriting source data
5462306a36Sopenharmony_ci	* that has not been read yet when the distance between src and
5562306a36Sopenharmony_ci	* dst is less than 16. The memory accesses here are aligned.
5662306a36Sopenharmony_ci	*/
5762306a36Sopenharmony_ci	tbz	tmp2, #0, 1f	/* bit 0 clear: no single byte to copy */
5862306a36Sopenharmony_ci	ldrb1	tmp1w, src, #1
5962306a36Sopenharmony_ci	strb1	tmp1w, dst, #1
6062306a36Sopenharmony_ci1:
6162306a36Sopenharmony_ci	tbz	tmp2, #1, 2f	/* bit 1 clear: no halfword to copy */
6262306a36Sopenharmony_ci	ldrh1	tmp1w, src, #2
6362306a36Sopenharmony_ci	strh1	tmp1w, dst, #2
6462306a36Sopenharmony_ci2:
6562306a36Sopenharmony_ci	tbz	tmp2, #2, 3f	/* bit 2 clear: no word to copy */
6662306a36Sopenharmony_ci	ldr1	tmp1w, src, #4
6762306a36Sopenharmony_ci	str1	tmp1w, dst, #4
6862306a36Sopenharmony_ci3:
6962306a36Sopenharmony_ci	tbz	tmp2, #3, .LSrcAligned	/* bit 3 clear: no doubleword to copy */
7062306a36Sopenharmony_ci	ldr1	tmp1, src, #8
7162306a36Sopenharmony_ci	str1	tmp1, dst, #8
7262306a36Sopenharmony_ci
/*
 * Reached once the head copy (if any) is done.  Dispatch on the remaining
 * count: 64 or more goes to .Lcpy_over64; otherwise bits 5:4 of count pick
 * zero to three 16-byte pair copies before falling into .Ltiny15.
 * NOTE(review): b.ge is a signed comparison, so a count with bit 63 set
 * would be treated as "less than 64" - presumably never passed by callers.
 */
7362306a36Sopenharmony_ci.LSrcAligned:
7462306a36Sopenharmony_ci	cmp	count, #64
7562306a36Sopenharmony_ci	b.ge	.Lcpy_over64
7662306a36Sopenharmony_ci	/*
7762306a36Sopenharmony_ci	* Deal with small copies quickly by dropping straight into the
7862306a36Sopenharmony_ci	* exit block.
7962306a36Sopenharmony_ci	*/
8062306a36Sopenharmony_ci.Ltail63:
8162306a36Sopenharmony_ci	/*
8262306a36Sopenharmony_ci	* Copy up to 48 bytes of data. At this point we only need the
8362306a36Sopenharmony_ci	* bottom 6 bits of count to be accurate.
8462306a36Sopenharmony_ci	*/
8562306a36Sopenharmony_ci	ands	tmp1, count, #0x30	/* tmp1 = 0x00, 0x10, 0x20 or 0x30 */
8662306a36Sopenharmony_ci	b.eq	.Ltiny15	/* no whole 16-byte chunk left */
8762306a36Sopenharmony_ci	cmp	tmp1w, #0x20
8862306a36Sopenharmony_ci	b.eq	1f	/* 0x20: two pair copies */
8962306a36Sopenharmony_ci	b.lt	2f	/* 0x10: one pair copy */
9062306a36Sopenharmony_ci	ldp1	A_l, A_h, src, #16	/* 0x30: three pair copies, falling through 1: and 2: */
9162306a36Sopenharmony_ci	stp1	A_l, A_h, dst, #16
9262306a36Sopenharmony_ci1:
9362306a36Sopenharmony_ci	ldp1	A_l, A_h, src, #16
9462306a36Sopenharmony_ci	stp1	A_l, A_h, dst, #16
9562306a36Sopenharmony_ci2:
9662306a36Sopenharmony_ci	ldp1	A_l, A_h, src, #16
9762306a36Sopenharmony_ci	stp1	A_l, A_h, dst, #16
/*
 * Copy the final 0-15 bytes: bits 3, 2, 1 and 0 of count select an 8-, 4-,
 * 2- and 1-byte copy respectively, tested in that order.  Only those four
 * bits are examined here, so the upper bits of count need not be meaningful.
 */
9862306a36Sopenharmony_ci.Ltiny15:
9962306a36Sopenharmony_ci	/*
10062306a36Sopenharmony_ci	* Prefer to break one ldp/stp into several loads/stores that access
10162306a36Sopenharmony_ci	* memory in increasing address order, rather than loading/storing 16
10262306a36Sopenharmony_ci	* bytes from (src-16) to (dst-16) and moving src back to an aligned
10362306a36Sopenharmony_ci	* address, which is what the original cortex memcpy does. If the
10462306a36Sopenharmony_ci	* original memcpy process were kept here, memmove would need to satisfy
10562306a36Sopenharmony_ci	* the precondition that the src address is at least 16 bytes bigger than
10662306a36Sopenharmony_ci	* the dst address, otherwise some source data would be overwritten when
10762306a36Sopenharmony_ci	* memmove calls memcpy directly. To make memmove simpler and decouple
10862306a36Sopenharmony_ci	* memcpy's dependency on memmove, the original process was dropped.
10962306a36Sopenharmony_ci	*/
11062306a36Sopenharmony_ci	tbz	count, #3, 1f	/* bit 3 clear: no doubleword to copy */
11162306a36Sopenharmony_ci	ldr1	tmp1, src, #8
11262306a36Sopenharmony_ci	str1	tmp1, dst, #8
11362306a36Sopenharmony_ci1:
11462306a36Sopenharmony_ci	tbz	count, #2, 2f	/* bit 2 clear: no word to copy */
11562306a36Sopenharmony_ci	ldr1	tmp1w, src, #4
11662306a36Sopenharmony_ci	str1	tmp1w, dst, #4
11762306a36Sopenharmony_ci2:
11862306a36Sopenharmony_ci	tbz	count, #1, 3f	/* bit 1 clear: no halfword to copy */
11962306a36Sopenharmony_ci	ldrh1	tmp1w, src, #2
12062306a36Sopenharmony_ci	strh1	tmp1w, dst, #2
12162306a36Sopenharmony_ci3:
12262306a36Sopenharmony_ci	tbz	count, #0, .Lexitfunc	/* bit 0 clear: nothing left, done */
12362306a36Sopenharmony_ci	ldrb1	tmp1w, src, #1
12462306a36Sopenharmony_ci	strb1	tmp1w, dst, #1
12562306a36Sopenharmony_ci
12662306a36Sopenharmony_ci	b	.Lexitfunc
12762306a36Sopenharmony_ci
/*
 * Entered with count >= 64.  count is biased by -128 up front: a
 * non-negative result selects the bulk loop, a negative one means 64-127
 * bytes remain.  Subtracting 128 leaves the low six bits of count unchanged,
 * so after the 64 bytes copied here they still give the 0-63 byte remainder.
 */
12862306a36Sopenharmony_ci.Lcpy_over64:
12962306a36Sopenharmony_ci	subs	count, count, #128
13062306a36Sopenharmony_ci	b.ge	.Lcpy_body_large
13162306a36Sopenharmony_ci	/*
13262306a36Sopenharmony_ci	* Less than 128 bytes to copy, so handle 64 here and then jump
13362306a36Sopenharmony_ci	* to the tail.
13462306a36Sopenharmony_ci	*/
13562306a36Sopenharmony_ci	ldp1	A_l, A_h, src, #16
13662306a36Sopenharmony_ci	stp1	A_l, A_h, dst, #16
13762306a36Sopenharmony_ci	ldp1	B_l, B_h, src, #16
13862306a36Sopenharmony_ci	ldp1	C_l, C_h, src, #16
13962306a36Sopenharmony_ci	stp1	B_l, B_h, dst, #16
14062306a36Sopenharmony_ci	stp1	C_l, C_h, dst, #16
14162306a36Sopenharmony_ci	ldp1	D_l, D_h, src, #16
14262306a36Sopenharmony_ci	stp1	D_l, D_h, dst, #16
14362306a36Sopenharmony_ci
14462306a36Sopenharmony_ci	tst	count, #0x3f	/* any of the low six bits set => 1-63 bytes remain */
14562306a36Sopenharmony_ci	b.ne	.Ltail63
14662306a36Sopenharmony_ci	b	.Lexitfunc
14762306a36Sopenharmony_ci
/*
 * Bulk copy for 128 bytes or more; entered with count already reduced by
 * 128 and non-negative.  64 bytes are loaded ahead of the loop, each
 * iteration stores the previously loaded 64 bytes while loading the next
 * 64, and the last loaded block is stored after the loop exits.
 */
14862306a36Sopenharmony_ci	/*
14962306a36Sopenharmony_ci	* Critical loop.  Start at a new cache line boundary.  Assuming
15062306a36Sopenharmony_ci	* 64 bytes per line this ensures the entire loop is in one line.
15162306a36Sopenharmony_ci	*/
15262306a36Sopenharmony_ci	.p2align	L1_CACHE_SHIFT
15362306a36Sopenharmony_ci.Lcpy_body_large:
15462306a36Sopenharmony_ci	/* Preload the first 64 bytes of data. */
15562306a36Sopenharmony_ci	ldp1	A_l, A_h, src, #16
15662306a36Sopenharmony_ci	ldp1	B_l, B_h, src, #16
15762306a36Sopenharmony_ci	ldp1	C_l, C_h, src, #16
15862306a36Sopenharmony_ci	ldp1	D_l, D_h, src, #16
15962306a36Sopenharmony_ci1:
16062306a36Sopenharmony_ci	/*
16162306a36Sopenharmony_ci	* Interleave the loads of the next 64-byte block with the stores of
16262306a36Sopenharmony_ci	* the previously loaded 64 bytes.
16362306a36Sopenharmony_ci	*/
16462306a36Sopenharmony_ci	stp1	A_l, A_h, dst, #16
16562306a36Sopenharmony_ci	ldp1	A_l, A_h, src, #16
16662306a36Sopenharmony_ci	stp1	B_l, B_h, dst, #16
16762306a36Sopenharmony_ci	ldp1	B_l, B_h, src, #16
16862306a36Sopenharmony_ci	stp1	C_l, C_h, dst, #16
16962306a36Sopenharmony_ci	ldp1	C_l, C_h, src, #16
17062306a36Sopenharmony_ci	stp1	D_l, D_h, dst, #16
17162306a36Sopenharmony_ci	ldp1	D_l, D_h, src, #16
17262306a36Sopenharmony_ci	subs	count, count, #64
17362306a36Sopenharmony_ci	b.ge	1b	/* still non-negative: at least one more 64-byte block to load */
17462306a36Sopenharmony_ci	stp1	A_l, A_h, dst, #16	/* drain the block loaded by the last iteration */
17562306a36Sopenharmony_ci	stp1	B_l, B_h, dst, #16
17662306a36Sopenharmony_ci	stp1	C_l, C_h, dst, #16
17762306a36Sopenharmony_ci	stp1	D_l, D_h, dst, #16
17862306a36Sopenharmony_ci
17962306a36Sopenharmony_ci	tst	count, #0x3f	/* subtracting 128/64 kept the low six bits: 1-63 bytes may remain */
18062306a36Sopenharmony_ci	b.ne	.Ltail63
18162306a36Sopenharmony_ci.Lexitfunc:	/* common exit; no code follows in this section - presumably the includer returns, confirm */
182