/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

/* Paste a ".L" prefix onto a label name so it becomes an assembler-local
   symbol.  */
#define L(label) .L ## label

/* Arguments as passed in (x0-x2) and the working pointers derived from
   them.  dstin is only ever read, dst/srcend/dstend are computed by the
   copy routine.  */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5

/* Data registers.  Each letter names one 16-byte pair (_l / _h) moved with
   ldp/stp; the _lw names are the 32-bit views used for word and byte
   accesses.  */
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17

/* G and H are not extra registers: they reuse count, dst, src and srcend.
   Loading into them destroys those values, so they may only be used once
   the pointers and the count are no longer needed.  */
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend

/* Scratch register.  Shares x14 with E_l, so tmp1 and the E pair must never
   be live at the same time.  */
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

/*
 * __pi_memcpy - copy count bytes from src to dstin.  Overlapping buffers are
 * handled, so the same entry point is also used for memmove.
 *
 * In:
 *   dstin (x0) - destination
 *   src   (x1) - source
 *   count (x2) - number of bytes to copy (0 is allowed)
 *
 * Out:
 *   x0 is never written, so it still holds dstin on return.
 *
 * May write x1-x17 and the condition flags.  The code below makes no stack
 * accesses.
 *
 * Every path for up to 128 bytes issues all of its loads before its first
 * store; that ordering is what makes those paths correct for overlapping
 * buffers without an explicit overlap test.  Only the > 128 byte path has to
 * choose a copy direction.
 */
SYM_FUNC_START(__pi_memcpy)
	/* One-past-the-end pointers: most paths copy relative to both ends.  */
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	/* 16..32 bytes: copy the first and the last 16 bytes.  The two chunks
	   overlap whenever count < 32.  */
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  count < 16 here, so bit 3 set means count >= 8:
	   copy the first and the last 8 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  count < 8 here, so bit 2 set means count >= 4:
	   copy the first and the last 4 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  After the zero check
	   the bytes at offsets 0, count / 2 and count - 1 are copied; for
	   count = 1, 2 and 3 these offsets are {0,0,0}, {0,1,1} and {0,1,2},
	   so every byte is covered without testing count again.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  The first 32 and the last 32 bytes
	   are always needed, so load them before looking at count again.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	/* 33..64 bytes: first 32 + last 32 cover the whole range.  */
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  A-D already hold the first and last 32 bytes;
	   E and F add source bytes 32..63.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	/* 97..128 bytes: also copy the 32 bytes at srcend - 64.  G and H
	   alias count, dst, src and srcend, so these two loads destroy them;
	   the load into H must stay last because it overwrites the srcend
	   register it is addressed from.  Nothing below reads those values
	   again.  */
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
	/* Store the first 64 and the last 32 bytes.  */
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.
	   tmp1 = dstin - src.  Zero means source and destination are the same
	   buffer and there is nothing to do.  An unsigned tmp1 below count
	   means dstin lies inside [src, src + count), where a forward copy
	   would overwrite source bytes before they have been read.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.
	   dst is dstin rounded down to a 16-byte boundary and src is moved
	   back by the same amount (tmp1), so equal offsets from src and dst
	   keep addressing corresponding bytes.  The first 16 bytes are held
	   in D and stored unaligned to dstin before D is reloaded.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	/* A-D now hold the 64 bytes destined for dst + 16 .. dst + 79.  With
	   the 64-byte tail, 144 bytes from dst are covered; skip the loop if
	   that already reaches dstend.  */
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

	/* Software-pipelined loop: each iteration stores the 64 bytes loaded
	   previously and loads the next 64.  The last stp/ldp pair advances
	   dst and src by 64 through pre-index writeback.  */
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.
	   The final 64 source bytes are loaded relative to srcend and stored
	   relative to dstend, so they may overlap what the loop already
	   wrote.  E reuses x14; tmp1 is no longer needed here.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment.
	   Mirror image of the forward path: the last 16 bytes are stored
	   unaligned at dstend - 16, then srcend and dstend are both moved
	   down by tmp1 so that dstend becomes 16-byte aligned.  src and dstin
	   are not modified on this path.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	/* A-D hold the 64 bytes destined for dstend - 64 .. dstend - 1.  With
	   the 64-byte head copy, 128 bytes below dstend are covered; skip the
	   loop if that already reaches dstin.  */
	subs	count, count, 128
	b.ls	L(copy64_from_start)

	/* Same pipelining as the forward loop, walking down in memory.  */
L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.
	   The first 64 source bytes are loaded relative to src and stored
	   relative to dstin.  G reuses the count and dst registers, whose
	   values are not read again.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
SYM_FUNC_END(__pi_memcpy)

/*
 * Publish the __pi_memcpy implementation under both the memcpy and the
 * memmove families of names; this is possible because that routine handles
 * overlapping buffers itself.
 *
 * NOTE(review): SYM_FUNC_ALIAS, SYM_FUNC_ALIAS_WEAK and EXPORT_SYMBOL are
 * defined in headers that are not visible from this file.  Presumably the
 * _WEAK variants let another definition of memcpy/memmove take precedence at
 * link time - confirm against <linux/linkage.h>.
 */
SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)

SYM_FUNC_ALIAS(__memmove, __pi_memmove)
EXPORT_SYMBOL(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)