162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * arch/alpha/lib/ev6-memcpy.S
462306a36Sopenharmony_ci * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
562306a36Sopenharmony_ci *
662306a36Sopenharmony_ci * Reasonably optimized memcpy() routine for the Alpha 21264
762306a36Sopenharmony_ci *
862306a36Sopenharmony_ci *	- bulk data moved as aligned quadwords (ldq/stq; ldq_u + extql/extqh
962306a36Sopenharmony_ci *	  when src/dest differ mod 8); head and tail bytes use ldbu/stb
1062306a36Sopenharmony_ci *
1162306a36Sopenharmony_ci * Much of the information about 21264 scheduling/coding comes from:
1262306a36Sopenharmony_ci *	Compiler Writer's Guide for the Alpha 21264
1362306a36Sopenharmony_ci *	abbreviated as 'CWG' in other comments here
1462306a36Sopenharmony_ci *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
1562306a36Sopenharmony_ci * Scheduling notation:
1662306a36Sopenharmony_ci *	E	- either cluster
1762306a36Sopenharmony_ci *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
1862306a36Sopenharmony_ci *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
1962306a36Sopenharmony_ci *
2062306a36Sopenharmony_ci * Temp usage notes:
2162306a36Sopenharmony_ci *	$1-$7		- scratch ($16 too, on the misaligned path)
2262306a36Sopenharmony_ci */
2362306a36Sopenharmony_ci#include <linux/export.h>
2462306a36Sopenharmony_ci	.set noreorder
2562306a36Sopenharmony_ci	.set noat
2662306a36Sopenharmony_ci
/*
 * memcpy - copy $18 bytes from the buffer at $17 to the buffer at $16.
 *
 * Register roles:
 *	$16	- dest cursor on the aligned paths; reused as a load
 *		  temporary once the $misaligned path has switched to $4
 *	$17	- src cursor
 *	$18	- bytes remaining, tested as a signed value: a count <= 0
 *		  on entry returns without copying anything
 *	$0	- return value, the dest pointer as it was on entry
 *	$1-$7	- temporaries
 *	$26	- return address used by both ret instructions
 *
 * Paths:
 *	src and dest equal mod 8:
 *		$head_align (bytes up to an 8-byte boundary), then for
 *		counts of 128 or more $single_head_quad (quads up to a
 *		64-byte dest boundary) and $unroll_body (64 bytes per
 *		trip, with wh64 hints), then $move_a_quad and $tail_bytes.
 *	src and dest differ mod 8 ($misaligned):
 *		$aligndest (bytes until dest is 0mod8), $mis_quad (aligned
 *		quad stores assembled from two ldq_u loads), then
 *		$misalign_byte.
 *
 * Most instructions are written in groups of four, padded with nops, and
 * tagged with the E/U/L scheduling notation from the top of this file.
 * Reordering them or dropping a nop changes the hand-made schedule.
 */
2762306a36Sopenharmony_ci	.align	4
2862306a36Sopenharmony_ci	.globl memcpy
2962306a36Sopenharmony_ci	.ent memcpy
3062306a36Sopenharmony_cimemcpy:
3162306a36Sopenharmony_ci	.frame $30,0,$26,0
3262306a36Sopenharmony_ci	.prologue 0
3362306a36Sopenharmony_ci
3462306a36Sopenharmony_ci	mov	$16, $0			# E : return value = dest as passed in
3562306a36Sopenharmony_ci	ble	$18, $nomoredata	# U : count <= 0 (signed): nothing to copy
3662306a36Sopenharmony_ci	xor	$16, $17, $1		# E : are source and dest alignments the same?
3762306a36Sopenharmony_ci	and	$1, 7, $1		# E : are they the same mod 8?
3862306a36Sopenharmony_ci
3962306a36Sopenharmony_ci	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
4062306a36Sopenharmony_ci	/* source and dest are same mod 8 address */
4162306a36Sopenharmony_ci	and	$16, 7, $1		# E : Are both 0mod8?
4262306a36Sopenharmony_ci	beq	$1, $both_0mod8		# U : Yes
4362306a36Sopenharmony_ci	nop				# E :
4462306a36Sopenharmony_ci
4562306a36Sopenharmony_ci	/*
4662306a36Sopenharmony_ci	 * source and dest are same misalignment.  move a byte at a time
4762306a36Sopenharmony_ci	 * until a 0mod8 alignment for both is reached.
4862306a36Sopenharmony_ci	 * At least one byte more to move (count > 0 was checked at entry).
4962306a36Sopenharmony_ci	 */
5062306a36Sopenharmony_ci
5162306a36Sopenharmony_ci$head_align:
5262306a36Sopenharmony_ci	ldbu	$1, 0($17)		# L : grab a byte
5362306a36Sopenharmony_ci	subq	$18, 1, $18		# E : count--
5462306a36Sopenharmony_ci	addq	$17, 1, $17		# E : src++
5562306a36Sopenharmony_ci	stb	$1, 0($16)		# L : store the byte ($1 is recycled below)
5662306a36Sopenharmony_ci	addq	$16, 1, $16		# E : dest++
5762306a36Sopenharmony_ci	and	$16, 7, $1		# E : Are we at 0mod8 yet?
5862306a36Sopenharmony_ci	ble	$18, $nomoredata	# U : done with the copy?
5962306a36Sopenharmony_ci	bne	$1, $head_align		# U : not aligned yet, and count > 0
6062306a36Sopenharmony_ci
/* From here on src and dest are both 0mod8 and count > 0. */
6162306a36Sopenharmony_ci$both_0mod8:
6262306a36Sopenharmony_ci	cmple	$18, 127, $1		# E : Can we unroll the loop?
6362306a36Sopenharmony_ci	bne	$1, $no_unroll		# U : count < 128: plain quad loop
6462306a36Sopenharmony_ci	and	$16, 63, $1		# E : get mod64 alignment
6562306a36Sopenharmony_ci	beq	$1, $do_unroll		# U : no single quads to fiddle
6662306a36Sopenharmony_ci
/* Copy single quads until dest is 0mod64 (count was >= 128 on entry). */
6762306a36Sopenharmony_ci$single_head_quad:
6862306a36Sopenharmony_ci	ldq	$1, 0($17)		# L : get 8 bytes
6962306a36Sopenharmony_ci	subq	$18, 8, $18		# E : count -= 8
7062306a36Sopenharmony_ci	addq	$17, 8, $17		# E : src += 8
7162306a36Sopenharmony_ci	nop				# E :
7262306a36Sopenharmony_ci
7362306a36Sopenharmony_ci	stq	$1, 0($16)		# L : store
7462306a36Sopenharmony_ci	addq	$16, 8, $16		# E : dest += 8
7562306a36Sopenharmony_ci	and	$16, 63, $1		# E : get mod64 alignment
7662306a36Sopenharmony_ci	bne	$1, $single_head_quad	# U : still not fully aligned
7762306a36Sopenharmony_ci
/*
 * dest is 0mod64 here.  The count is tested again because the head quads
 * above may have taken it below 128.  $7 carries the wh64 address and
 * starts one 64-byte block ahead of dest.
 */
7862306a36Sopenharmony_ci$do_unroll:
7962306a36Sopenharmony_ci	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
8062306a36Sopenharmony_ci	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
8162306a36Sopenharmony_ci	bne	$1, $tail_quads		# U : Nope
8262306a36Sopenharmony_ci	nop				# E :
8362306a36Sopenharmony_ci
/*
 * One trip moves 64 bytes as two 32-byte halves through $6,$4,$5,$3.
 * The loop repeats while count >= 64 after the decrement.
 */
8462306a36Sopenharmony_ci$unroll_body:
8562306a36Sopenharmony_ci	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
8662306a36Sopenharmony_ci					# ($7) are about to be over-written
8762306a36Sopenharmony_ci	ldq	$6, 0($17)		# L0 : bytes 0..7
8862306a36Sopenharmony_ci	nop				# E :
8962306a36Sopenharmony_ci	nop				# E :
9062306a36Sopenharmony_ci
9162306a36Sopenharmony_ci	ldq	$4, 8($17)		# L : bytes 8..15
9262306a36Sopenharmony_ci	ldq	$5, 16($17)		# L : bytes 16..23
9362306a36Sopenharmony_ci	addq	$7, 64, $7		# E : Update next wh64 address
9462306a36Sopenharmony_ci	nop				# E :
9562306a36Sopenharmony_ci
9662306a36Sopenharmony_ci	ldq	$3, 24($17)		# L : bytes 24..31
9762306a36Sopenharmony_ci	addq	$16, 64, $1		# E : fallback value for wh64
					# (= dest at the start of the next trip)
9862306a36Sopenharmony_ci	nop				# E :
9962306a36Sopenharmony_ci	nop				# E :
10062306a36Sopenharmony_ci
10162306a36Sopenharmony_ci	addq	$17, 32, $17		# E : src += 32 bytes
10262306a36Sopenharmony_ci	stq	$6, 0($16)		# L : bytes 0..7
10362306a36Sopenharmony_ci	nop				# E :
10462306a36Sopenharmony_ci	nop				# E :
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_ci	stq	$4, 8($16)		# L : bytes 8..15
10762306a36Sopenharmony_ci	stq	$5, 16($16)		# L : bytes 16..23
10862306a36Sopenharmony_ci	subq	$18, 192, $2		# E : At least two more trips to go?
					# ($18 still holds the count from the
					# start of this trip)
10962306a36Sopenharmony_ci	nop				# E :
11062306a36Sopenharmony_ci
11162306a36Sopenharmony_ci	stq	$3, 24($16)		# L : bytes 24..31
11262306a36Sopenharmony_ci	addq	$16, 32, $16		# E : dest += 32 bytes
11362306a36Sopenharmony_ci	nop				# E :
11462306a36Sopenharmony_ci	nop				# E :
11562306a36Sopenharmony_ci
11662306a36Sopenharmony_ci	ldq	$6, 0($17)		# L : bytes 0..7
11762306a36Sopenharmony_ci	ldq	$4, 8($17)		# L : bytes 8..15
11862306a36Sopenharmony_ci	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
11962306a36Sopenharmony_ci					# fallback wh64 address if < 2 more trips
12062306a36Sopenharmony_ci	nop				# E :
12162306a36Sopenharmony_ci
12262306a36Sopenharmony_ci	ldq	$5, 16($17)		# L : bytes 16..23
12362306a36Sopenharmony_ci	ldq	$3, 24($17)		# L : bytes 24..31
12462306a36Sopenharmony_ci	addq	$16, 32, $16		# E : dest += 32 (stores below use negative offsets)
12562306a36Sopenharmony_ci	subq	$18, 64, $18		# E : count -= 64
12662306a36Sopenharmony_ci
12762306a36Sopenharmony_ci	addq	$17, 32, $17		# E : src += 32
12862306a36Sopenharmony_ci	stq	$6, -32($16)		# L : bytes 0..7
12962306a36Sopenharmony_ci	stq	$4, -24($16)		# L : bytes 8..15
13062306a36Sopenharmony_ci	cmple	$18, 63, $1		# E : At least one more trip?
13162306a36Sopenharmony_ci
13262306a36Sopenharmony_ci	stq	$5, -16($16)		# L : bytes 16..23
13362306a36Sopenharmony_ci	stq	$3, -8($16)		# L : bytes 24..31
13462306a36Sopenharmony_ci	nop				# E :
13562306a36Sopenharmony_ci	beq	$1, $unroll_body	# U : count >= 64: go around again
13662306a36Sopenharmony_ci
/*
 * Quad-at-a-time loop.  The count is kept biased by -8 while it runs so
 * that its sign is the loop test; $less_than_8 adds the 8 back.
 */
13762306a36Sopenharmony_ci$tail_quads:
13862306a36Sopenharmony_ci$no_unroll:
13962306a36Sopenharmony_ci	.align 4
14062306a36Sopenharmony_ci	subq	$18, 8, $18		# E : At least a quad left?
14162306a36Sopenharmony_ci	blt	$18, $less_than_8	# U : Nope
14262306a36Sopenharmony_ci	nop				# E :
14362306a36Sopenharmony_ci	nop				# E :
14462306a36Sopenharmony_ci
14562306a36Sopenharmony_ci$move_a_quad:
14662306a36Sopenharmony_ci	ldq	$1, 0($17)		# L : fetch 8
14762306a36Sopenharmony_ci	subq	$18, 8, $18		# E : count -= 8
14862306a36Sopenharmony_ci	addq	$17, 8, $17		# E : src += 8
14962306a36Sopenharmony_ci	nop				# E :
15062306a36Sopenharmony_ci
15162306a36Sopenharmony_ci	stq	$1, 0($16)		# L : store 8
15262306a36Sopenharmony_ci	addq	$16, 8, $16		# E : dest += 8
15362306a36Sopenharmony_ci	bge	$18, $move_a_quad	# U : another whole quad remains
15462306a36Sopenharmony_ci	nop				# E :
15562306a36Sopenharmony_ci
15662306a36Sopenharmony_ci$less_than_8:
15762306a36Sopenharmony_ci	.align 4
15862306a36Sopenharmony_ci	addq	$18, 8, $18		# E : add back for trailing bytes
15962306a36Sopenharmony_ci	ble	$18, $nomoredata	# U : All-done
16062306a36Sopenharmony_ci	nop				# E :
16162306a36Sopenharmony_ci	nop				# E :
16262306a36Sopenharmony_ci
16362306a36Sopenharmony_ci	/* Trailing bytes: count is 1..7 on entry to this loop */
16462306a36Sopenharmony_ci$tail_bytes:
16562306a36Sopenharmony_ci	subq	$18, 1, $18		# E : count--
16662306a36Sopenharmony_ci	ldbu	$1, 0($17)		# L : fetch a byte
16762306a36Sopenharmony_ci	addq	$17, 1, $17		# E : src++
16862306a36Sopenharmony_ci	nop				# E :
16962306a36Sopenharmony_ci
17062306a36Sopenharmony_ci	stb	$1, 0($16)		# L : store a byte
17162306a36Sopenharmony_ci	addq	$16, 1, $16		# E : dest++
17262306a36Sopenharmony_ci	bgt	$18, $tail_bytes	# U : more to be done?
17362306a36Sopenharmony_ci	nop				# E :
17462306a36Sopenharmony_ci
17562306a36Sopenharmony_ci	/* branching to exit takes 3 extra cycles, so replicate exit here */
17662306a36Sopenharmony_ci	ret	$31, ($26), 1		# L0 : return; $0 still holds the original dest
17762306a36Sopenharmony_ci	nop				# E :
17862306a36Sopenharmony_ci	nop				# E :
17962306a36Sopenharmony_ci	nop				# E :
18062306a36Sopenharmony_ci
/*
 * src and dest differ mod 8.  dest is walked through $4 from here on,
 * which leaves $16 free for use as a load temporary in $mis_quad.
 */
18162306a36Sopenharmony_ci$misaligned:
18262306a36Sopenharmony_ci	mov	$0, $4			# E : dest temp ($0 == dest, set at entry)
18362306a36Sopenharmony_ci	and	$0, 7, $1		# E : dest alignment mod8
18462306a36Sopenharmony_ci	beq	$1, $dest_0mod8		# U : life doesnt totally suck
18562306a36Sopenharmony_ci	nop
18662306a36Sopenharmony_ci
18762306a36Sopenharmony_ci$aligndest:
18862306a36Sopenharmony_ci	ble	$18, $nomoredata	# U : count ran out while aligning
18962306a36Sopenharmony_ci	ldbu	$1, 0($17)		# L : fetch a byte
19062306a36Sopenharmony_ci	subq	$18, 1, $18		# E : count--
19162306a36Sopenharmony_ci	addq	$17, 1, $17		# E : src++
19262306a36Sopenharmony_ci
19362306a36Sopenharmony_ci	stb	$1, 0($4)		# L : store it
19462306a36Sopenharmony_ci	addq	$4, 1, $4		# E : dest++
19562306a36Sopenharmony_ci	and	$4, 7, $1		# E : dest 0mod8 yet?
19662306a36Sopenharmony_ci	bne	$1, $aligndest		# U : go until we are aligned.
19762306a36Sopenharmony_ci
19862306a36Sopenharmony_ci	/* Source has unknown alignment, but dest is known to be 0mod8 */
	/*
	 * (src cannot be 0mod8 here: it differs from dest mod 8.)
	 * As in the aligned quad loop, the count is biased by -8 while
	 * $mis_quad runs and restored at $misalign_tail.
	 */
19962306a36Sopenharmony_ci$dest_0mod8:
20062306a36Sopenharmony_ci	subq	$18, 8, $18		# E : At least a quad left?
20162306a36Sopenharmony_ci	blt	$18, $misalign_tail	# U : Nope
20262306a36Sopenharmony_ci	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
20362306a36Sopenharmony_ci	nop				# E :
20462306a36Sopenharmony_ci
/*
 * Each trip stores one aligned quad built from two neighbouring source
 * quads: $3 (loaded on the previous trip, or the seed) and $16 (loaded
 * now).  extql/extqh take their byte shift from the low bits of $17.
 */
20562306a36Sopenharmony_ci$mis_quad:
20662306a36Sopenharmony_ci	ldq_u	$16, 8($17)		# L : Fetch next 8 ($16 is a temp here)
20762306a36Sopenharmony_ci	extql	$3, $17, $3		# U : masking - low part from the earlier quad
20862306a36Sopenharmony_ci	extqh	$16, $17, $1		# U : masking - high part from the new quad
20962306a36Sopenharmony_ci	bis	$3, $1, $1		# E : merged bytes to store
21062306a36Sopenharmony_ci
21162306a36Sopenharmony_ci	subq	$18, 8, $18		# E : count -= 8
21262306a36Sopenharmony_ci	addq	$17, 8, $17		# E : src += 8
21362306a36Sopenharmony_ci	stq	$1, 0($4)		# L : store 8 (aligned)
21462306a36Sopenharmony_ci	mov	$16, $3			# E : "rotate" source data
21562306a36Sopenharmony_ci
21662306a36Sopenharmony_ci	addq	$4, 8, $4		# E : dest += 8
21762306a36Sopenharmony_ci	bge	$18, $mis_quad		# U : More quads to move
21862306a36Sopenharmony_ci	nop
21962306a36Sopenharmony_ci	nop
22062306a36Sopenharmony_ci
22162306a36Sopenharmony_ci$misalign_tail:
22262306a36Sopenharmony_ci	addq	$18, 8, $18		# E : account for tail stuff
22362306a36Sopenharmony_ci	ble	$18, $nomoredata	# U : nothing left after the quads
22462306a36Sopenharmony_ci	nop
22562306a36Sopenharmony_ci	nop
22662306a36Sopenharmony_ci
22762306a36Sopenharmony_ci$misalign_byte:
22862306a36Sopenharmony_ci	ldbu	$1, 0($17)		# L : fetch 1
22962306a36Sopenharmony_ci	subq	$18, 1, $18		# E : count--
23062306a36Sopenharmony_ci	addq	$17, 1, $17		# E : src++
23162306a36Sopenharmony_ci	nop				# E :
23262306a36Sopenharmony_ci
23362306a36Sopenharmony_ci	stb	$1, 0($4)		# L : store
23462306a36Sopenharmony_ci	addq	$4, 1, $4		# E : dest++
23562306a36Sopenharmony_ci	bgt	$18, $misalign_byte	# U : more to go?
23662306a36Sopenharmony_ci	nop
23762306a36Sopenharmony_ci
23862306a36Sopenharmony_ci
/* Shared exit for every path except the one that falls out of $tail_bytes. */
23962306a36Sopenharmony_ci$nomoredata:
24062306a36Sopenharmony_ci	ret	$31, ($26), 1		# L0 : return; $0 still holds the original dest
24162306a36Sopenharmony_ci	nop				# E :
24262306a36Sopenharmony_ci	nop				# E :
24362306a36Sopenharmony_ci	nop				# E :
24462306a36Sopenharmony_ci
24562306a36Sopenharmony_ci	.end memcpy
24662306a36Sopenharmony_ci	EXPORT_SYMBOL(memcpy)
24762306a36Sopenharmony_ci
24862306a36Sopenharmony_ci/* For backwards module compatibility: __memcpy is a global alias of memcpy.  */
24962306a36Sopenharmony_ci__memcpy = memcpy
25062306a36Sopenharmony_ci.globl __memcpy
251