18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * arch/alpha/lib/ev6-memcpy.S
48c2ecf20Sopenharmony_ci * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
58c2ecf20Sopenharmony_ci *
68c2ecf20Sopenharmony_ci * Reasonably optimized memcpy() routine for the Alpha 21264
78c2ecf20Sopenharmony_ci *
88c2ecf20Sopenharmony_ci *	- bulk data moved as aligned quadwords; head/tail bytes moved singly
98c2ecf20Sopenharmony_ci *	- 64-byte unrolled loop with wh64 write hints for co-aligned copies
108c2ecf20Sopenharmony_ci *
118c2ecf20Sopenharmony_ci * Much of the information about 21264 scheduling/coding comes from:
128c2ecf20Sopenharmony_ci *	Compiler Writer's Guide for the Alpha 21264
138c2ecf20Sopenharmony_ci *	abbreviated as 'CWG' in other comments here
148c2ecf20Sopenharmony_ci *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
158c2ecf20Sopenharmony_ci * Scheduling notation:
168c2ecf20Sopenharmony_ci *	E	- either cluster
178c2ecf20Sopenharmony_ci *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
188c2ecf20Sopenharmony_ci *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
198c2ecf20Sopenharmony_ci *
208c2ecf20Sopenharmony_ci * Temp usage notes:
218c2ecf20Sopenharmony_ci *	$1-$7		- scratch
228c2ecf20Sopenharmony_ci */
238c2ecf20Sopenharmony_ci#include <asm/export.h>
/*
 * Assembler setup for the hand-scheduled code below.
 * NOTE(review): noreorder/noat presumably keep the assembler from
 * rescheduling the instruction groups or using the $at temporary on its
 * own - confirm against the gas Alpha documentation.
 */
248c2ecf20Sopenharmony_ci	.set noreorder
258c2ecf20Sopenharmony_ci	.set noat
268c2ecf20Sopenharmony_ci
/*
 * memcpy - copy $18 bytes from the address in $17 to the address in $16.
 *
 * In:	$16 = dest, $17 = source, $18 = byte count, $26 = return address
 * Out:	$0  = the value $16 had on entry
 *
 * Register roles inside the routine:
 *	$16	- dest cursor on the co-aligned path; reused as a data
 *		  temporary in $mis_quad (dest lives in $4 there)
 *	$17	- source cursor, always advanced
 *	$18	- bytes remaining; every test on it is a signed branch, so a
 *		  count that is <= 0 as a signed quadword copies nothing
 *	$1-$6	- data and scratch
 *	$7	- address handed to wh64 in the unrolled loop
 *
 * Paths:
 *	1. dest and source agree mod 8: byte-copy up to 0mod8, quad-copy up
 *	   to 0mod64 (only when more than 127 bytes remain), run the 64-byte
 *	   unrolled loop, then finish with single quads and single bytes.
 *	2. dest and source differ mod 8 ($misaligned): byte-copy until dest
 *	   is 0mod8, then build each aligned dest quad from two ldq_u loads
 *	   merged with extql/extqh, then finish with single bytes.
 *
 * The code is laid out in groups of four instructions, padded with nop.
 * NOTE(review): presumably to match the 21264 fetch/issue grouping
 * described in the CWG - do not reorder without re-checking scheduling.
 */
278c2ecf20Sopenharmony_ci	.align	4
288c2ecf20Sopenharmony_ci	.globl memcpy
298c2ecf20Sopenharmony_ci	.ent memcpy
308c2ecf20Sopenharmony_cimemcpy:
318c2ecf20Sopenharmony_ci	.frame $30,0,$26,0
328c2ecf20Sopenharmony_ci	.prologue 0
338c2ecf20Sopenharmony_ci
/*
 * Entry: set the return value, leave at once for an empty copy, then
 * choose a path from the low three bits of dest XOR source.
 */
348c2ecf20Sopenharmony_ci	mov	$16, $0			# E : copy dest to return
358c2ecf20Sopenharmony_ci	ble	$18, $nomoredata	# U : done with the copy?
368c2ecf20Sopenharmony_ci	xor	$16, $17, $1		# E : are source and dest alignments the same?
378c2ecf20Sopenharmony_ci	and	$1, 7, $1		# E : are they the same mod 8?
388c2ecf20Sopenharmony_ci
398c2ecf20Sopenharmony_ci	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
408c2ecf20Sopenharmony_ci	/* source and dest are same mod 8 address */
418c2ecf20Sopenharmony_ci	and	$16, 7, $1		# E : Are both 0mod8?
428c2ecf20Sopenharmony_ci	beq	$1, $both_0mod8		# U : Yes
438c2ecf20Sopenharmony_ci	nop				# E :
448c2ecf20Sopenharmony_ci
458c2ecf20Sopenharmony_ci	/*
468c2ecf20Sopenharmony_ci	 * source and dest are same misalignment.  move a byte at a time
478c2ecf20Sopenharmony_ci	 * until a 0mod8 alignment for both is reached.
488c2ecf20Sopenharmony_ci	 * At least one byte more to move
498c2ecf20Sopenharmony_ci	 */
508c2ecf20Sopenharmony_ci
/*
 * One byte per trip.  The count test comes first: the loop leaves through
 * $nomoredata as soon as count <= 0, and only otherwise repeats while
 * dest is still not 0mod8.
 */
518c2ecf20Sopenharmony_ci$head_align:
528c2ecf20Sopenharmony_ci	ldbu	$1, 0($17)		# L : grab a byte
538c2ecf20Sopenharmony_ci	subq	$18, 1, $18		# E : count--
548c2ecf20Sopenharmony_ci	addq	$17, 1, $17		# E : src++
558c2ecf20Sopenharmony_ci	stb	$1, 0($16)		# L :
568c2ecf20Sopenharmony_ci	addq	$16, 1, $16		# E : dest++
578c2ecf20Sopenharmony_ci	and	$16, 7, $1		# E : Are we at 0mod8 yet?
588c2ecf20Sopenharmony_ci	ble	$18, $nomoredata	# U : done with the copy?
598c2ecf20Sopenharmony_ci	bne	$1, $head_align		# U :
608c2ecf20Sopenharmony_ci
/*
 * Both pointers are 0mod8 here.  With 127 bytes or fewer left the unrolled
 * loop is skipped entirely; otherwise dest is first brought to 0mod64.
 */
618c2ecf20Sopenharmony_ci$both_0mod8:
628c2ecf20Sopenharmony_ci	cmple	$18, 127, $1		# E : Can we unroll the loop?
638c2ecf20Sopenharmony_ci	bne	$1, $no_unroll		# U :
648c2ecf20Sopenharmony_ci	and	$16, 63, $1		# E : get mod64 alignment
658c2ecf20Sopenharmony_ci	beq	$1, $do_unroll		# U : no single quads to fiddle
668c2ecf20Sopenharmony_ci
/*
 * One quad per trip until dest is 0mod64.  There is no count test in this
 * loop: it is entered only with count > 127 and dest already 0mod8, so it
 * moves at most seven quads.
 */
678c2ecf20Sopenharmony_ci$single_head_quad:
688c2ecf20Sopenharmony_ci	ldq	$1, 0($17)		# L : get 8 bytes
698c2ecf20Sopenharmony_ci	subq	$18, 8, $18		# E : count -= 8
708c2ecf20Sopenharmony_ci	addq	$17, 8, $17		# E : src += 8
718c2ecf20Sopenharmony_ci	nop				# E :
728c2ecf20Sopenharmony_ci
738c2ecf20Sopenharmony_ci	stq	$1, 0($16)		# L : store
748c2ecf20Sopenharmony_ci	addq	$16, 8, $16		# E : dest += 8
758c2ecf20Sopenharmony_ci	and	$16, 63, $1		# E : get mod64 alignment
768c2ecf20Sopenharmony_ci	bne	$1, $single_head_quad	# U : still not fully aligned
778c2ecf20Sopenharmony_ci
/*
 * dest is 0mod64.  $7 starts one 64-byte block ahead of dest for the first
 * wh64.  The count is tested again because the head quads above may have
 * taken it below 128.
 */
788c2ecf20Sopenharmony_ci$do_unroll:
798c2ecf20Sopenharmony_ci	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
808c2ecf20Sopenharmony_ci	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
818c2ecf20Sopenharmony_ci	bne	$1, $tail_quads		# U : Nope
828c2ecf20Sopenharmony_ci	nop				# E :
838c2ecf20Sopenharmony_ci
/*
 * Unrolled loop: 64 bytes per trip, moved as two 32-byte halves through
 * $6/$4/$5/$3.
 *
 * wh64 bookkeeping within one trip:
 *	- wh64 is issued on $7, then $7 is advanced by 64;
 *	- $1 = dest + 64, taken before dest is advanced in this trip, i.e.
 *	  the block the next trip will store to;
 *	- $2 = count - 192, using the count before this trip's decrement;
 *	- if $2 < 0 the cmovlt replaces $7 with $1, so the next trip's wh64
 *	  names the block that trip writes itself rather than one block
 *	  further on.
 * The loop repeats while count > 63 after the decrement.
 */
848c2ecf20Sopenharmony_ci$unroll_body:
858c2ecf20Sopenharmony_ci	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
868c2ecf20Sopenharmony_ci					# ($7) are about to be over-written
878c2ecf20Sopenharmony_ci	ldq	$6, 0($17)		# L0 : bytes 0..7
888c2ecf20Sopenharmony_ci	nop				# E :
898c2ecf20Sopenharmony_ci	nop				# E :
908c2ecf20Sopenharmony_ci
918c2ecf20Sopenharmony_ci	ldq	$4, 8($17)		# L : bytes 8..15
928c2ecf20Sopenharmony_ci	ldq	$5, 16($17)		# L : bytes 16..23
938c2ecf20Sopenharmony_ci	addq	$7, 64, $7		# E : Update next wh64 address
948c2ecf20Sopenharmony_ci	nop				# E :
958c2ecf20Sopenharmony_ci
968c2ecf20Sopenharmony_ci	ldq	$3, 24($17)		# L : bytes 24..31
978c2ecf20Sopenharmony_ci	addq	$16, 64, $1		# E : fallback value for wh64
988c2ecf20Sopenharmony_ci	nop				# E :
998c2ecf20Sopenharmony_ci	nop				# E :
1008c2ecf20Sopenharmony_ci
1018c2ecf20Sopenharmony_ci	addq	$17, 32, $17		# E : src += 32 bytes
1028c2ecf20Sopenharmony_ci	stq	$6, 0($16)		# L : bytes 0..7
1038c2ecf20Sopenharmony_ci	nop				# E :
1048c2ecf20Sopenharmony_ci	nop				# E :
1058c2ecf20Sopenharmony_ci
1068c2ecf20Sopenharmony_ci	stq	$4, 8($16)		# L : bytes 8..15
1078c2ecf20Sopenharmony_ci	stq	$5, 16($16)		# L : bytes 16..23
1088c2ecf20Sopenharmony_ci	subq	$18, 192, $2		# E : At least two more trips to go?
1098c2ecf20Sopenharmony_ci	nop				# E :
1108c2ecf20Sopenharmony_ci
1118c2ecf20Sopenharmony_ci	stq	$3, 24($16)		# L : bytes 24..31
1128c2ecf20Sopenharmony_ci	addq	$16, 32, $16		# E : dest += 32 bytes
1138c2ecf20Sopenharmony_ci	nop				# E :
1148c2ecf20Sopenharmony_ci	nop				# E :
1158c2ecf20Sopenharmony_ci
	/* Second 32-byte half; $17 was already advanced past the first half. */
1168c2ecf20Sopenharmony_ci	ldq	$6, 0($17)		# L : bytes 0..7
1178c2ecf20Sopenharmony_ci	ldq	$4, 8($17)		# L : bytes 8..15
1188c2ecf20Sopenharmony_ci	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
1198c2ecf20Sopenharmony_ci					# fallback wh64 address if < 2 more trips
1208c2ecf20Sopenharmony_ci	nop				# E :
1218c2ecf20Sopenharmony_ci
1228c2ecf20Sopenharmony_ci	ldq	$5, 16($17)		# L : bytes 16..23
1238c2ecf20Sopenharmony_ci	ldq	$3, 24($17)		# L : bytes 24..31
1248c2ecf20Sopenharmony_ci	addq	$16, 32, $16		# E : dest += 32
1258c2ecf20Sopenharmony_ci	subq	$18, 64, $18		# E : count -= 64
1268c2ecf20Sopenharmony_ci
	/* dest was bumped before these stores, hence the negative offsets. */
1278c2ecf20Sopenharmony_ci	addq	$17, 32, $17		# E : src += 32
1288c2ecf20Sopenharmony_ci	stq	$6, -32($16)		# L : bytes 0..7
1298c2ecf20Sopenharmony_ci	stq	$4, -24($16)		# L : bytes 8..15
1308c2ecf20Sopenharmony_ci	cmple	$18, 63, $1		# E : At least one more trip?
1318c2ecf20Sopenharmony_ci
1328c2ecf20Sopenharmony_ci	stq	$5, -16($16)		# L : bytes 16..23
1338c2ecf20Sopenharmony_ci	stq	$3, -8($16)		# L : bytes 24..31
1348c2ecf20Sopenharmony_ci	nop				# E :
1358c2ecf20Sopenharmony_ci	beq	$1, $unroll_body	# U : count > 63 - take another trip
1368c2ecf20Sopenharmony_ci
/*
 * Fewer than 64 bytes left after the unrolled loop, or 127 or fewer when
 * it was skipped; both pointers are 0mod8.  Count is biased by -8 so that
 * the loop-closing branch is a plain sign test.
 */
1378c2ecf20Sopenharmony_ci$tail_quads:
1388c2ecf20Sopenharmony_ci$no_unroll:
1398c2ecf20Sopenharmony_ci	.align 4
1408c2ecf20Sopenharmony_ci	subq	$18, 8, $18		# E : At least a quad left?
1418c2ecf20Sopenharmony_ci	blt	$18, $less_than_8	# U : Nope
1428c2ecf20Sopenharmony_ci	nop				# E :
1438c2ecf20Sopenharmony_ci	nop				# E :
1448c2ecf20Sopenharmony_ci
/* One quad per trip while the biased count stays >= 0. */
1458c2ecf20Sopenharmony_ci$move_a_quad:
1468c2ecf20Sopenharmony_ci	ldq	$1, 0($17)		# L : fetch 8
1478c2ecf20Sopenharmony_ci	subq	$18, 8, $18		# E : count -= 8
1488c2ecf20Sopenharmony_ci	addq	$17, 8, $17		# E : src += 8
1498c2ecf20Sopenharmony_ci	nop				# E :
1508c2ecf20Sopenharmony_ci
1518c2ecf20Sopenharmony_ci	stq	$1, 0($16)		# L : store 8
1528c2ecf20Sopenharmony_ci	addq	$16, 8, $16		# E : dest += 8
1538c2ecf20Sopenharmony_ci	bge	$18, $move_a_quad	# U :
1548c2ecf20Sopenharmony_ci	nop				# E :
1558c2ecf20Sopenharmony_ci
/* Undo the -8 bias; what remains is the trailing byte count. */
1568c2ecf20Sopenharmony_ci$less_than_8:
1578c2ecf20Sopenharmony_ci	.align 4
1588c2ecf20Sopenharmony_ci	addq	$18, 8, $18		# E : add back for trailing bytes
1598c2ecf20Sopenharmony_ci	ble	$18, $nomoredata	# U : All-done
1608c2ecf20Sopenharmony_ci	nop				# E :
1618c2ecf20Sopenharmony_ci	nop				# E :
1628c2ecf20Sopenharmony_ci
1638c2ecf20Sopenharmony_ci	/* Trailing bytes */
1648c2ecf20Sopenharmony_ci$tail_bytes:
1658c2ecf20Sopenharmony_ci	subq	$18, 1, $18		# E : count--
1668c2ecf20Sopenharmony_ci	ldbu	$1, 0($17)		# L : fetch a byte
1678c2ecf20Sopenharmony_ci	addq	$17, 1, $17		# E : src++
1688c2ecf20Sopenharmony_ci	nop				# E :
1698c2ecf20Sopenharmony_ci
1708c2ecf20Sopenharmony_ci	stb	$1, 0($16)		# L : store a byte
1718c2ecf20Sopenharmony_ci	addq	$16, 1, $16		# E : dest++
1728c2ecf20Sopenharmony_ci	bgt	$18, $tail_bytes	# U : more to be done?
1738c2ecf20Sopenharmony_ci	nop				# E :
1748c2ecf20Sopenharmony_ci
1758c2ecf20Sopenharmony_ci	/* branching to exit takes 3 extra cycles, so replicate exit here */
1768c2ecf20Sopenharmony_ci	ret	$31, ($26), 1		# L0 :
1778c2ecf20Sopenharmony_ci	nop				# E :
1788c2ecf20Sopenharmony_ci	nop				# E :
1798c2ecf20Sopenharmony_ci	nop				# E :
1808c2ecf20Sopenharmony_ci
/*
 * dest and source differ mod 8.  The dest cursor moves to $4 (seeded from
 * $0, which still equals the entry dest) because $16 is reused as a data
 * register in $mis_quad.
 */
1818c2ecf20Sopenharmony_ci$misaligned:
1828c2ecf20Sopenharmony_ci	mov	$0, $4			# E : dest temp
1838c2ecf20Sopenharmony_ci	and	$0, 7, $1		# E : dest alignment mod8
1848c2ecf20Sopenharmony_ci	beq	$1, $dest_0mod8		# U : life doesnt totally suck
1858c2ecf20Sopenharmony_ci	nop
1868c2ecf20Sopenharmony_ci
/*
 * One byte per trip until dest ($4) is 0mod8.  The count is tested at the
 * top of every trip, so a short copy leaves through $nomoredata.
 */
1878c2ecf20Sopenharmony_ci$aligndest:
1888c2ecf20Sopenharmony_ci	ble	$18, $nomoredata	# U :
1898c2ecf20Sopenharmony_ci	ldbu	$1, 0($17)		# L : fetch a byte
1908c2ecf20Sopenharmony_ci	subq	$18, 1, $18		# E : count--
1918c2ecf20Sopenharmony_ci	addq	$17, 1, $17		# E : src++
1928c2ecf20Sopenharmony_ci
1938c2ecf20Sopenharmony_ci	stb	$1, 0($4)		# L : store it
1948c2ecf20Sopenharmony_ci	addq	$4, 1, $4		# E : dest++
1958c2ecf20Sopenharmony_ci	and	$4, 7, $1		# E : dest 0mod8 yet?
1968c2ecf20Sopenharmony_ci	bne	$1, $aligndest		# U : go until we are aligned.
1978c2ecf20Sopenharmony_ci
1988c2ecf20Sopenharmony_ci	/* Source has unknown alignment, but dest is known to be 0mod8 */
/*
 * Count is biased by -8 as in the aligned tail.  The seed ldq_u executes
 * only when the blt falls through, i.e. when at least one whole quad is
 * left to move.
 */
1998c2ecf20Sopenharmony_ci$dest_0mod8:
2008c2ecf20Sopenharmony_ci	subq	$18, 8, $18		# E : At least a quad left?
2018c2ecf20Sopenharmony_ci	blt	$18, $misalign_tail	# U : Nope
2028c2ecf20Sopenharmony_ci	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
2038c2ecf20Sopenharmony_ci	nop				# E :
2048c2ecf20Sopenharmony_ci
/*
 * One aligned dest quad per trip, assembled from two source quads:
 * $3 holds the quad loaded on the previous trip (or the seed), $16 the
 * next one.  extql/extqh take their shift from the low bits of $17 and the
 * two pieces are OR-ed into $1 for the store.  $16 is then kept in $3 so
 * each source quad is loaded once.
 */
2058c2ecf20Sopenharmony_ci$mis_quad:
2068c2ecf20Sopenharmony_ci	ldq_u	$16, 8($17)		# L : Fetch next 8
2078c2ecf20Sopenharmony_ci	extql	$3, $17, $3		# U : masking
2088c2ecf20Sopenharmony_ci	extqh	$16, $17, $1		# U : masking
2098c2ecf20Sopenharmony_ci	bis	$3, $1, $1		# E : merged bytes to store
2108c2ecf20Sopenharmony_ci
2118c2ecf20Sopenharmony_ci	subq	$18, 8, $18		# E : count -= 8
2128c2ecf20Sopenharmony_ci	addq	$17, 8, $17		# E : src += 8
2138c2ecf20Sopenharmony_ci	stq	$1, 0($4)		# L : store 8 (aligned)
2148c2ecf20Sopenharmony_ci	mov	$16, $3			# E : "rotate" source data
2158c2ecf20Sopenharmony_ci
2168c2ecf20Sopenharmony_ci	addq	$4, 8, $4		# E : dest += 8
2178c2ecf20Sopenharmony_ci	bge	$18, $mis_quad		# U : More quads to move
2188c2ecf20Sopenharmony_ci	nop
2198c2ecf20Sopenharmony_ci	nop
2208c2ecf20Sopenharmony_ci
/* Undo the -8 bias; 0..7 bytes remain for the byte loop below. */
2218c2ecf20Sopenharmony_ci$misalign_tail:
2228c2ecf20Sopenharmony_ci	addq	$18, 8, $18		# E : account for tail stuff
2238c2ecf20Sopenharmony_ci	ble	$18, $nomoredata	# U :
2248c2ecf20Sopenharmony_ci	nop
2258c2ecf20Sopenharmony_ci	nop
2268c2ecf20Sopenharmony_ci
/* One byte per trip; falls through into $nomoredata when count hits 0. */
2278c2ecf20Sopenharmony_ci$misalign_byte:
2288c2ecf20Sopenharmony_ci	ldbu	$1, 0($17)		# L : fetch 1
2298c2ecf20Sopenharmony_ci	subq	$18, 1, $18		# E : count--
2308c2ecf20Sopenharmony_ci	addq	$17, 1, $17		# E : src++
2318c2ecf20Sopenharmony_ci	nop				# E :
2328c2ecf20Sopenharmony_ci
2338c2ecf20Sopenharmony_ci	stb	$1, 0($4)		# L : store
2348c2ecf20Sopenharmony_ci	addq	$4, 1, $4		# E : dest++
2358c2ecf20Sopenharmony_ci	bgt	$18, $misalign_byte	# U : more to go?
2368c2ecf20Sopenharmony_ci	nop
2378c2ecf20Sopenharmony_ci
2388c2ecf20Sopenharmony_ci
/* Common exit: $0 still holds the entry dest set at the top of memcpy. */
2398c2ecf20Sopenharmony_ci$nomoredata:
2408c2ecf20Sopenharmony_ci	ret	$31, ($26), 1		# L0 :
2418c2ecf20Sopenharmony_ci	nop				# E :
2428c2ecf20Sopenharmony_ci	nop				# E :
2438c2ecf20Sopenharmony_ci	nop				# E :
2448c2ecf20Sopenharmony_ci
2458c2ecf20Sopenharmony_ci	.end memcpy
2468c2ecf20Sopenharmony_ci	EXPORT_SYMBOL(memcpy)
2478c2ecf20Sopenharmony_ci
2488c2ecf20Sopenharmony_ci/* For backwards module compatibility.  */
/*
 * __memcpy is set equal to the memcpy symbol and declared global, so both
 * names refer to the same routine.
 */
2498c2ecf20Sopenharmony_ci__memcpy = memcpy
2508c2ecf20Sopenharmony_ci.globl __memcpy
251