/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 *
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memset16 to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 *
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 *
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 */
#include <asm/export.h>		/* EXPORT_SYMBOL() used after each routine */
	.set noat		# this file names registers explicitly
	.set noreorder		# keep the hand-scheduled instruction order
.text
	/* Entry points defined in this file. */
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset
398c2ecf20Sopenharmony_ci
/*
 * ___memset - fill memory with a single byte value.
 * (memset and __memset are defined as aliases of this symbol at the end
 * of the file.)
 *
 * Registers as used by the code:
 *	$16 - destination address
 *	$17 - fill value; only the low byte is used, it is replicated
 *	      into all eight bytes of a quadword before storing
 *	$18 - byte count; tested as a signed value, so a count <= 0
 *	      stores nothing
 *	$26 - return address
 * On return:
 *	$0  - the destination address as passed in
 * Registers written: $0-$7 and $16-$18.  No stack is used.
 *
 * Structure:
 *	1. replicate the byte, compute the end address in $6
 *	2. if start and end lie in one quadword -> within_quad_b
 *	3. write the misaligned head (read-modify-write of one quad)
 *	4. aligned_b: whole quads, using the wh64 unrolled loop when
 *	   at least 16 quads remain, otherwise the simple loop
 *	5. no_quad_b: 0..7 trailing bytes (read-modify-write)
 */
	.ent ___memset
.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero length requested?

	addq $18,$16,$6		# E : max address to write to
	bis	$1,$2,$17	# E : 000000000000chch
	insbl	$1,2,$3		# U : 0000000000ch0000
	insbl	$1,3,$4		# U : 00000000ch000000

	or	$3,$4,$3	# E : 00000000chch0000
	inswl	$17,4,$5	# U : 0000chch00000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?
	inswl	$17,6,$2	# U : chch000000000000

	or	$17,$3,$17	# E : 00000000chchchch
	or	$2,$5,$2	# E : chchchch00000000
	bic	$1,7,$1		# E : fit within a single quadword?
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : chchchchchchchch
	beq	$1,within_quad_b # U :
	nop			# E :
	beq	$3,aligned_b	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the new bytes into the first (partial) quad, then advance
	 * $16 to the next quad boundary and shrink the count to match.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	End address (original destination + original count)
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes (16 quads)
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_b	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 is always negative here
	 * (-64 .. -8), so the beq below is never taken; a destination that
	 * is already 0mod64 makes eight passes through $alignmod64_b.  That
	 * loop is also the only place $4 is set up as the first wh64
	 * address, so do not make the branch reachable without also
	 * initialising $4.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_b	# U :

$alignmod64_b:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_b # U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $4 - address for the first wh64 (equal to $5 on entry)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through
	 * the unrolled loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address of trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_b:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : < 24 quads left: use fallback.  Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq	$3, no_quad_b	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 * $5 is the quad-aligned address of the final partial quad and the
	 * low three bits of $6 (the end address) select the split point:
	 * bytes below it come from $17, bytes from it upwards are preserved.
	 */
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_b:
	/*
	 * Start and end address lie in the same quadword: build the result
	 * from the original quad ($1) and the fill pattern in one
	 * read-modify-write.
	 */
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U : keep merged bytes below the end offset
	mskqh $1,$6,$2		# U : keep original bytes from the end offset up
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)
2348c2ecf20Sopenharmony_ci
/*
 * __constant_c_memset - fill memory with a caller-supplied quadword pattern.
 *
 * This is the original body of code, prior to replication and
 * rescheduling.  Leave it here, as there may be calls to this
 * entry point.
 *
 * Registers as used by the code:
 *	$16 - destination address
 *	$17 - 64-bit store pattern; it is stored as-is, this entry point
 *	      does not replicate anything, so the caller must pass the
 *	      complete quadword pattern
 *	$18 - byte count; tested as a signed value, so a count <= 0
 *	      stores nothing
 *	$26 - return address
 * On return:
 *	$0  - the destination address as passed in
 * Registers written: $0-$7, $16 and $18.  No stack is used.
 */
.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq $18,$16,$6		# E : max address to write to
	bis $16,$16,$0		# E : return value
	xor $16,$6,$1		# E : will complete write be within one quadword?
	ble $18,end		# U : zero length requested?

	bic $1,7,$1		# E : fit within a single quadword
	beq $1,within_one_quad	# U :
	and $16,7,$3		# E : Target addr misalignment
	beq $3,aligned		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the new bytes into the first (partial) quad, then advance
	 * $16 to the next quad boundary and shrink the count to match.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	End address (original destination + original count)
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes (16 quads)
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 is always negative here
	 * (-64 .. -8), so the beq below is never taken; a destination that
	 * is already 0mod64 makes eight passes through $alignmod64.  That
	 * loop is also the only place $4 is set up as the first wh64
	 * address, so do not make the branch reachable without also
	 * initialising $4.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign	# U :

$alignmod64:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64	# U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $4 - address for the first wh64 (equal to $5 on entry)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through
	 * the unrolled loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address of trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : < 24 quads left: use fallback.  Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64	# U :

	nop
	nop
	nop
	beq	$3, no_quad	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop		# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 * $5 is the quad-aligned address of the final partial quad and the
	 * low three bits of $6 (the end address) select the split point:
	 * bytes below it come from $17, bytes from it upwards are preserved.
	 */
	nop			# E :
	beq $18,end		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_one_quad:
	/*
	 * Start and end address lie in the same quadword: build the result
	 * from the original quad ($1) and the fill pattern in one
	 * read-modify-write.
	 */
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U : keep merged bytes below the end offset
	mskqh $1,$6,$2		# U : keep original bytes from the end offset up
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __constant_c_memset
	EXPORT_SYMBOL(__constant_c_memset)
4128c2ecf20Sopenharmony_ci
/*
 * __memset16 - fill memory with a 16-bit pattern.
 *
 * This is a replicant of the __constant_c_memset code, rescheduled
 * to mask stalls.  Note that entry point names also had to change.
 *
 * Registers as used by the code:
 *	$16 - destination address
 *	$17 - fill value; only the low 16 bits are used, they are
 *	      replicated into all four words of a quadword before storing
 *	$18 - count in bytes (it is added to $16 to form the end address);
 *	      tested as a signed value, so a count <= 0 stores nothing
 *	$26 - return address
 * On return:
 *	$0  - the destination address as passed in
 * Registers written: $0-$7 and $16-$18.  No stack is used.
 */
	.align 5
	.ent __memset16

__memset16:
	.frame $30,0,$26,0
	.prologue 0

	inswl $17,0,$5		# U : 000000000000c1c2
	inswl $17,2,$2		# U : 00000000c1c20000
	bis $16,$16,$0		# E : return value
	addq	$18,$16,$6	# E : max address to write to

	ble $18, end_w		# U : zero length requested?
	inswl	$17,4,$3	# U : 0000c1c200000000
	inswl	$17,6,$4	# U : c1c2000000000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?

	or	$2,$5,$2	# E : 00000000c1c2c1c2
	or	$3,$4,$17	# E : c1c2c1c200000000
	bic	$1,7,$1		# E : fit within a single quadword
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
	beq $1,within_quad_w	# U :
	nop
	beq $3,aligned_w	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the new bytes into the first (partial) quad, then advance
	 * $16 to the next quad boundary and shrink the count to match.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_w	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	End address (original destination + original count)
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes (16 quads)
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_w	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 is always negative here
	 * (-64 .. -8), so the beq below is never taken; a destination that
	 * is already 0mod64 makes eight passes through $alignmod64_w.  That
	 * loop is also the only place $4 is set up as the first wh64
	 * address, so do not make the branch reachable without also
	 * initialising $4.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_w	# U :

$alignmod64_w:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_w	# U :

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $4 - address for the first wh64 (equal to $5 on entry)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through
	 * the unrolled loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address of trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_w:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : < 24 quads left: use fallback.  Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_w	# U :

	nop
	nop
	nop
	beq	$3, no_quad_w	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_w		# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 * $5 is the quad-aligned address of the final partial quad and the
	 * low three bits of $6 (the end address) select the split point:
	 * bytes below it come from $17, bytes from it upwards are preserved.
	 */
	nop			# E :
	beq $18,end_w		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_w:
	/*
	 * Start and end address lie in the same quadword: build the result
	 * from the original quad ($1) and the fill pattern in one
	 * read-modify-write.
	 */
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U : keep merged bytes below the end offset
	mskqh $1,$6,$2		# U : keep original bytes from the end offset up
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_w:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :

	.end __memset16
	EXPORT_SYMBOL(__memset16)
6018c2ecf20Sopenharmony_ci
/*
 * memset and __memset are plain aliases for ___memset: all three
 * symbols resolve to the same code and all three are exported.
 */
memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)