/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same;
 * however, the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points, __memset
 * and __memset16, to permit better scheduling and to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 */
#include <linux/export.h>
	.set noat		/* the assembler may not use $at behind our back */
	.set noreorder		/* keep the instruction order exactly as written */
.text
	/* Entry points defined in this file (memset/__memset are aliases). */
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset

/*
 * ___memset - fill a buffer with a single byte value.
 *
 * In:
 *	$16	destination address
 *	$17	fill value; only the low byte is used
 *	$18	length in bytes (tested as a signed value: <= 0 stores nothing)
 * Out:
 *	$0	the destination address as passed in
 * Modifies: $1-$7 and the argument registers $16-$18.
 * No stack frame is created; the routine returns through $26.
 *
 * Outline:
 *	1. Replicate the fill byte into all eight bytes of $17.
 *	2. If start and end address lie in the same quadword, merge the new
 *	   bytes into that quadword and finish (within_quad_b).
 *	3. If the start is not quadword aligned, merge the leading partial
 *	   quadword and advance to the next aligned address.
 *	4. Store whole quadwords: for 16 or more quads, first store single
 *	   quads up to a 64-byte boundary and then run the unrolled
 *	   64-bytes-per-trip loop with wh64 hints; the remainder (or a small
 *	   request) goes through the simple one-quad-per-trip loop.
 *	5. Merge the 0..7 trailing bytes into the final quadword.
 *
 * The instructions are hand scheduled in groups of four; the "E/U/L"
 * annotations follow the scheduling notation given in the file header.
 */
	.ent ___memset
.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero length requested?

	addq $18,$16,$6		# E : end address (one past the last byte)
	bis	$1,$2,$17	# E : 000000000000chch
	insbl	$1,2,$3		# U : 0000000000ch0000
	insbl	$1,3,$4		# U : 00000000ch000000

	or	$3,$4,$3	# E : 00000000chch0000
	inswl	$17,4,$5	# U : 0000chch00000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?
	inswl	$17,6,$2	# U : chch000000000000

	or	$17,$3,$17	# E : 00000000chchchch
	or	$2,$5,$2	# E : chchchch00000000
	bic	$1,7,$1		# E : fit within a single quadword?
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : chchchchchchchch
	beq	$1,within_quad_b # U :
	nop			# E :
	beq	$3,aligned_b	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the fill pattern into the upper part of the first quadword,
	 * keep the bytes below the start offset, and step $16/$18 forward
	 * to the next quadword boundary.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * It's worth the effort to unroll this and use wh64 if possible.
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	End address (original destination + length)
	 * $18	Number of trailer bytes
	 * $3	Number of quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_b	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * $16 is quad aligned here, so $1 = ($16 & 0x3f) - 0x40 lies in
	 * -64..-8 and the beq below is never taken; a destination that is
	 * already 0mod64 simply stores its first eight quads one at a time.
	 * The alignment loop is also what sets up $4 for the first wh64.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_b	# U :

$alignmod64_b:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_b # U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through the
	 * unrolled loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future.
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_b:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq	$3, no_quad_b	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes: keep the old bytes at and above the
	 * end offset, insert the pattern below it.
	 */
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_b:
	/*
	 * Start and end fall in the same quadword.  First build "old bytes
	 * below the start offset + pattern above it", then cut that off at
	 * the end offset and restore the original bytes from there upward.
	 */
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U : keep the result below the end offset
	mskqh $1,$6,$2		# U : keep the original at/above the end offset
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)

23562306a36Sopenharmony_ci	/*
23662306a36Sopenharmony_ci	 * This is the original body of code, prior to replication and
23762306a36Sopenharmony_ci	 * rescheduling.  Leave it here, as there may be calls to this
23862306a36Sopenharmony_ci	 * entry point.
23962306a36Sopenharmony_ci	 */
24062306a36Sopenharmony_ci.align 4
24162306a36Sopenharmony_ci	.ent __constant_c_memset
24262306a36Sopenharmony_ci__constant_c_memset:
24362306a36Sopenharmony_ci	.frame $30,0,$26,0
24462306a36Sopenharmony_ci	.prologue 0
24562306a36Sopenharmony_ci
24662306a36Sopenharmony_ci	addq $18,$16,$6		# E : max address to write to
24762306a36Sopenharmony_ci	bis $16,$16,$0		# E : return value
24862306a36Sopenharmony_ci	xor $16,$6,$1		# E : will complete write be within one quadword?
24962306a36Sopenharmony_ci	ble $18,end		# U : zero length requested?
25062306a36Sopenharmony_ci
25162306a36Sopenharmony_ci	bic $1,7,$1		# E : fit within a single quadword
25262306a36Sopenharmony_ci	beq $1,within_one_quad	# U :
25362306a36Sopenharmony_ci	and $16,7,$3		# E : Target addr misalignment
25462306a36Sopenharmony_ci	beq $3,aligned		# U : target is 0mod8
25562306a36Sopenharmony_ci
25662306a36Sopenharmony_ci	/*
25762306a36Sopenharmony_ci	 * Target address is misaligned, and won't fit within a quadword
25862306a36Sopenharmony_ci	 */
25962306a36Sopenharmony_ci	ldq_u $4,0($16)		# L : Fetch first partial
26062306a36Sopenharmony_ci	bis $16,$16,$5		# E : Save the address
26162306a36Sopenharmony_ci	insql $17,$16,$2	# U : Insert new bytes
26262306a36Sopenharmony_ci	subq $3,8,$3		# E : Invert (for addressing uses)
26362306a36Sopenharmony_ci
26462306a36Sopenharmony_ci	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
26562306a36Sopenharmony_ci	mskql $4,$16,$4		# U : clear relevant parts of the quad
26662306a36Sopenharmony_ci	subq $16,$3,$16		# E : $16 is new aligned destination
26762306a36Sopenharmony_ci	bis $2,$4,$1		# E : Final bytes
26862306a36Sopenharmony_ci
26962306a36Sopenharmony_ci	nop
27062306a36Sopenharmony_ci	stq_u $1,0($5)		# L : Store result
27162306a36Sopenharmony_ci	nop
27262306a36Sopenharmony_ci	nop
27362306a36Sopenharmony_ci
27462306a36Sopenharmony_ci.align 4
27562306a36Sopenharmony_cialigned:
27662306a36Sopenharmony_ci	/*
27762306a36Sopenharmony_ci	 * We are now guaranteed to be quad aligned, with at least
27862306a36Sopenharmony_ci	 * one partial quad to write.
27962306a36Sopenharmony_ci	 */
28062306a36Sopenharmony_ci
28162306a36Sopenharmony_ci	sra $18,3,$3		# U : Number of remaining quads to write
28262306a36Sopenharmony_ci	and $18,7,$18		# E : Number of trailing bytes to write
28362306a36Sopenharmony_ci	bis $16,$16,$5		# E : Save dest address
28462306a36Sopenharmony_ci	beq $3,no_quad		# U : tail stuff only
28562306a36Sopenharmony_ci
28662306a36Sopenharmony_ci	/*
28762306a36Sopenharmony_ci	 * it's worth the effort to unroll this and use wh64 if possible
28862306a36Sopenharmony_ci	 * Lifted a bunch of code from clear_user.S
28962306a36Sopenharmony_ci	 * At this point, entry values are:
29062306a36Sopenharmony_ci	 * $16	Current destination address
29162306a36Sopenharmony_ci	 * $5	A copy of $16
29262306a36Sopenharmony_ci	 * $6	The max quadword address to write to
29362306a36Sopenharmony_ci	 * $18	Number trailer bytes
29462306a36Sopenharmony_ci	 * $3	Number quads to write
29562306a36Sopenharmony_ci	 */
29662306a36Sopenharmony_ci
29762306a36Sopenharmony_ci	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
29862306a36Sopenharmony_ci	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
29962306a36Sopenharmony_ci	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
30062306a36Sopenharmony_ci	blt	$4, loop	# U :
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_ci	/*
30362306a36Sopenharmony_ci	 * We know we've got at least 16 quads, minimum of one trip
30462306a36Sopenharmony_ci	 * through unrolled loop.  Do a quad at a time to get us 0mod64
30562306a36Sopenharmony_ci	 * aligned.
30662306a36Sopenharmony_ci	 */
30762306a36Sopenharmony_ci
30862306a36Sopenharmony_ci	nop			# E :
30962306a36Sopenharmony_ci	nop			# E :
31062306a36Sopenharmony_ci	nop			# E :
31162306a36Sopenharmony_ci	beq	$1, $bigalign	# U :
31262306a36Sopenharmony_ci
31362306a36Sopenharmony_ci$alignmod64:
31462306a36Sopenharmony_ci	stq	$17, 0($5)	# L :
31562306a36Sopenharmony_ci	subq	$3, 1, $3	# E : For consistency later
31662306a36Sopenharmony_ci	addq	$1, 8, $1	# E : Increment towards zero for alignment
31762306a36Sopenharmony_ci	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
31862306a36Sopenharmony_ci
31962306a36Sopenharmony_ci	nop
32062306a36Sopenharmony_ci	nop
32162306a36Sopenharmony_ci	addq	$5, 8, $5	# E : Inc address
32262306a36Sopenharmony_ci	blt	$1, $alignmod64	# U :
32362306a36Sopenharmony_ci
32462306a36Sopenharmony_ci$bigalign:
32562306a36Sopenharmony_ci	/*
32662306a36Sopenharmony_ci	 * $3 - number quads left to go
32762306a36Sopenharmony_ci	 * $5 - target address (aligned 0mod64)
32862306a36Sopenharmony_ci	 * $17 - mask of stuff to store
32962306a36Sopenharmony_ci	 * Scratch registers available: $7, $2, $4, $1
33062306a36Sopenharmony_ci	 * we know that we'll be taking a minimum of one trip through
33162306a36Sopenharmony_ci 	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
33262306a36Sopenharmony_ci	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
33362306a36Sopenharmony_ci	 * The wh64 is issued on for the starting destination address for trip +2
33462306a36Sopenharmony_ci	 * through the loop, and if there are less than two trips left, the target
33562306a36Sopenharmony_ci	 * address will be for the current trip.
33662306a36Sopenharmony_ci	 */
33762306a36Sopenharmony_ci
33862306a36Sopenharmony_ci$do_wh64:
33962306a36Sopenharmony_ci	wh64	($4)		# L1 : memory subsystem write hint
34062306a36Sopenharmony_ci	subq	$3, 24, $2	# E : For determining future wh64 addresses
34162306a36Sopenharmony_ci	stq	$17, 0($5)	# L :
34262306a36Sopenharmony_ci	nop			# E :
34362306a36Sopenharmony_ci
34462306a36Sopenharmony_ci	addq	$5, 128, $4	# E : speculative target of next wh64
34562306a36Sopenharmony_ci	stq	$17, 8($5)	# L :
34662306a36Sopenharmony_ci	stq	$17, 16($5)	# L :
34762306a36Sopenharmony_ci	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
34862306a36Sopenharmony_ci
34962306a36Sopenharmony_ci	stq	$17, 24($5)	# L :
35062306a36Sopenharmony_ci	stq	$17, 32($5)	# L :
35162306a36Sopenharmony_ci	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
35262306a36Sopenharmony_ci	nop
35362306a36Sopenharmony_ci
35462306a36Sopenharmony_ci	stq	$17, 40($5)	# L :
35562306a36Sopenharmony_ci	stq	$17, 48($5)	# L :
35662306a36Sopenharmony_ci	subq	$3, 16, $2	# E : Repeat the loop at least once more?
35762306a36Sopenharmony_ci	nop
35862306a36Sopenharmony_ci
35962306a36Sopenharmony_ci	stq	$17, 56($5)	# L :
36062306a36Sopenharmony_ci	addq	$5, 64, $5	# E :
36162306a36Sopenharmony_ci	subq	$3, 8, $3	# E :
36262306a36Sopenharmony_ci	bge	$2, $do_wh64	# U :
36362306a36Sopenharmony_ci
36462306a36Sopenharmony_ci	nop
36562306a36Sopenharmony_ci	nop
36662306a36Sopenharmony_ci	nop
36762306a36Sopenharmony_ci	beq	$3, no_quad	# U : Might have finished already
36862306a36Sopenharmony_ci
36962306a36Sopenharmony_ci.align 4
37062306a36Sopenharmony_ci	/*
37162306a36Sopenharmony_ci	 * Simple loop for trailing quadwords, or for small amounts
37262306a36Sopenharmony_ci	 * of data (where we can't use an unrolled loop and wh64)
37362306a36Sopenharmony_ci	 */
37462306a36Sopenharmony_ciloop:
37562306a36Sopenharmony_ci	stq $17,0($5)		# L :
37662306a36Sopenharmony_ci	subq $3,1,$3		# E : Decrement number quads left
37762306a36Sopenharmony_ci	addq $5,8,$5		# E : Inc address
37862306a36Sopenharmony_ci	bne $3,loop		# U : more?
37962306a36Sopenharmony_ci
38062306a36Sopenharmony_cino_quad:
38162306a36Sopenharmony_ci	/*
38262306a36Sopenharmony_ci	 * Write 0..7 trailing bytes.
38362306a36Sopenharmony_ci	 */
38462306a36Sopenharmony_ci	nop			# E :
38562306a36Sopenharmony_ci	beq $18,end		# U : All done?
38662306a36Sopenharmony_ci	ldq $7,0($5)		# L :
38762306a36Sopenharmony_ci	mskqh $7,$6,$2		# U : Mask final quad
38862306a36Sopenharmony_ci
38962306a36Sopenharmony_ci	insqh $17,$6,$4		# U : New bits
39062306a36Sopenharmony_ci	bis $2,$4,$1		# E : Put it all together
39162306a36Sopenharmony_ci	stq $1,0($5)		# L : And back to memory
39262306a36Sopenharmony_ci	ret $31,($26),1		# L0 :
39362306a36Sopenharmony_ci
39462306a36Sopenharmony_ciwithin_one_quad:
39562306a36Sopenharmony_ci	ldq_u $1,0($16)		# L :
39662306a36Sopenharmony_ci	insql $17,$16,$2	# U : New bits
39762306a36Sopenharmony_ci	mskql $1,$16,$4		# U : Clear old
39862306a36Sopenharmony_ci	bis $2,$4,$2		# E : New result
39962306a36Sopenharmony_ci
40062306a36Sopenharmony_ci	mskql $2,$6,$4		# U :
40162306a36Sopenharmony_ci	mskqh $1,$6,$2		# U :
40262306a36Sopenharmony_ci	bis $2,$4,$1		# E :
40362306a36Sopenharmony_ci	stq_u $1,0($16)		# L :
40462306a36Sopenharmony_ci
40562306a36Sopenharmony_ciend:
40662306a36Sopenharmony_ci	nop
40762306a36Sopenharmony_ci	nop
40862306a36Sopenharmony_ci	nop
40962306a36Sopenharmony_ci	ret $31,($26),1		# L0 :
41062306a36Sopenharmony_ci	.end __constant_c_memset
41162306a36Sopenharmony_ci	EXPORT_SYMBOL(__constant_c_memset)
41262306a36Sopenharmony_ci
41362306a36Sopenharmony_ci	/*
41462306a36Sopenharmony_ci	 * This is a replicant of the __constant_c_memset code, rescheduled
41562306a36Sopenharmony_ci	 * to mask stalls.  Note that entry point names also had to change
41662306a36Sopenharmony_ci	 */
41762306a36Sopenharmony_ci	.align 5
41862306a36Sopenharmony_ci	.ent __memset16
41962306a36Sopenharmony_ci
42062306a36Sopenharmony_ci__memset16:
42162306a36Sopenharmony_ci	.frame $30,0,$26,0
42262306a36Sopenharmony_ci	.prologue 0
42362306a36Sopenharmony_ci
42462306a36Sopenharmony_ci	inswl $17,0,$5		# U : 000000000000c1c2
42562306a36Sopenharmony_ci	inswl $17,2,$2		# U : 00000000c1c20000
42662306a36Sopenharmony_ci	bis $16,$16,$0		# E : return value
42762306a36Sopenharmony_ci	addq	$18,$16,$6	# E : max address to write to
42862306a36Sopenharmony_ci
42962306a36Sopenharmony_ci	ble $18, end_w		# U : zero length requested?
43062306a36Sopenharmony_ci	inswl	$17,4,$3	# U : 0000c1c200000000
43162306a36Sopenharmony_ci	inswl	$17,6,$4	# U : c1c2000000000000
43262306a36Sopenharmony_ci	xor	$16,$6,$1	# E : will complete write be within one quadword?
43362306a36Sopenharmony_ci
43462306a36Sopenharmony_ci	or	$2,$5,$2	# E : 00000000c1c2c1c2
43562306a36Sopenharmony_ci	or	$3,$4,$17	# E : c1c2c1c200000000
43662306a36Sopenharmony_ci	bic	$1,7,$1		# E : fit within a single quadword
43762306a36Sopenharmony_ci	and	$16,7,$3	# E : Target addr misalignment
43862306a36Sopenharmony_ci
43962306a36Sopenharmony_ci	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
44062306a36Sopenharmony_ci	beq $1,within_quad_w	# U :
44162306a36Sopenharmony_ci	nop
44262306a36Sopenharmony_ci	beq $3,aligned_w	# U : target is 0mod8
44362306a36Sopenharmony_ci
44462306a36Sopenharmony_ci	/*
44562306a36Sopenharmony_ci	 * Target address is misaligned, and won't fit within a quadword
44662306a36Sopenharmony_ci	 */
44762306a36Sopenharmony_ci	ldq_u $4,0($16)		# L : Fetch first partial
44862306a36Sopenharmony_ci	bis $16,$16,$5		# E : Save the address
44962306a36Sopenharmony_ci	insql $17,$16,$2	# U : Insert new bytes
45062306a36Sopenharmony_ci	subq $3,8,$3		# E : Invert (for addressing uses)
45162306a36Sopenharmony_ci
45262306a36Sopenharmony_ci	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
45362306a36Sopenharmony_ci	mskql $4,$16,$4		# U : clear relevant parts of the quad
45462306a36Sopenharmony_ci	subq $16,$3,$16		# E : $16 is new aligned destination
45562306a36Sopenharmony_ci	bis $2,$4,$1		# E : Final bytes
45662306a36Sopenharmony_ci
45762306a36Sopenharmony_ci	nop
45862306a36Sopenharmony_ci	stq_u $1,0($5)		# L : Store result
45962306a36Sopenharmony_ci	nop
46062306a36Sopenharmony_ci	nop
46162306a36Sopenharmony_ci
46262306a36Sopenharmony_ci.align 4
46362306a36Sopenharmony_cialigned_w:
46462306a36Sopenharmony_ci	/*
46562306a36Sopenharmony_ci	 * We are now guaranteed to be quad aligned, with at least
46662306a36Sopenharmony_ci	 * one partial quad to write.
46762306a36Sopenharmony_ci	 */
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci	sra $18,3,$3		# U : Number of remaining quads to write
47062306a36Sopenharmony_ci	and $18,7,$18		# E : Number of trailing bytes to write
47162306a36Sopenharmony_ci	bis $16,$16,$5		# E : Save dest address
47262306a36Sopenharmony_ci	beq $3,no_quad_w	# U : tail stuff only
47362306a36Sopenharmony_ci
47462306a36Sopenharmony_ci	/*
47562306a36Sopenharmony_ci	 * it's worth the effort to unroll this and use wh64 if possible
47662306a36Sopenharmony_ci	 * Lifted a bunch of code from clear_user.S
47762306a36Sopenharmony_ci	 * At this point, entry values are:
47862306a36Sopenharmony_ci	 * $16	Current destination address
47962306a36Sopenharmony_ci	 * $5	A copy of $16
48062306a36Sopenharmony_ci	 * $6	The max quadword address to write to
48162306a36Sopenharmony_ci	 * $18	Number trailer bytes
48262306a36Sopenharmony_ci	 * $3	Number quads to write
48362306a36Sopenharmony_ci	 */
48462306a36Sopenharmony_ci
48562306a36Sopenharmony_ci	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
48662306a36Sopenharmony_ci	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
48762306a36Sopenharmony_ci	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
48862306a36Sopenharmony_ci	blt	$4, loop_w	# U :
48962306a36Sopenharmony_ci
49062306a36Sopenharmony_ci	/*
49162306a36Sopenharmony_ci	 * We know we've got at least 16 quads, minimum of one trip
49262306a36Sopenharmony_ci	 * through unrolled loop.  Do a quad at a time to get us 0mod64
49362306a36Sopenharmony_ci	 * aligned.
49462306a36Sopenharmony_ci	 */
49562306a36Sopenharmony_ci
49662306a36Sopenharmony_ci	nop			# E :
49762306a36Sopenharmony_ci	nop			# E :
49862306a36Sopenharmony_ci	nop			# E :
49962306a36Sopenharmony_ci	beq	$1, $bigalign_w	# U :
50062306a36Sopenharmony_ci
50162306a36Sopenharmony_ci$alignmod64_w:
50262306a36Sopenharmony_ci	stq	$17, 0($5)	# L :
50362306a36Sopenharmony_ci	subq	$3, 1, $3	# E : For consistency later
50462306a36Sopenharmony_ci	addq	$1, 8, $1	# E : Increment towards zero for alignment
50562306a36Sopenharmony_ci	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
50662306a36Sopenharmony_ci
50762306a36Sopenharmony_ci	nop
50862306a36Sopenharmony_ci	nop
50962306a36Sopenharmony_ci	addq	$5, 8, $5	# E : Inc address
51062306a36Sopenharmony_ci	blt	$1, $alignmod64_w	# U :
51162306a36Sopenharmony_ci
51262306a36Sopenharmony_ci$bigalign_w:
51362306a36Sopenharmony_ci	/*
51462306a36Sopenharmony_ci	 * $3 - number quads left to go
51562306a36Sopenharmony_ci	 * $5 - target address (aligned 0mod64)
51662306a36Sopenharmony_ci	 * $17 - mask of stuff to store
51762306a36Sopenharmony_ci	 * Scratch registers available: $7, $2, $4, $1
51862306a36Sopenharmony_ci	 * we know that we'll be taking a minimum of one trip through
51962306a36Sopenharmony_ci 	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
52062306a36Sopenharmony_ci	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
52162306a36Sopenharmony_ci	 * The wh64 is issued on for the starting destination address for trip +2
52262306a36Sopenharmony_ci	 * through the loop, and if there are less than two trips left, the target
52362306a36Sopenharmony_ci	 * address will be for the current trip.
52462306a36Sopenharmony_ci	 */
52562306a36Sopenharmony_ci
52662306a36Sopenharmony_ci$do_wh64_w:
52762306a36Sopenharmony_ci	wh64	($4)		# L1 : memory subsystem write hint
52862306a36Sopenharmony_ci	subq	$3, 24, $2	# E : For determining future wh64 addresses
52962306a36Sopenharmony_ci	stq	$17, 0($5)	# L :
53062306a36Sopenharmony_ci	nop			# E :
53162306a36Sopenharmony_ci
53262306a36Sopenharmony_ci	addq	$5, 128, $4	# E : speculative target of next wh64
53362306a36Sopenharmony_ci	stq	$17, 8($5)	# L :
53462306a36Sopenharmony_ci	stq	$17, 16($5)	# L :
53562306a36Sopenharmony_ci	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
53662306a36Sopenharmony_ci
53762306a36Sopenharmony_ci	stq	$17, 24($5)	# L :
53862306a36Sopenharmony_ci	stq	$17, 32($5)	# L :
53962306a36Sopenharmony_ci	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
54062306a36Sopenharmony_ci	nop
54162306a36Sopenharmony_ci
54262306a36Sopenharmony_ci	stq	$17, 40($5)	# L :
54362306a36Sopenharmony_ci	stq	$17, 48($5)	# L :
54462306a36Sopenharmony_ci	subq	$3, 16, $2	# E : Repeat the loop at least once more?
54562306a36Sopenharmony_ci	nop
54662306a36Sopenharmony_ci
54762306a36Sopenharmony_ci	stq	$17, 56($5)	# L :
54862306a36Sopenharmony_ci	addq	$5, 64, $5	# E :
54962306a36Sopenharmony_ci	subq	$3, 8, $3	# E :
55062306a36Sopenharmony_ci	bge	$2, $do_wh64_w	# U :
55162306a36Sopenharmony_ci
55262306a36Sopenharmony_ci	nop
55362306a36Sopenharmony_ci	nop
55462306a36Sopenharmony_ci	nop
55562306a36Sopenharmony_ci	beq	$3, no_quad_w	# U : Might have finished already
55662306a36Sopenharmony_ci
55762306a36Sopenharmony_ci.align 4
55862306a36Sopenharmony_ci	/*
55962306a36Sopenharmony_ci	 * Simple loop for trailing quadwords, or for small amounts
56062306a36Sopenharmony_ci	 * of data (where we can't use an unrolled loop and wh64)
56162306a36Sopenharmony_ci	 */
56262306a36Sopenharmony_ciloop_w:
56362306a36Sopenharmony_ci	stq $17,0($5)		# L :
56462306a36Sopenharmony_ci	subq $3,1,$3		# E : Decrement number quads left
56562306a36Sopenharmony_ci	addq $5,8,$5		# E : Inc address
56662306a36Sopenharmony_ci	bne $3,loop_w		# U : more?
56762306a36Sopenharmony_ci
56862306a36Sopenharmony_cino_quad_w:
56962306a36Sopenharmony_ci	/*
57062306a36Sopenharmony_ci	 * Write 0..7 trailing bytes.
57162306a36Sopenharmony_ci	 */
57262306a36Sopenharmony_ci	nop			# E :
57362306a36Sopenharmony_ci	beq $18,end_w		# U : All done?
57462306a36Sopenharmony_ci	ldq $7,0($5)		# L :
57562306a36Sopenharmony_ci	mskqh $7,$6,$2		# U : Mask final quad
57662306a36Sopenharmony_ci
57762306a36Sopenharmony_ci	insqh $17,$6,$4		# U : New bits
57862306a36Sopenharmony_ci	bis $2,$4,$1		# E : Put it all together
57962306a36Sopenharmony_ci	stq $1,0($5)		# L : And back to memory
58062306a36Sopenharmony_ci	ret $31,($26),1		# L0 :
58162306a36Sopenharmony_ci
58262306a36Sopenharmony_ciwithin_quad_w:
58362306a36Sopenharmony_ci	ldq_u $1,0($16)		# L :
58462306a36Sopenharmony_ci	insql $17,$16,$2	# U : New bits
58562306a36Sopenharmony_ci	mskql $1,$16,$4		# U : Clear old
58662306a36Sopenharmony_ci	bis $2,$4,$2		# E : New result
58762306a36Sopenharmony_ci
58862306a36Sopenharmony_ci	mskql $2,$6,$4		# U :
58962306a36Sopenharmony_ci	mskqh $1,$6,$2		# U :
59062306a36Sopenharmony_ci	bis $2,$4,$1		# E :
59162306a36Sopenharmony_ci	stq_u $1,0($16)		# L :
59262306a36Sopenharmony_ci
59362306a36Sopenharmony_ciend_w:
59462306a36Sopenharmony_ci	nop
59562306a36Sopenharmony_ci	nop
59662306a36Sopenharmony_ci	nop
59762306a36Sopenharmony_ci	ret $31,($26),1		# L0 :
59862306a36Sopenharmony_ci
59962306a36Sopenharmony_ci	.end __memset16
60062306a36Sopenharmony_ci	EXPORT_SYMBOL(__memset16)
60162306a36Sopenharmony_ci
/*
 * memset and __memset are assembler-level aliases of ___memset: all three
 * names resolve to the same code.
 */
memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)