/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-clear_user.S
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Zero user space, handling exceptions as we go.
 *
 * We have to make sure that $0 is always up-to-date and contains the
 * right "bytes left to zero" value (and that it is updated only _after_
 * a successful store).  There is also some rather minor exception setup
 * stuff.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 * Determining actual stalls (other than slotting) doesn't appear to be easy to do.
 * From perusing the source code context where this routine is called, it is
 * a fair assumption that significant fractions of entire pages are zeroed, so
 * it's going to be worth the effort to hand-unroll a big loop, and use wh64.
 * ASSUMPTION:
 *	The believed purpose of only updating $0 after a store is that a signal
 *	may come along during the execution of this chunk of code, and we don't
 *	want to leave a hole (and we also want to avoid repeating lots of work)
 */

#include <linux/export.h>

/*
 * EX(insn) - emit one instruction that is allowed to fault.
 *
 * The instruction is tagged with the local label 99 and a record for it
 * is appended to the __ex_table section:
 *
 *	.long 99b - .			offset from the record to the insn
 *	lda $31, $exception-99b($31)	an lda encoding whose displacement is
 *					the distance from the insn to the
 *					$exception label
 *
 * .previous then switches back to the section that was active before.
 * The record is presumably consumed by the kernel fault-fixup code, which
 * is not part of this file; every user of EX() must therefore define a
 * $exception label.
 *
 * The "x,##y" form re-joins instruction operands that the preprocessor
 * split at commas, so an instruction such as "stq_u $31, 0($16)" can be
 * passed as a single macro invocation.
 */
#define EX(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda $31, $exception-99b($31); 	\
	.previous

4162306a36Sopenharmony_ci	.set noat
4262306a36Sopenharmony_ci	.set noreorder
4362306a36Sopenharmony_ci	.align 4
4462306a36Sopenharmony_ci
4562306a36Sopenharmony_ci	.globl __clear_user
4662306a36Sopenharmony_ci	.ent __clear_user
4762306a36Sopenharmony_ci	.frame	$30, 0, $26
4862306a36Sopenharmony_ci	.prologue 0
4962306a36Sopenharmony_ci
5062306a36Sopenharmony_ci				# Pipeline info : Slotting & Comments
5162306a36Sopenharmony_ci__clear_user:
5262306a36Sopenharmony_ci	and	$17, $17, $0
5362306a36Sopenharmony_ci	and	$16, 7, $4	# .. E  .. ..	: find dest head misalignment
5462306a36Sopenharmony_ci	beq	$0, $zerolength # U  .. .. ..	:  U L U L
5562306a36Sopenharmony_ci
5662306a36Sopenharmony_ci	addq	$0, $4, $1	# .. .. .. E	: bias counter
5762306a36Sopenharmony_ci	and	$1, 7, $2	# .. .. E  ..	: number of misaligned bytes in tail
5862306a36Sopenharmony_ci# Note - we never actually use $2, so this is a moot computation
5962306a36Sopenharmony_ci# and we can rewrite this later...
6062306a36Sopenharmony_ci	srl	$1, 3, $1	# .. E  .. ..	: number of quadwords to clear
6162306a36Sopenharmony_ci	beq	$4, $headalign	# U  .. .. ..	: U L U L
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_ci/*
6462306a36Sopenharmony_ci * Head is not aligned.  Write (8 - $4) bytes to head of destination
6562306a36Sopenharmony_ci * This means $16 is known to be misaligned
6662306a36Sopenharmony_ci */
6762306a36Sopenharmony_ci	EX( ldq_u $5, 0($16) )	# .. .. .. L	: load dst word to mask back in
6862306a36Sopenharmony_ci	beq	$1, $onebyte	# .. .. U  ..	: sub-word store?
6962306a36Sopenharmony_ci	mskql	$5, $16, $5	# .. U  .. ..	: take care of misaligned head
7062306a36Sopenharmony_ci	addq	$16, 8, $16	# E  .. .. .. 	: L U U L
7162306a36Sopenharmony_ci
7262306a36Sopenharmony_ci	EX( stq_u $5, -8($16) )	# .. .. .. L	:
7362306a36Sopenharmony_ci	subq	$1, 1, $1	# .. .. E  ..	:
7462306a36Sopenharmony_ci	addq	$0, $4, $0	# .. E  .. ..	: bytes left -= 8 - misalignment
7562306a36Sopenharmony_ci	subq	$0, 8, $0	# E  .. .. ..	: U L U L
7662306a36Sopenharmony_ci
7762306a36Sopenharmony_ci	.align	4
7862306a36Sopenharmony_ci/*
7962306a36Sopenharmony_ci * (The .align directive ought to be a moot point)
8062306a36Sopenharmony_ci * values upon initial entry to the loop
8162306a36Sopenharmony_ci * $1 is number of quadwords to clear (zero is a valid value)
8262306a36Sopenharmony_ci * $2 is number of trailing bytes (0..7) ($2 never used...)
8362306a36Sopenharmony_ci * $16 is known to be aligned 0mod8
8462306a36Sopenharmony_ci */
8562306a36Sopenharmony_ci$headalign:
8662306a36Sopenharmony_ci	subq	$1, 16, $4	# .. .. .. E	: If < 16, we can not use the huge loop
8762306a36Sopenharmony_ci	and	$16, 0x3f, $2	# .. .. E  ..	: Forward work for huge loop
8862306a36Sopenharmony_ci	subq	$2, 0x40, $3	# .. E  .. ..	: bias counter (huge loop)
8962306a36Sopenharmony_ci	blt	$4, $trailquad	# U  .. .. ..	: U L U L
9062306a36Sopenharmony_ci
9162306a36Sopenharmony_ci/*
9262306a36Sopenharmony_ci * We know that we're going to do at least 16 quads, which means we are
9362306a36Sopenharmony_ci * going to be able to use the large block clear loop at least once.
9462306a36Sopenharmony_ci * Figure out how many quads we need to clear before we are 0mod64 aligned
9562306a36Sopenharmony_ci * so we can use the wh64 instruction.
9662306a36Sopenharmony_ci */
9762306a36Sopenharmony_ci
9862306a36Sopenharmony_ci	nop			# .. .. .. E
9962306a36Sopenharmony_ci	nop			# .. .. E  ..
10062306a36Sopenharmony_ci	nop			# .. E  .. ..
10162306a36Sopenharmony_ci	beq	$3, $bigalign	# U  .. .. ..	: U L U L : Aligned 0mod64
10262306a36Sopenharmony_ci
10362306a36Sopenharmony_ci$alignmod64:
10462306a36Sopenharmony_ci	EX( stq_u $31, 0($16) )	# .. .. .. L
10562306a36Sopenharmony_ci	addq	$3, 8, $3	# .. .. E  ..
10662306a36Sopenharmony_ci	subq	$0, 8, $0	# .. E  .. ..
10762306a36Sopenharmony_ci	nop			# E  .. .. ..	: U L U L
10862306a36Sopenharmony_ci
10962306a36Sopenharmony_ci	nop			# .. .. .. E
11062306a36Sopenharmony_ci	subq	$1, 1, $1	# .. .. E  ..
11162306a36Sopenharmony_ci	addq	$16, 8, $16	# .. E  .. ..
11262306a36Sopenharmony_ci	blt	$3, $alignmod64	# U  .. .. ..	: U L U L
11362306a36Sopenharmony_ci
11462306a36Sopenharmony_ci$bigalign:
11562306a36Sopenharmony_ci/*
11662306a36Sopenharmony_ci * $0 is the number of bytes left
11762306a36Sopenharmony_ci * $1 is the number of quads left
11862306a36Sopenharmony_ci * $16 is aligned 0mod64
11962306a36Sopenharmony_ci * we know that we'll be taking a minimum of one trip through
12062306a36Sopenharmony_ci * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
12162306a36Sopenharmony_ci * We are _not_ going to update $0 after every single store.  That
12262306a36Sopenharmony_ci * would be silly, because there will be cross-cluster dependencies
12362306a36Sopenharmony_ci * no matter how the code is scheduled.  By doing it in slightly
12462306a36Sopenharmony_ci * staggered fashion, we can still do this loop in 5 fetches
12562306a36Sopenharmony_ci * The worse case will be doing two extra quads in some future execution,
12662306a36Sopenharmony_ci * in the event of an interrupted clear.
12762306a36Sopenharmony_ci * Assumes the wh64 needs to be for 2 trips through the loop in the future
12862306a36Sopenharmony_ci * The wh64 is issued on for the starting destination address for trip +2
12962306a36Sopenharmony_ci * through the loop, and if there are less than two trips left, the target
13062306a36Sopenharmony_ci * address will be for the current trip.
13162306a36Sopenharmony_ci */
13262306a36Sopenharmony_ci	nop			# E :
13362306a36Sopenharmony_ci	nop			# E :
13462306a36Sopenharmony_ci	nop			# E :
13562306a36Sopenharmony_ci	bis	$16,$16,$3	# E : U L U L : Initial wh64 address is dest
13662306a36Sopenharmony_ci	/* This might actually help for the current trip... */
13762306a36Sopenharmony_ci
13862306a36Sopenharmony_ci$do_wh64:
13962306a36Sopenharmony_ci	wh64	($3)		# .. .. .. L1	: memory subsystem hint
14062306a36Sopenharmony_ci	subq	$1, 16, $4	# .. .. E  ..	: Forward calculation - repeat the loop?
14162306a36Sopenharmony_ci	EX( stq_u $31, 0($16) )	# .. L  .. ..
14262306a36Sopenharmony_ci	subq	$0, 8, $0	# E  .. .. ..	: U L U L
14362306a36Sopenharmony_ci
14462306a36Sopenharmony_ci	addq	$16, 128, $3	# E : Target address of wh64
14562306a36Sopenharmony_ci	EX( stq_u $31, 8($16) )	# L :
14662306a36Sopenharmony_ci	EX( stq_u $31, 16($16) )	# L :
14762306a36Sopenharmony_ci	subq	$0, 16, $0	# E : U L L U
14862306a36Sopenharmony_ci
14962306a36Sopenharmony_ci	nop			# E :
15062306a36Sopenharmony_ci	EX( stq_u $31, 24($16) )	# L :
15162306a36Sopenharmony_ci	EX( stq_u $31, 32($16) )	# L :
15262306a36Sopenharmony_ci	subq	$0, 168, $5	# E : U L L U : two trips through the loop left?
15362306a36Sopenharmony_ci	/* 168 = 192 - 24, since we've already completed some stores */
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_ci	subq	$0, 16, $0	# E :
15662306a36Sopenharmony_ci	EX( stq_u $31, 40($16) )	# L :
15762306a36Sopenharmony_ci	EX( stq_u $31, 48($16) )	# L :
15862306a36Sopenharmony_ci	cmovlt	$5, $16, $3	# E : U L L U : Latency 2, extra mapping cycle
15962306a36Sopenharmony_ci
16062306a36Sopenharmony_ci	subq	$1, 8, $1	# E :
16162306a36Sopenharmony_ci	subq	$0, 16, $0	# E :
16262306a36Sopenharmony_ci	EX( stq_u $31, 56($16) )	# L :
16362306a36Sopenharmony_ci	nop			# E : U L U L
16462306a36Sopenharmony_ci
16562306a36Sopenharmony_ci	nop			# E :
16662306a36Sopenharmony_ci	subq	$0, 8, $0	# E :
16762306a36Sopenharmony_ci	addq	$16, 64, $16	# E :
16862306a36Sopenharmony_ci	bge	$4, $do_wh64	# U : U L U L
16962306a36Sopenharmony_ci
17062306a36Sopenharmony_ci$trailquad:
17162306a36Sopenharmony_ci	# zero to 16 quadwords left to store, plus any trailing bytes
17262306a36Sopenharmony_ci	# $1 is the number of quadwords left to go.
17362306a36Sopenharmony_ci	#
17462306a36Sopenharmony_ci	nop			# .. .. .. E
17562306a36Sopenharmony_ci	nop			# .. .. E  ..
17662306a36Sopenharmony_ci	nop			# .. E  .. ..
17762306a36Sopenharmony_ci	beq	$1, $trailbytes	# U  .. .. ..	: U L U L : Only 0..7 bytes to go
17862306a36Sopenharmony_ci
17962306a36Sopenharmony_ci$onequad:
18062306a36Sopenharmony_ci	EX( stq_u $31, 0($16) )	# .. .. .. L
18162306a36Sopenharmony_ci	subq	$1, 1, $1	# .. .. E  ..
18262306a36Sopenharmony_ci	subq	$0, 8, $0	# .. E  .. ..
18362306a36Sopenharmony_ci	nop			# E  .. .. ..	: U L U L
18462306a36Sopenharmony_ci
18562306a36Sopenharmony_ci	nop			# .. .. .. E
18662306a36Sopenharmony_ci	nop			# .. .. E  ..
18762306a36Sopenharmony_ci	addq	$16, 8, $16	# .. E  .. ..
18862306a36Sopenharmony_ci	bgt	$1, $onequad	# U  .. .. ..	: U L U L
18962306a36Sopenharmony_ci
19062306a36Sopenharmony_ci	# We have an unknown number of bytes left to go.
19162306a36Sopenharmony_ci$trailbytes:
19262306a36Sopenharmony_ci	nop			# .. .. .. E
19362306a36Sopenharmony_ci	nop			# .. .. E  ..
19462306a36Sopenharmony_ci	nop			# .. E  .. ..
19562306a36Sopenharmony_ci	beq	$0, $zerolength	# U  .. .. ..	: U L U L
19662306a36Sopenharmony_ci
19762306a36Sopenharmony_ci	# $0 contains the number of bytes left to copy (0..31)
19862306a36Sopenharmony_ci	# so we will use $0 as the loop counter
19962306a36Sopenharmony_ci	# We know for a fact that $0 > 0 zero due to previous context
20062306a36Sopenharmony_ci$onebyte:
20162306a36Sopenharmony_ci	EX( stb $31, 0($16) )	# .. .. .. L
20262306a36Sopenharmony_ci	subq	$0, 1, $0	# .. .. E  ..	:
20362306a36Sopenharmony_ci	addq	$16, 1, $16	# .. E  .. ..	:
20462306a36Sopenharmony_ci	bgt	$0, $onebyte	# U  .. .. ..	: U L U L
20562306a36Sopenharmony_ci
20662306a36Sopenharmony_ci$zerolength:
20762306a36Sopenharmony_ci$exception:			# Destination for exception recovery(?)
20862306a36Sopenharmony_ci	nop			# .. .. .. E	:
20962306a36Sopenharmony_ci	nop			# .. .. E  ..	:
21062306a36Sopenharmony_ci	nop			# .. E  .. ..	:
21162306a36Sopenharmony_ci	ret	$31, ($26), 1	# L0 .. .. ..	: L U L U
21262306a36Sopenharmony_ci	.end __clear_user
21362306a36Sopenharmony_ci	EXPORT_SYMBOL(__clear_user)
214