/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-clear_user.S
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Zero user space, handling exceptions as we go.
 *
 * We have to make sure that $0 is always up-to-date and contains the
 * right "bytes left to zero" value (and that it is updated only _after_
 * a successful copy).  There is also some rather minor exception setup
 * stuff.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 * Determining actual stalls (other than slotting) doesn't appear to be easy to do.
 * From perusing the source code context where this routine is called, it is
 * a fair assumption that significant fractions of entire pages are zeroed, so
 * it's going to be worth the effort to hand-unroll a big loop, and use wh64.
 * ASSUMPTION:
 *	The believed purpose of only updating $0 after a store is that a signal
 *	may come along during the execution of this chunk of code, and we don't
 *	want to leave a hole (and we also want to avoid repeating lots of work)
 */

#include <asm/export.h>
/*
 * EX(insn): emit one memory access that is allowed to fault.
 *
 * Allow an exception for an insn; exit if we get one.
 *
 * The instruction is placed at local label 99, and a two-word record is
 * appended to the __ex_table section:
 *   - a 32-bit offset from the record to the instruction (99b - .), and
 *   - an "lda" word whose displacement is ($exception - 99b), i.e. the
 *     distance from the instruction to the $exception label.
 * Both register fields of that lda are $31.
 *
 * NOTE(review): presumably the fault handler decodes the lda word to find
 * the resume address and which registers (none here) to patch -- confirm
 * against the exception-table definition for this architecture.
 *
 * Any file using EX() must define a $exception label.
 */
#define EX(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda $31, $exception-99b($31); 	\
	.previous

418c2ecf20Sopenharmony_ci	.set noat
428c2ecf20Sopenharmony_ci	.set noreorder
438c2ecf20Sopenharmony_ci	.align 4
448c2ecf20Sopenharmony_ci
458c2ecf20Sopenharmony_ci	.globl __clear_user
468c2ecf20Sopenharmony_ci	.ent __clear_user
478c2ecf20Sopenharmony_ci	.frame	$30, 0, $26
488c2ecf20Sopenharmony_ci	.prologue 0
498c2ecf20Sopenharmony_ci
508c2ecf20Sopenharmony_ci				# Pipeline info : Slotting & Comments
518c2ecf20Sopenharmony_ci__clear_user:
528c2ecf20Sopenharmony_ci	and	$17, $17, $0
538c2ecf20Sopenharmony_ci	and	$16, 7, $4	# .. E  .. ..	: find dest head misalignment
548c2ecf20Sopenharmony_ci	beq	$0, $zerolength # U  .. .. ..	:  U L U L
558c2ecf20Sopenharmony_ci
568c2ecf20Sopenharmony_ci	addq	$0, $4, $1	# .. .. .. E	: bias counter
578c2ecf20Sopenharmony_ci	and	$1, 7, $2	# .. .. E  ..	: number of misaligned bytes in tail
588c2ecf20Sopenharmony_ci# Note - we never actually use $2, so this is a moot computation
598c2ecf20Sopenharmony_ci# and we can rewrite this later...
608c2ecf20Sopenharmony_ci	srl	$1, 3, $1	# .. E  .. ..	: number of quadwords to clear
618c2ecf20Sopenharmony_ci	beq	$4, $headalign	# U  .. .. ..	: U L U L
628c2ecf20Sopenharmony_ci
638c2ecf20Sopenharmony_ci/*
648c2ecf20Sopenharmony_ci * Head is not aligned.  Write (8 - $4) bytes to head of destination
658c2ecf20Sopenharmony_ci * This means $16 is known to be misaligned
668c2ecf20Sopenharmony_ci */
678c2ecf20Sopenharmony_ci	EX( ldq_u $5, 0($16) )	# .. .. .. L	: load dst word to mask back in
688c2ecf20Sopenharmony_ci	beq	$1, $onebyte	# .. .. U  ..	: sub-word store?
698c2ecf20Sopenharmony_ci	mskql	$5, $16, $5	# .. U  .. ..	: take care of misaligned head
708c2ecf20Sopenharmony_ci	addq	$16, 8, $16	# E  .. .. .. 	: L U U L
718c2ecf20Sopenharmony_ci
728c2ecf20Sopenharmony_ci	EX( stq_u $5, -8($16) )	# .. .. .. L	:
738c2ecf20Sopenharmony_ci	subq	$1, 1, $1	# .. .. E  ..	:
748c2ecf20Sopenharmony_ci	addq	$0, $4, $0	# .. E  .. ..	: bytes left -= 8 - misalignment
758c2ecf20Sopenharmony_ci	subq	$0, 8, $0	# E  .. .. ..	: U L U L
768c2ecf20Sopenharmony_ci
778c2ecf20Sopenharmony_ci	.align	4
788c2ecf20Sopenharmony_ci/*
798c2ecf20Sopenharmony_ci * (The .align directive ought to be a moot point)
808c2ecf20Sopenharmony_ci * values upon initial entry to the loop
818c2ecf20Sopenharmony_ci * $1 is number of quadwords to clear (zero is a valid value)
828c2ecf20Sopenharmony_ci * $2 is number of trailing bytes (0..7) ($2 never used...)
838c2ecf20Sopenharmony_ci * $16 is known to be aligned 0mod8
848c2ecf20Sopenharmony_ci */
858c2ecf20Sopenharmony_ci$headalign:
868c2ecf20Sopenharmony_ci	subq	$1, 16, $4	# .. .. .. E	: If < 16, we can not use the huge loop
878c2ecf20Sopenharmony_ci	and	$16, 0x3f, $2	# .. .. E  ..	: Forward work for huge loop
888c2ecf20Sopenharmony_ci	subq	$2, 0x40, $3	# .. E  .. ..	: bias counter (huge loop)
898c2ecf20Sopenharmony_ci	blt	$4, $trailquad	# U  .. .. ..	: U L U L
908c2ecf20Sopenharmony_ci
918c2ecf20Sopenharmony_ci/*
928c2ecf20Sopenharmony_ci * We know that we're going to do at least 16 quads, which means we are
938c2ecf20Sopenharmony_ci * going to be able to use the large block clear loop at least once.
948c2ecf20Sopenharmony_ci * Figure out how many quads we need to clear before we are 0mod64 aligned
958c2ecf20Sopenharmony_ci * so we can use the wh64 instruction.
968c2ecf20Sopenharmony_ci */
978c2ecf20Sopenharmony_ci
988c2ecf20Sopenharmony_ci	nop			# .. .. .. E
998c2ecf20Sopenharmony_ci	nop			# .. .. E  ..
1008c2ecf20Sopenharmony_ci	nop			# .. E  .. ..
1018c2ecf20Sopenharmony_ci	beq	$3, $bigalign	# U  .. .. ..	: U L U L : Aligned 0mod64
1028c2ecf20Sopenharmony_ci
1038c2ecf20Sopenharmony_ci$alignmod64:
1048c2ecf20Sopenharmony_ci	EX( stq_u $31, 0($16) )	# .. .. .. L
1058c2ecf20Sopenharmony_ci	addq	$3, 8, $3	# .. .. E  ..
1068c2ecf20Sopenharmony_ci	subq	$0, 8, $0	# .. E  .. ..
1078c2ecf20Sopenharmony_ci	nop			# E  .. .. ..	: U L U L
1088c2ecf20Sopenharmony_ci
1098c2ecf20Sopenharmony_ci	nop			# .. .. .. E
1108c2ecf20Sopenharmony_ci	subq	$1, 1, $1	# .. .. E  ..
1118c2ecf20Sopenharmony_ci	addq	$16, 8, $16	# .. E  .. ..
1128c2ecf20Sopenharmony_ci	blt	$3, $alignmod64	# U  .. .. ..	: U L U L
1138c2ecf20Sopenharmony_ci
1148c2ecf20Sopenharmony_ci$bigalign:
1158c2ecf20Sopenharmony_ci/*
1168c2ecf20Sopenharmony_ci * $0 is the number of bytes left
1178c2ecf20Sopenharmony_ci * $1 is the number of quads left
1188c2ecf20Sopenharmony_ci * $16 is aligned 0mod64
1198c2ecf20Sopenharmony_ci * we know that we'll be taking a minimum of one trip through
1208c2ecf20Sopenharmony_ci * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
1218c2ecf20Sopenharmony_ci * We are _not_ going to update $0 after every single store.  That
1228c2ecf20Sopenharmony_ci * would be silly, because there will be cross-cluster dependencies
1238c2ecf20Sopenharmony_ci * no matter how the code is scheduled.  By doing it in slightly
1248c2ecf20Sopenharmony_ci * staggered fashion, we can still do this loop in 5 fetches
1258c2ecf20Sopenharmony_ci * The worse case will be doing two extra quads in some future execution,
1268c2ecf20Sopenharmony_ci * in the event of an interrupted clear.
1278c2ecf20Sopenharmony_ci * Assumes the wh64 needs to be for 2 trips through the loop in the future
1288c2ecf20Sopenharmony_ci * The wh64 is issued on for the starting destination address for trip +2
1298c2ecf20Sopenharmony_ci * through the loop, and if there are less than two trips left, the target
1308c2ecf20Sopenharmony_ci * address will be for the current trip.
1318c2ecf20Sopenharmony_ci */
1328c2ecf20Sopenharmony_ci	nop			# E :
1338c2ecf20Sopenharmony_ci	nop			# E :
1348c2ecf20Sopenharmony_ci	nop			# E :
1358c2ecf20Sopenharmony_ci	bis	$16,$16,$3	# E : U L U L : Initial wh64 address is dest
1368c2ecf20Sopenharmony_ci	/* This might actually help for the current trip... */
1378c2ecf20Sopenharmony_ci
1388c2ecf20Sopenharmony_ci$do_wh64:
1398c2ecf20Sopenharmony_ci	wh64	($3)		# .. .. .. L1	: memory subsystem hint
1408c2ecf20Sopenharmony_ci	subq	$1, 16, $4	# .. .. E  ..	: Forward calculation - repeat the loop?
1418c2ecf20Sopenharmony_ci	EX( stq_u $31, 0($16) )	# .. L  .. ..
1428c2ecf20Sopenharmony_ci	subq	$0, 8, $0	# E  .. .. ..	: U L U L
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_ci	addq	$16, 128, $3	# E : Target address of wh64
1458c2ecf20Sopenharmony_ci	EX( stq_u $31, 8($16) )	# L :
1468c2ecf20Sopenharmony_ci	EX( stq_u $31, 16($16) )	# L :
1478c2ecf20Sopenharmony_ci	subq	$0, 16, $0	# E : U L L U
1488c2ecf20Sopenharmony_ci
1498c2ecf20Sopenharmony_ci	nop			# E :
1508c2ecf20Sopenharmony_ci	EX( stq_u $31, 24($16) )	# L :
1518c2ecf20Sopenharmony_ci	EX( stq_u $31, 32($16) )	# L :
1528c2ecf20Sopenharmony_ci	subq	$0, 168, $5	# E : U L L U : two trips through the loop left?
1538c2ecf20Sopenharmony_ci	/* 168 = 192 - 24, since we've already completed some stores */
1548c2ecf20Sopenharmony_ci
1558c2ecf20Sopenharmony_ci	subq	$0, 16, $0	# E :
1568c2ecf20Sopenharmony_ci	EX( stq_u $31, 40($16) )	# L :
1578c2ecf20Sopenharmony_ci	EX( stq_u $31, 48($16) )	# L :
1588c2ecf20Sopenharmony_ci	cmovlt	$5, $16, $3	# E : U L L U : Latency 2, extra mapping cycle
1598c2ecf20Sopenharmony_ci
1608c2ecf20Sopenharmony_ci	subq	$1, 8, $1	# E :
1618c2ecf20Sopenharmony_ci	subq	$0, 16, $0	# E :
1628c2ecf20Sopenharmony_ci	EX( stq_u $31, 56($16) )	# L :
1638c2ecf20Sopenharmony_ci	nop			# E : U L U L
1648c2ecf20Sopenharmony_ci
1658c2ecf20Sopenharmony_ci	nop			# E :
1668c2ecf20Sopenharmony_ci	subq	$0, 8, $0	# E :
1678c2ecf20Sopenharmony_ci	addq	$16, 64, $16	# E :
1688c2ecf20Sopenharmony_ci	bge	$4, $do_wh64	# U : U L U L
1698c2ecf20Sopenharmony_ci
1708c2ecf20Sopenharmony_ci$trailquad:
1718c2ecf20Sopenharmony_ci	# zero to 16 quadwords left to store, plus any trailing bytes
1728c2ecf20Sopenharmony_ci	# $1 is the number of quadwords left to go.
1738c2ecf20Sopenharmony_ci	#
1748c2ecf20Sopenharmony_ci	nop			# .. .. .. E
1758c2ecf20Sopenharmony_ci	nop			# .. .. E  ..
1768c2ecf20Sopenharmony_ci	nop			# .. E  .. ..
1778c2ecf20Sopenharmony_ci	beq	$1, $trailbytes	# U  .. .. ..	: U L U L : Only 0..7 bytes to go
1788c2ecf20Sopenharmony_ci
1798c2ecf20Sopenharmony_ci$onequad:
1808c2ecf20Sopenharmony_ci	EX( stq_u $31, 0($16) )	# .. .. .. L
1818c2ecf20Sopenharmony_ci	subq	$1, 1, $1	# .. .. E  ..
1828c2ecf20Sopenharmony_ci	subq	$0, 8, $0	# .. E  .. ..
1838c2ecf20Sopenharmony_ci	nop			# E  .. .. ..	: U L U L
1848c2ecf20Sopenharmony_ci
1858c2ecf20Sopenharmony_ci	nop			# .. .. .. E
1868c2ecf20Sopenharmony_ci	nop			# .. .. E  ..
1878c2ecf20Sopenharmony_ci	addq	$16, 8, $16	# .. E  .. ..
1888c2ecf20Sopenharmony_ci	bgt	$1, $onequad	# U  .. .. ..	: U L U L
1898c2ecf20Sopenharmony_ci
1908c2ecf20Sopenharmony_ci	# We have an unknown number of bytes left to go.
1918c2ecf20Sopenharmony_ci$trailbytes:
1928c2ecf20Sopenharmony_ci	nop			# .. .. .. E
1938c2ecf20Sopenharmony_ci	nop			# .. .. E  ..
1948c2ecf20Sopenharmony_ci	nop			# .. E  .. ..
1958c2ecf20Sopenharmony_ci	beq	$0, $zerolength	# U  .. .. ..	: U L U L
1968c2ecf20Sopenharmony_ci
1978c2ecf20Sopenharmony_ci	# $0 contains the number of bytes left to copy (0..31)
1988c2ecf20Sopenharmony_ci	# so we will use $0 as the loop counter
1998c2ecf20Sopenharmony_ci	# We know for a fact that $0 > 0 zero due to previous context
2008c2ecf20Sopenharmony_ci$onebyte:
2018c2ecf20Sopenharmony_ci	EX( stb $31, 0($16) )	# .. .. .. L
2028c2ecf20Sopenharmony_ci	subq	$0, 1, $0	# .. .. E  ..	:
2038c2ecf20Sopenharmony_ci	addq	$16, 1, $16	# .. E  .. ..	:
2048c2ecf20Sopenharmony_ci	bgt	$0, $onebyte	# U  .. .. ..	: U L U L
2058c2ecf20Sopenharmony_ci
2068c2ecf20Sopenharmony_ci$zerolength:
2078c2ecf20Sopenharmony_ci$exception:			# Destination for exception recovery(?)
2088c2ecf20Sopenharmony_ci	nop			# .. .. .. E	:
2098c2ecf20Sopenharmony_ci	nop			# .. .. E  ..	:
2108c2ecf20Sopenharmony_ci	nop			# .. E  .. ..	:
2118c2ecf20Sopenharmony_ci	ret	$31, ($26), 1	# L0 .. .. ..	: L U L U
2128c2ecf20Sopenharmony_ci	.end __clear_user
2138c2ecf20Sopenharmony_ci	EXPORT_SYMBOL(__clear_user)
214