162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * arch/alpha/lib/ev6-copy_user.S
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci * Copy to/from user space, handling exceptions as we go..  This
862306a36Sopenharmony_ci * isn't exactly pretty.
962306a36Sopenharmony_ci *
1062306a36Sopenharmony_ci * This is essentially the same as "memcpy()", but with a few twists.
1162306a36Sopenharmony_ci * Notably, we have to make sure that $0 is always up-to-date and
1262306a36Sopenharmony_ci * contains the right "bytes left to copy" value (and that it is updated
1362306a36Sopenharmony_ci * only _after_ a successful copy). There is also some rather minor
1462306a36Sopenharmony_ci * exception setup stuff..
1562306a36Sopenharmony_ci *
1662306a36Sopenharmony_ci * Much of the information about 21264 scheduling/coding comes from:
1762306a36Sopenharmony_ci *	Compiler Writer's Guide for the Alpha 21264
1862306a36Sopenharmony_ci *	abbreviated as 'CWG' in other comments here
1962306a36Sopenharmony_ci *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
2062306a36Sopenharmony_ci * Scheduling notation:
2162306a36Sopenharmony_ci *	E	- either cluster
2262306a36Sopenharmony_ci *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
2362306a36Sopenharmony_ci *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
2462306a36Sopenharmony_ci */
2562306a36Sopenharmony_ci
2662306a36Sopenharmony_ci#include <linux/export.h>
2762306a36Sopenharmony_ci/*
 * EXI(insn): emit one instruction that is allowed to fault, and register it
 * in the exception table.  In this file it wraps every load from the source
 * pointer ($17).
 *
 * Expansion:
 *   - the instruction itself, tagged with the local label 99;
 *   - a switch to the __ex_table section, where two words are emitted:
 *       .long 99b - .                  offset from this table slot back to
 *                                      the tagged instruction
 *       lda $31, $exitin-99b($31)      an lda whose displacement is the
 *                                      distance from the tagged instruction
 *                                      to the $exitin label
 *   - .previous, returning to the section that was active before.
 *
 * The "x,##y" form lets the wrapped instruction contain commas: everything
 * after the first comma lands in the named variadic argument y and is pasted
 * back behind x.
 *
 * NOTE(review): the code that looks up and interprets these entries is not
 * part of this file; presumably a fault on the tagged instruction resumes
 * execution at $exitin -- confirm against the Alpha exception fixup code.
 */
2862306a36Sopenharmony_ci#define EXI(x,y...)			\
2962306a36Sopenharmony_ci	99: x,##y;			\
3062306a36Sopenharmony_ci	.section __ex_table,"a";	\
3162306a36Sopenharmony_ci	.long 99b - .;			\
3262306a36Sopenharmony_ci	lda $31, $exitin-99b($31);	\
3362306a36Sopenharmony_ci	.previous
3462306a36Sopenharmony_ci
/*
 * EXO(insn): emit one instruction that is allowed to fault, and register it
 * in the exception table.  In this file it wraps every store through the
 * destination pointer ($16).
 *
 * The instruction is tagged with the local label 99; the __ex_table section
 * then receives ".long 99b - ." (offset from the table slot back to the
 * tagged instruction) followed by an lda whose displacement is the distance
 * from the tagged instruction to the $exitout label.  .previous switches back
 * to the section that was active before.  "x,##y" allows the wrapped
 * instruction to contain commas.
 *
 * NOTE(review): the consumer of these entries is not in this file;
 * presumably a fault on the tagged instruction resumes at $exitout -- confirm
 * against the Alpha exception fixup code.
 */
3562306a36Sopenharmony_ci#define EXO(x,y...)			\
3662306a36Sopenharmony_ci	99: x,##y;			\
3762306a36Sopenharmony_ci	.section __ex_table,"a";	\
3862306a36Sopenharmony_ci	.long 99b - .;			\
3962306a36Sopenharmony_ci	lda $31, $exitout-99b($31);	\
4062306a36Sopenharmony_ci	.previous
4162306a36Sopenharmony_ci
4262306a36Sopenharmony_ci	.set noat
4362306a36Sopenharmony_ci	.align 4
/*
 * __copy_user - copy a run of bytes; $0 tracks how many are still uncopied.
 *
 * Register use, as visible in the code below:
 *	$16	- current destination address, advanced as the copy proceeds
 *	$17	- current source address, advanced as the copy proceeds
 *	$18	- byte count on entry; only read, never modified
 *	$0	- bytes left to copy; still live at the final ret
 *	$1-$4	- scratch (data, alignment tests, loop counters)
 *	$26	- return address used by the final ret
 *
 * Flow:
 *	count == 0			-> return immediately
 *	count <= 32			-> plain byte loop ($onebyteloop)
 *	otherwise			-> byte-copy until the destination is
 *					   8-byte aligned ($aligndest), then
 *	  source also 8-byte aligned	-> quadword copy, 4 per trip while
 *					   possible ($unroll4), then one per
 *					   trip ($onequad)
 *	  source not aligned		-> ldq_u/extql/extqh merge loop
 *					   ($misquad)
 *	any remaining tail bytes	-> byte loop
 *
 * Every load from ($17) is wrapped in EXI and every store to ($16) in EXO,
 * so each gets an __ex_table entry naming $exitin / $exitout.  Both labels
 * sit on the final ret sequence, which does not touch $0.
 * NOTE(review): presumably a fault therefore returns the current $0 as the
 * uncopied remainder -- the fixup handler itself is not in this file.
 */
4462306a36Sopenharmony_ci	.globl __copy_user
4562306a36Sopenharmony_ci	.ent __copy_user
4662306a36Sopenharmony_ci				# Pipeline info: Slotting & Comments
4762306a36Sopenharmony_ci__copy_user:
4862306a36Sopenharmony_ci	.prologue 0
4962306a36Sopenharmony_ci	mov $18, $0		# .. .. .. E	: $0 = bytes left to copy
5062306a36Sopenharmony_ci	subq $18, 32, $1	# .. .. E. ..	: Is this going to be a small copy?
5162306a36Sopenharmony_ci	nop			# .. E  .. ..
5262306a36Sopenharmony_ci	beq $18, $zerolength	# U  .. .. ..	: U L U L
5362306a36Sopenharmony_ci
5462306a36Sopenharmony_ci	and $16,7,$3		# .. .. .. E	: is leading dest misalignment
5562306a36Sopenharmony_ci	ble $1, $onebyteloop	# .. .. U  ..	: 1st branch : small amount of data
5662306a36Sopenharmony_ci	beq $3, $destaligned	# .. U  .. ..	: 2nd (one cycle fetcher stall)
5762306a36Sopenharmony_ci	subq $3, 8, $3		# E  .. .. ..	: L U U L : trip counter
5862306a36Sopenharmony_ci/*
5962306a36Sopenharmony_ci * The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U)
6062306a36Sopenharmony_ci * This loop aligns the destination a byte at a time
6162306a36Sopenharmony_ci * We know we have at least one trip through this loop
 * $3 = (dest & 7) - 8 is negative here and is counted up to zero.
6262306a36Sopenharmony_ci */
6362306a36Sopenharmony_ci$aligndest:
6462306a36Sopenharmony_ci	EXI( ldbu $1,0($17) )	# .. .. .. L	: Keep loads separate from stores
6562306a36Sopenharmony_ci	addq $16,1,$16		# .. .. E  ..	: Section 3.8 in the CWG
6662306a36Sopenharmony_ci	addq $3,1,$3		# .. E  .. ..	:
6762306a36Sopenharmony_ci	nop			# E  .. .. ..	: U L U L
6862306a36Sopenharmony_ci
6962306a36Sopenharmony_ci/*
7062306a36Sopenharmony_ci * the -1 is to compensate for the inc($16) done in a previous quadpack
7162306a36Sopenharmony_ci * which allows us zero dependencies within either quadpack in the loop
7262306a36Sopenharmony_ci */
7362306a36Sopenharmony_ci	EXO( stb $1,-1($16) )	# .. .. .. L	:
7462306a36Sopenharmony_ci	addq $17,1,$17		# .. .. E  ..	: Section 3.8 in the CWG
7562306a36Sopenharmony_ci	subq $0,1,$0		# .. E  .. ..	:
7662306a36Sopenharmony_ci	bne $3, $aligndest	# U  .. .. ..	: U L U L
7762306a36Sopenharmony_ci
7862306a36Sopenharmony_ci/*
7962306a36Sopenharmony_ci * If we fell through into here, we have a minimum of 33 - 7 bytes
8062306a36Sopenharmony_ci * If we arrived via branch, we have a minimum of 32 bytes
8162306a36Sopenharmony_ci */
8262306a36Sopenharmony_ci$destaligned:
8362306a36Sopenharmony_ci	and $17,7,$1		# .. .. .. E	: Check _current_ source alignment
8462306a36Sopenharmony_ci	bic $0,7,$4		# .. .. E  ..	: number bytes as a quadword loop
8562306a36Sopenharmony_ci	EXI( ldq_u $3,0($17) )	# .. L  .. ..	: Forward fetch for fallthrough code
8662306a36Sopenharmony_ci	beq $1,$quadaligned	# U  .. .. ..	: U L U L
8762306a36Sopenharmony_ci
8862306a36Sopenharmony_ci/*
8962306a36Sopenharmony_ci * In the worst case, we've just executed an ldq_u here from 0($17)
9062306a36Sopenharmony_ci * and we'll repeat it once if we take the branch
9162306a36Sopenharmony_ci */
9262306a36Sopenharmony_ci
9362306a36Sopenharmony_ci/* Misaligned quadword loop - not unrolled.  Leave it that way. */
/*
 * On each trip $3 holds the source quadword fetched earlier and $2 receives
 * the following one.  extql/extqh select the two partial pieces according to
 * the low bits of $17, bis merges them into one destination quadword, and $2
 * is then carried into $3 for the next trip.  $4 counts the quadword bytes
 * still to do; $0 is decremented only after the store.
 */
9462306a36Sopenharmony_ci$misquad:
9562306a36Sopenharmony_ci	EXI( ldq_u $2,8($17) )	# .. .. .. L	:
9662306a36Sopenharmony_ci	subq $4,8,$4		# .. .. E  ..	:
9762306a36Sopenharmony_ci	extql $3,$17,$3		# .. U  .. ..	:
9862306a36Sopenharmony_ci	extqh $2,$17,$1		# U  .. .. ..	: U U L L
9962306a36Sopenharmony_ci
10062306a36Sopenharmony_ci	bis $3,$1,$1		# .. .. .. E	:
10162306a36Sopenharmony_ci	EXO( stq $1,0($16) )	# .. .. L  ..	:
10262306a36Sopenharmony_ci	addq $17,8,$17		# .. E  .. ..	:
10362306a36Sopenharmony_ci	subq $0,8,$0		# E  .. .. ..	: U L L U
10462306a36Sopenharmony_ci
10562306a36Sopenharmony_ci	addq $16,8,$16		# .. .. .. E	:
10662306a36Sopenharmony_ci	bis $2,$2,$3		# .. .. E  ..	: keep lookahead quadword for next trip
10762306a36Sopenharmony_ci	nop			# .. E  .. ..	:
10862306a36Sopenharmony_ci	bne $4,$misquad		# U  .. .. ..	: U L U L
10962306a36Sopenharmony_ci
11062306a36Sopenharmony_ci	nop			# .. .. .. E
11162306a36Sopenharmony_ci	nop			# .. .. E  ..
11262306a36Sopenharmony_ci	nop			# .. E  .. ..
11362306a36Sopenharmony_ci	beq $0,$zerolength	# U  .. .. ..	: U L U L
11462306a36Sopenharmony_ci
11562306a36Sopenharmony_ci/* We know we have at least one trip through the byte loop */
11662306a36Sopenharmony_ci	EXI ( ldbu $2,0($17) )	# .. .. .. L	: No loads in the same quad
11762306a36Sopenharmony_ci	addq $16,1,$16		# .. .. E  ..	: as the store (Section 3.8 in CWG)
11862306a36Sopenharmony_ci	nop			# .. E  .. ..	:
11962306a36Sopenharmony_ci	br $31, $dirtyentry	# L0 .. .. ..	: L U U L
12062306a36Sopenharmony_ci/* Do the trailing byte loop load, then hop into the store part of the loop */
12162306a36Sopenharmony_ci
12262306a36Sopenharmony_ci/*
12362306a36Sopenharmony_ci * A minimum of (33 - 7) bytes to do a quad at a time.
12462306a36Sopenharmony_ci * Based upon the usage context, it's worth the effort to unroll this loop
12562306a36Sopenharmony_ci * $0 - number of bytes to be moved
12662306a36Sopenharmony_ci * $4 - number of bytes to move as quadwords
12762306a36Sopenharmony_ci * $16 is current destination address
12862306a36Sopenharmony_ci * $17 is current source address
12962306a36Sopenharmony_ci */
13062306a36Sopenharmony_ci$quadaligned:
13162306a36Sopenharmony_ci	subq	$4, 32, $2	# .. .. .. E	: do not unroll for small stuff
13262306a36Sopenharmony_ci	nop			# .. .. E  ..
13362306a36Sopenharmony_ci	nop			# .. E  .. ..
13462306a36Sopenharmony_ci	blt	$2, $onequad	# U  .. .. ..	: U L U L
13562306a36Sopenharmony_ci
13662306a36Sopenharmony_ci/*
13762306a36Sopenharmony_ci * There is a significant assumption here that the source and destination
13862306a36Sopenharmony_ci * addresses differ by more than 32 bytes.  In this particular case, a
13962306a36Sopenharmony_ci * sparsity of registers further bounds this to be a minimum of 8 bytes.
14062306a36Sopenharmony_ci * But if this isn't met, then the output result will be incorrect.
14162306a36Sopenharmony_ci * Furthermore, due to a lack of available registers, we really can't
14262306a36Sopenharmony_ci * unroll this to be an 8x loop (which would enable us to use the wh64
14362306a36Sopenharmony_ci * instruction memory hint instruction).
 *
 * Each trip moves 32 bytes as two load-pair/store-pair halves of 16 bytes;
 * $0 is decremented by 16 after each pair of stores.  $3 = $4 - 32 (taken
 * after $4 was already reduced for this trip) decides whether to loop.
14462306a36Sopenharmony_ci */
14562306a36Sopenharmony_ci$unroll4:
14662306a36Sopenharmony_ci	EXI( ldq $1,0($17) )	# .. .. .. L
14762306a36Sopenharmony_ci	EXI( ldq $2,8($17) )	# .. .. L  ..
14862306a36Sopenharmony_ci	subq	$4,32,$4	# .. E  .. ..
14962306a36Sopenharmony_ci	nop			# E  .. .. ..	: U U L L
15062306a36Sopenharmony_ci
15162306a36Sopenharmony_ci	addq	$17,16,$17	# .. .. .. E
15262306a36Sopenharmony_ci	EXO( stq $1,0($16) )	# .. .. L  ..
15362306a36Sopenharmony_ci	EXO( stq $2,8($16) )	# .. L  .. ..
15462306a36Sopenharmony_ci	subq	$0,16,$0	# E  .. .. ..	: U L L U
15562306a36Sopenharmony_ci
15662306a36Sopenharmony_ci	addq	$16,16,$16	# .. .. .. E
15762306a36Sopenharmony_ci	EXI( ldq $1,0($17) )	# .. .. L  ..
15862306a36Sopenharmony_ci	EXI( ldq $2,8($17) )	# .. L  .. ..
15962306a36Sopenharmony_ci	subq	$4, 32, $3	# E  .. .. ..	: U U L L : is there enough for another trip?
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci	EXO( stq $1,0($16) )	# .. .. .. L
16262306a36Sopenharmony_ci	EXO( stq $2,8($16) )	# .. .. L  ..
16362306a36Sopenharmony_ci	subq	$0,16,$0	# .. E  .. ..
16462306a36Sopenharmony_ci	addq	$17,16,$17	# E  .. .. ..	: U L L U
16562306a36Sopenharmony_ci
16662306a36Sopenharmony_ci	nop			# .. .. .. E
16762306a36Sopenharmony_ci	nop			# .. .. E  ..
16862306a36Sopenharmony_ci	addq	$16,16,$16	# .. E  .. ..
16962306a36Sopenharmony_ci	bgt	$3,$unroll4	# U  .. .. ..	: U L U L
17062306a36Sopenharmony_ci
/*
 * Out of the unrolled loop: $4 holds the quadword bytes still pending.
 * Skip the one-quadword loop when none remain.
 */
17162306a36Sopenharmony_ci	nop
17262306a36Sopenharmony_ci	nop
17362306a36Sopenharmony_ci	nop
17462306a36Sopenharmony_ci	beq	$4, $noquads
17562306a36Sopenharmony_ci
/*
 * Copy one aligned quadword per trip until $4 reaches zero.  $0 is
 * decremented only after the store has been issued.
 */
17662306a36Sopenharmony_ci$onequad:
17762306a36Sopenharmony_ci	EXI( ldq $1,0($17) )
17862306a36Sopenharmony_ci	subq	$4,8,$4
17962306a36Sopenharmony_ci	addq	$17,8,$17
18062306a36Sopenharmony_ci	nop
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_ci	EXO( stq $1,0($16) )
18362306a36Sopenharmony_ci	subq	$0,8,$0
18462306a36Sopenharmony_ci	addq	$16,8,$16
18562306a36Sopenharmony_ci	bne	$4,$onequad
18662306a36Sopenharmony_ci
/*
 * All whole quadwords are done.  Return if nothing is left, otherwise fall
 * through into the byte loop for the tail.
 */
18762306a36Sopenharmony_ci$noquads:
18862306a36Sopenharmony_ci	nop
18962306a36Sopenharmony_ci	nop
19062306a36Sopenharmony_ci	nop
19162306a36Sopenharmony_ci	beq $0,$zerolength
19262306a36Sopenharmony_ci
19362306a36Sopenharmony_ci/*
19462306a36Sopenharmony_ci * For small copies (or the tail of a larger copy), do a very simple byte loop.
19562306a36Sopenharmony_ci * There's no point in doing a lot of complex alignment calculations to try to
19662306a36Sopenharmony_ci * do quadword stuff for a small amount of data.
19762306a36Sopenharmony_ci *	$0 - remaining number of bytes left to copy
19862306a36Sopenharmony_ci *	$16 - current dest addr
19962306a36Sopenharmony_ci *	$17 - current source addr
20062306a36Sopenharmony_ci */
20162306a36Sopenharmony_ci
20262306a36Sopenharmony_ci$onebyteloop:
20362306a36Sopenharmony_ci	EXI ( ldbu $2,0($17) )	# .. .. .. L	: No loads in the same quad
20462306a36Sopenharmony_ci	addq $16,1,$16		# .. .. E  ..	: as the store (Section 3.8 in CWG)
20562306a36Sopenharmony_ci	nop			# .. E  .. ..	:
20662306a36Sopenharmony_ci	nop			# E  .. .. ..	: U L U L
20762306a36Sopenharmony_ci
20862306a36Sopenharmony_ci$dirtyentry:
20962306a36Sopenharmony_ci/*
21062306a36Sopenharmony_ci * the -1 is to compensate for the inc($16) done in a previous quadpack
21162306a36Sopenharmony_ci * which allows us zero dependencies within either quadpack in the loop
 * ($dirtyentry is also reached from the end of the misaligned loop, which
 * has already loaded $2 and incremented $16.)
21262306a36Sopenharmony_ci */
21362306a36Sopenharmony_ci	EXO ( stb $2,-1($16) )	# .. .. .. L	:
21462306a36Sopenharmony_ci	addq $17,1,$17		# .. .. E  ..	: advance source pointer
21562306a36Sopenharmony_ci	subq $0,1,$0		# .. E  .. ..	: change count _after_ copy
21662306a36Sopenharmony_ci	bgt $0,$onebyteloop	# U  .. .. ..	: U L U L
21762306a36Sopenharmony_ci
21862306a36Sopenharmony_ci$zerolength:
21962306a36Sopenharmony_ci$exitin:
22062306a36Sopenharmony_ci$exitout:			# Labels named by the EXI/EXO __ex_table entries
22162306a36Sopenharmony_ci	nop			# .. .. .. E
22262306a36Sopenharmony_ci	nop			# .. .. E  ..
22362306a36Sopenharmony_ci	nop			# .. E  .. ..
22462306a36Sopenharmony_ci	ret $31,($26),1		# L0 .. .. ..	: L U L U
22562306a36Sopenharmony_ci
22662306a36Sopenharmony_ci	.end __copy_user
22762306a36Sopenharmony_ci	EXPORT_SYMBOL(__copy_user)
228