1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * arch/alpha/lib/ev6-divide.S
4  *
5  * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
6  *
7  * Alpha division..
8  */
9 
10 /*
11  * The alpha chip doesn't provide hardware division, so we have to do it
12  * by hand.  The compiler expects the functions
13  *
14  *	__divqu: 64-bit unsigned long divide
15  *	__remqu: 64-bit unsigned long remainder
16  *	__divq/__remq: signed 64-bit
17  *	__divlu/__remlu: unsigned 32-bit
18  *	__divl/__reml: signed 32-bit
19  *
20  * These are not normal C functions: instead of the normal
21  * calling sequence, these expect their arguments in registers
22  * $24 and $25, and return the result in $27. Register $28 may
23  * be clobbered (assembly temporary), anything else must be saved.
24  *
25  * In short: painful.
26  *
27  * This is a rather simple bit-at-a-time algorithm: it's very good
28  * at dividing random 64-bit numbers, but the more usual case where
29  * the divisor is small is handled better by the DEC algorithm
30  * using lookup tables. This uses much less memory, though, and is
31  * nicer on the cache.. Besides, I don't know the copyright status
32  * of the DEC code.
33  */
34 
35 /*
36  * My temporaries:
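37  *	$0 - current bit
38  *	$1 - shifted divisor
39  *	$2 - modulus (divide build) or quotient (remainder build)
 *	$3 - scratch (tmp1)
 *	$4 - scratch (tmp2, divide build only)
40  *
41  *	$23 - return address
42  *	$24 - dividend
43  *	$25 - divisor
44  *
45  *	$27 - result: quotient (divide build) or modulus (remainder build)
46  *	$28 - compare status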
47  *
48  * Much of the information about 21264 scheduling/coding comes from:
49  *	Compiler Writer's Guide for the Alpha 21264
50  *	abbreviated as 'CWG' in other comments here
51  *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
52  * Scheduling notation:
53  *	E	- either cluster
54  *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
55  *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
56  * Try not to change the actual algorithm if possible for consistency.
57  */
58 
59 #include <linux/export.h>
60 #define halt .long 0
61 
62 /*
63  * Select function type and registers
64  */
/*
 * Register roles used by the division loops.  $0, $1 and $3 are saved
 * and restored by ufunction in every build; $4 (tmp2) is only used, and
 * only saved, in the divide build.  $28 is not saved.
 */
65 #define mask	$0
66 #define divisor	$1
67 #define compare $28
68 #define tmp1	$3
69 #define tmp2	$4
70 
/*
 * DIV selects the divide flavour of this file; without it the remainder
 * flavour is built.  DIV is not defined in this file (presumably it is
 * supplied on the command line by the build -- confirm against the
 * Makefile).  Whichever value is the function result is kept in $27,
 * the other one in the saved scratch register $2.
 */
71 #ifdef DIV
/* DIV_ONLY() emits its arguments as an instruction, MOD_ONLY() emits nothing. */
72 #define DIV_ONLY(x,y...) x,##y
73 #define MOD_ONLY(x,y...)
/* Entry point names become __div<suffix>. */
74 #define func(x) __div##x
75 #define modulus $2
76 #define quotient $27
/* Sign source for the result: bit 63 of x is set when the operand signs differ. */
77 #define GETSIGN(x) xor $24,$25,x
/*
 * Frame size.  The divide build stores five quadwords at 0..32($30);
 * 48 is presumably that 40 bytes rounded up to a multiple of 16.
 */
78 #define STACK 48
79 #else
/* Mirror image: MOD_ONLY() emits its arguments, DIV_ONLY() emits nothing. */
80 #define DIV_ONLY(x,y...)
81 #define MOD_ONLY(x,y...) x,##y
/* Entry point names become __rem<suffix>. */
82 #define func(x) __rem##x
83 #define modulus $27
84 #define quotient $2
/* Sign source for the result: a copy of the dividend ($24). */
85 #define GETSIGN(x) bis $24,$24,x
/* Frame size: four quadwords are stored at 0..24($30). */
86 #define STACK 32
87 #endif
88 
89 /*
90  * For 32-bit operations, we need to extend to 64-bit
91  */
/*
 * INTSIZE selects the 32-bit entry points (suffixes "lu" and "l");
 * otherwise the 64-bit ones (suffixes "qu" and "q") are built.  Like
 * DIV it is not defined in this file.
 *
 * LONGIFY(x):  zapnot with byte mask 15 keeps the low four bytes of x
 *              and clears the rest, i.e. zero-extends a 32-bit value.
 * SLONGIFY(x): addl x,0,x sign-extends the low 32 bits of x.
 * Both expand to nothing in the 64-bit build.
 */
92 #ifdef INTSIZE
93 #define ufunction func(lu)
94 #define sfunction func(l)
95 #define LONGIFY(x) zapnot x,15,x
96 #define SLONGIFY(x) addl x,0,x
97 #else
98 #define ufunction func(qu)
99 #define sfunction func(q)
100 #define LONGIFY(x)
101 #define SLONGIFY(x)
102 #endif
103 
/*
 * ufunction - unsigned divide (DIV build) or remainder (otherwise).
 *
 * In:   $24 = dividend, $25 = divisor, $23 = return address.
 * Out:  $27 = quotient (DIV build) or remainder.
 *
 * $0, $1, $2, $3 (and $4 in the DIV build) are saved in a STACK-byte
 * frame and restored before returning; $28 is used as the compare
 * flag and is not restored.  $24 and $25 are not written here.
 *
 * A zero divisor skips both loops, so the result register keeps its
 * initial value: 0 in the DIV build, the (LONGIFY'd) dividend in the
 * remainder build.
 */
104 .set noat
105 .align	4
106 .globl	ufunction
107 .ent	ufunction
108 ufunction:
109 	subq	$30,STACK,$30		# E :
110 	.frame	$30,STACK,$23
111 	.prologue 0
112 
	/*
	 * Label 7 is also branched to from the top of sfunction, which
	 * has already allocated a frame of the same STACK size.
	 *
	 * Save the scratch registers while setting up:
	 *   divisor = $25, modulus = $24, quotient = 0, mask = 1
	 * (divisor and modulus are zero-extended in the 32-bit build).
	 */
113 7:	stq	$1, 0($30)		# L :
114 	bis	$25,$25,divisor		# E :
115 	stq	$2, 8($30)		# L : L U L U
116 
117 	bis	$24,$24,modulus		# E :
118 	stq	$0,16($30)		# L :
119 	bis	$31,$31,quotient	# E :
120 	LONGIFY(divisor)		# E : U L L U
121 
122 	stq	tmp1,24($30)		# L :
123 	LONGIFY(modulus)		# E :
124 	bis	$31,1,mask		# E :
125 	DIV_ONLY(stq tmp2,32($30))	# L : L U U L
126 
127 	beq	divisor, 9f			/* div by zero */
128 	/*
129 	 * In spite of the DIV_ONLY being either a non-instruction
130 	 * or an actual stq, the addition of the .align directive
131 	 * below ensures that label 1 is going to be nicely aligned
132 	 */
133 
134 	.align	4
	/*
	 * Loop 1: scale divisor and mask up together.  The comparison is
	 * taken before the shift, so the loop exits with divisor already
	 * shifted one step past the first value that was not below modulus.
	 */
135 #ifdef INTSIZE
136 	/*
137 	 * shift divisor left, using 3-bit shifts for
138 	 * 32-bit divides as we can't overflow. Three-bit
139 	 * shifts will result in looping three times less
140 	 * here, but can result in two loops more later.
141 	 * Thus using a large shift isn't worth it (and
142 	 * s8add pairs better than a sll..)
143 	 */
144 1:	cmpult	divisor,modulus,compare	# E :
145 	s8addq	divisor,$31,divisor	# E :
146 	s8addq	mask,$31,mask		# E :
147 	bne	compare,1b		# U : U L U L
148 #else
	/*
	 * 64-bit build: shift one bit at a time, and stop shifting as soon
	 * as the top bit of divisor is set (blt) so that it cannot be
	 * shifted out.
	 */
149 1:	cmpult	divisor,modulus,compare	# E :
150 	nop				# E :
151 	nop				# E :
152 	blt     divisor, 2f		# U : U L U L
153 
154 	addq	divisor,divisor,divisor	# E :
155 	addq	mask,mask,mask		# E :
156 	unop				# E :
157 	bne	compare,1b		# U : U L U L
158 #endif
159 
160 	/* ok, start to go right again.. */
	/*
	 * Loop 2: one iteration per bit position, walking mask and divisor
	 * back down.  Each pass computes the candidates
	 *   tmp2 = quotient + mask   (DIV build only, using the old mask)
	 *   tmp1 = modulus - divisor
	 * and commits them with cmovne only when divisor <= modulus
	 * (unsigned).  The loop ends once mask has been shifted down to 0.
	 */
161 2:
162 	/*
163 	 * Keep things nicely bundled... use a nop instead of not
164 	 * having an instruction for DIV_ONLY
165 	 */
166 #ifdef DIV
167 	DIV_ONLY(addq quotient,mask,tmp2) # E :
168 #else
169 	nop				# E :
170 #endif
171 	srl	mask,1,mask		# U :
172 	cmpule	divisor,modulus,compare	# E :
173 	subq	modulus,divisor,tmp1	# E :
174 
175 #ifdef DIV
176 	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
177 	nop				# E : as part of the cmovne
178 	srl	divisor,1,divisor	# U :
179 	nop				# E : L U L U
180 
181 	nop				# E :
182 	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
183 	nop				# E : as part of the cmovne
184 	bne	mask,2b			# U : U L U L
185 #else
186 	srl	divisor,1,divisor	# U :
187 	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
188 	nop				# E : as part of the cmovne
189 	bne	mask,2b			# U : U L L U
190 #endif
191 
	/*
	 * Common exit (also the divide-by-zero target): restore the saved
	 * registers, pop the frame and return through $23.  The result is
	 * already in $27.
	 */
192 9:	ldq	$1, 0($30)		# L :
193 	ldq	$2, 8($30)		# L :
194 	nop				# E :
195 	nop				# E : U U L L
196 
197 	ldq	$0,16($30)		# L :
198 	ldq	tmp1,24($30)		# L :
199 	nop				# E :
200 	nop				# E :
201 
202 #ifdef DIV
203 	DIV_ONLY(ldq tmp2,32($30))	# L :
204 #else
205 	nop				# E :
206 #endif
207 	addq	$30,STACK,$30		# E :
208 	ret	$31,($23),1		# L0 : L U U L
209 	.end	ufunction
210 EXPORT_SYMBOL(ufunction)
211 
212 /*
213  * Uhh.. Ugly signed division. I'd rather not have it at all, but
214  * it's needed in some circumstances. There are different ways to
215  * handle this, really. This does:
216  * 	-a / b = a / -b = -(a / b)
217  *	-a % b = -(a % b)
218  *	a % -b = a % b
219  * which is probably not the best solution, but at least should
220  * have the property that (x/y)*y + (x%y) = x.
221  */
/*
 * sfunction - signed divide (DIV build) or remainder (otherwise).
 *
 * In:   $24 = dividend, $25 = divisor, $23 = return address.
 * Out:  $27 = result; $24, $25, $23 and $3 hold their entry values
 *       again on return, $28 is clobbered.
 *
 * When neither operand is negative this simply continues in ufunction
 * at label 7.  Otherwise it calls ufunction on the absolute values and
 * negates the result when GETSIGN() yields a negative value.
 */
222 .align 4
223 .globl	sfunction
224 .ent	sfunction
225 sfunction:
226 	subq	$30,STACK,$30		# E :
227 	.frame	$30,STACK,$23
228 	.prologue 0
	/*
	 * $28 = $24 | $25 has its sign bit set iff either operand is
	 * negative (after SLONGIFY, bit 31 decides in the 32-bit build).
	 * If it is clear, take the unsigned path: label 7 in ufunction
	 * reuses the frame allocated above and returns straight to our
	 * caller.
	 */
229 	bis	$24,$25,$28		# E :
230 	SLONGIFY($28)			# E :
231 	bge	$28,7b			# U :
232 
	/*
	 * At least one operand is negative.  Save the original operands
	 * (needed for the sign afterwards), the return address (bsr below
	 * overwrites $23) and tmp1, and replace $24/$25 by their absolute
	 * values: $28 = 0 - x, moved in only when x < 0.
	 */
233 	stq	$24,0($30)		# L :
234 	subq	$31,$24,$28		# E :
235 	stq	$25,8($30)		# L :
236 	nop				# E : U L U L
237 
238 	cmovlt	$24,$28,$24	/* abs($24) */ # E : Latency 2, extra map slot
239 	nop				# E : as part of the cmov
240 	stq	$23,16($30)		# L :
241 	subq	$31,$25,$28		# E : U L U L
242 
243 	stq	tmp1,24($30)		# L :
244 	cmovlt	$25,$28,$25	/* abs($25) */ # E : Latency 2, extra map slot
245 	nop				# E :
	/* Unsigned operation on the magnitudes; result comes back in $27. */
246 	bsr	$23,ufunction		# L0: L U L U
247 
	/*
	 * Reload the original operands and derive the result sign in $28:
	 * GETSIGN is $24 ^ $25 in the DIV build and a copy of $24 in the
	 * remainder build.  tmp1 = -$27 replaces $27 when $28 is negative.
	 */
248 	ldq	$24,0($30)		# L :
249 	ldq	$25,8($30)		# L :
250 	GETSIGN($28)			# E :
251 	subq	$31,$27,tmp1		# E : U U L L
252 
253 	SLONGIFY($28)			# E :
254 	ldq	$23,16($30)		# L :
255 	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
256 	nop				# E : U L L U : as part of the cmov
257 
	/* Restore tmp1, pop the frame and return to the original caller. */
258 	ldq	tmp1,24($30)		# L :
259 	nop				# E : as part of the cmov
260 	addq	$30,STACK,$30		# E :
261 	ret	$31,($23),1		# L0 : L U U L
262 	.end	sfunction
263 EXPORT_SYMBOL(sfunction)
264