#ifndef	__KERNEL__
# include "arm_arch.h"

.private_extern	_OPENSSL_armv8_rsa_neonized
#endif
.text

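// bn_mul_mont computes rp[] = ap[]*bp[]*2^(-64*num) mod np[] and
// returns 1. A sketch of the C prototype this entry point serves,
// matching the register usage below (the BN_ULONG name is an
// assumption borrowed from OpenSSL's bn headers; 64-bit words here):
//
//	int bn_mul_mont(BN_ULONG *rp,		// x0: result
//			const BN_ULONG *ap,	// x1: multiplicand
//			const BN_ULONG *bp,	// x2: multiplier
//			const BN_ULONG *np,	// x3: modulus
//			const BN_ULONG *n0,	// x4: -np[0]^-1 mod 2^64
//			int num);		// x5: length in 64-bit words
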
.globl	_bn_mul_mont

.align	5
_bn_mul_mont:
Lbn_mul_mont:
	tst	x5,#3
	b.ne	Lmul_mont
	cmp	x5,#32
	b.le	Lscalar_impl
#ifndef	__KERNEL__
	adrp	x17,_OPENSSL_armv8_rsa_neonized@PAGE
	ldr	w17,[x17,_OPENSSL_armv8_rsa_neonized@PAGEOFF]
	cbnz	w17,bn_mul8x_mont_neon
#endif

Lscalar_impl:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont

Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	About the removal of the first multiplication and
	//	addition: the outcome of the first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or it doesn't.
	//	So when does it carry? If you follow the operations, you
	//	can observe that the condition for carry is quite
	//	simple: x6 being non-zero. The carry can therefore be
	//	calculated by adding -1 to x6, which is what the next
	//	instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
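	// In C terms, the (*) trick amounts to the following
	// (illustrative only; t0 is the value in x6, n0 = -np[0]^-1
	// mod 2^64, and m1 = t0*n0 mod 2^64 as computed above):
	//
	//	lo(np[0]*m1) == (0 - t0) mod 2^64	// by choice of n0
	//	carry(lo(np[0]*m1) + t0) == (t0 != 0)	// what subs xzr,x6,#1 sets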
	cbz	x21,L1st_skip

L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,L1st

L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// topmost overflow bit
	stp	x12,x13,[x22]

Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,Linner_skip

Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	stur	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,Linner

Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// topmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,Louter

	// Final step. We check whether the result is larger than the
	// modulus, and if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus, see if it
	// borrowed, and conditionally copy the original value.
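	// A C sketch of this constant-time tail (illustrative only;
	// sbb() is a hypothetical subtract-with-borrow helper standing
	// in for the sbcs chain, `top` for the overflow bit in x19):
	//
	//	borrow = 0;
	//	for (j = 0; j < num; j++)
	//		tmp[j] = sbb(tp[j], np[j], &borrow);	// tp - np
	//	sbb(top, 0, &borrow);				// fold in top bit
	//	for (j = 0; j < num; j++)			// csel below
	//		rp[j] = borrow ? tp[j] : tmp[j], tp[j] = 0;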
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	stur	xzr,[x22,#-16]		// wipe tp
	stur	x14,[x0,#-16]
	cbnz	x5,Lcond_copy

	csel	x14,x23,x8,lo
	stur	xzr,[x22,#-8]		// wipe tp
	stur	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret


.align	5
bn_mul8x_mont_neon:
	stp	x29,x30,[sp,#-80]!
	mov	x16,sp
	stp	d8,d9,[sp,#16]
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	lsl	x5,x5,#1
	eor	v14.16b,v14.16b,v14.16b
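	// The NEON path is inherited from the ARMv4 module and views
	// the inputs as vectors of 32-bit words (hence lsl x5,x5,#1
	// above, doubling the word count). A sketch of the
	// representation, as far as the code suggests: b[] words and
	// the per-word Montgomery multipliers are "smashed" into
	// 16-bit digits held in 32-bit lanes (the uxtl v28.4s,v28.4h
	// steps), partial products are accumulated into 64-bit lanes
	// with umlal, and carries are propagated 16 bits at a time
	// (the shl/ushr #16 sequences).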

.align	4
LNEON_8n:
	eor	v6.16b,v6.16b,v6.16b
	sub	x7,sp,#128
	eor	v7.16b,v7.16b,v7.16b
	sub	x7,x7,x5,lsl#4
	eor	v8.16b,v8.16b,v8.16b
	and	x7,x7,#-64
	eor	v9.16b,v9.16b,v9.16b
	mov	sp,x7		// alloca
	eor	v10.16b,v10.16b,v10.16b
	add	x7,x7,#256
	eor	v11.16b,v11.16b,v11.16b
	sub	x8,x5,#8
	eor	v12.16b,v12.16b,v12.16b
	eor	v13.16b,v13.16b,v13.16b

LNEON_8n_init:
	st1	{v6.2d,v7.2d},[x7],#32
	subs	x8,x8,#8
	st1	{v8.2d,v9.2d},[x7],#32
	st1	{v10.2d,v11.2d},[x7],#32
	st1	{v12.2d,v13.2d},[x7],#32
	bne	LNEON_8n_init

	add	x6,sp,#256
	ld1	{v0.4s,v1.4s},[x1],#32
	add	x10,sp,#8
	ldr	s30,[x4],#4
	mov	x9,x5
	b	LNEON_8n_outer

.align	4
LNEON_8n_outer:
	ldr	s28,[x2],#4   // *b++
	uxtl	v28.4s,v28.4h
	add	x7,sp,#128
	ld1	{v2.4s,v3.4s},[x3],#32

	umlal	v6.2d,v28.2s,v0.s[0]
	umlal	v7.2d,v28.2s,v0.s[1]
	umlal	v8.2d,v28.2s,v0.s[2]
	shl	v29.2d,v6.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v9.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v6.2d
	umlal	v10.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v11.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[sp]		// put aside smashed b[8*i+0]
	umlal	v12.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v13.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v6.2d,v29.2s,v2.s[0]
	umlal	v7.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v8.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v6.2d,#16
	umlal	v9.2d,v29.2s,v2.s[3]
	umlal	v10.2d,v29.2s,v3.s[0]
	ext	v6.16b,v6.16b,v6.16b,#8
	add	v6.2d,v6.2d,v15.2d
	umlal	v11.2d,v29.2s,v3.s[1]
	ushr	v6.2d,v6.2d,#16
	umlal	v12.2d,v29.2s,v3.s[2]
	umlal	v13.2d,v29.2s,v3.s[3]
	add	v16.2d,v7.2d,v6.2d
	ins	v7.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+0]
	umlal	v7.2d,v28.2s,v0.s[0]
	ld1	{v6.2d},[x6],#16
	umlal	v8.2d,v28.2s,v0.s[1]
	umlal	v9.2d,v28.2s,v0.s[2]
	shl	v29.2d,v7.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v10.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v7.2d
	umlal	v11.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v12.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+1]
	umlal	v13.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v6.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v7.2d,v29.2s,v2.s[0]
	umlal	v8.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v9.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v7.2d,#16
	umlal	v10.2d,v29.2s,v2.s[3]
	umlal	v11.2d,v29.2s,v3.s[0]
	ext	v7.16b,v7.16b,v7.16b,#8
	add	v7.2d,v7.2d,v15.2d
	umlal	v12.2d,v29.2s,v3.s[1]
	ushr	v7.2d,v7.2d,#16
	umlal	v13.2d,v29.2s,v3.s[2]
	umlal	v6.2d,v29.2s,v3.s[3]
	add	v16.2d,v8.2d,v7.2d
	ins	v8.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+1]
	umlal	v8.2d,v28.2s,v0.s[0]
	ld1	{v7.2d},[x6],#16
	umlal	v9.2d,v28.2s,v0.s[1]
	umlal	v10.2d,v28.2s,v0.s[2]
	shl	v29.2d,v8.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v11.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v8.2d
	umlal	v12.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v13.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+2]
	umlal	v6.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v7.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v8.2d,v29.2s,v2.s[0]
	umlal	v9.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v10.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v8.2d,#16
	umlal	v11.2d,v29.2s,v2.s[3]
	umlal	v12.2d,v29.2s,v3.s[0]
	ext	v8.16b,v8.16b,v8.16b,#8
	add	v8.2d,v8.2d,v15.2d
	umlal	v13.2d,v29.2s,v3.s[1]
	ushr	v8.2d,v8.2d,#16
	umlal	v6.2d,v29.2s,v3.s[2]
	umlal	v7.2d,v29.2s,v3.s[3]
	add	v16.2d,v9.2d,v8.2d
	ins	v9.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+2]
	umlal	v9.2d,v28.2s,v0.s[0]
	ld1	{v8.2d},[x6],#16
	umlal	v10.2d,v28.2s,v0.s[1]
	umlal	v11.2d,v28.2s,v0.s[2]
	shl	v29.2d,v9.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v12.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v9.2d
	umlal	v13.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v6.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+3]
	umlal	v7.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v8.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v9.2d,v29.2s,v2.s[0]
	umlal	v10.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v11.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v9.2d,#16
	umlal	v12.2d,v29.2s,v2.s[3]
	umlal	v13.2d,v29.2s,v3.s[0]
	ext	v9.16b,v9.16b,v9.16b,#8
	add	v9.2d,v9.2d,v15.2d
	umlal	v6.2d,v29.2s,v3.s[1]
	ushr	v9.2d,v9.2d,#16
	umlal	v7.2d,v29.2s,v3.s[2]
	umlal	v8.2d,v29.2s,v3.s[3]
	add	v16.2d,v10.2d,v9.2d
	ins	v10.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+3]
	umlal	v10.2d,v28.2s,v0.s[0]
	ld1	{v9.2d},[x6],#16
	umlal	v11.2d,v28.2s,v0.s[1]
	umlal	v12.2d,v28.2s,v0.s[2]
	shl	v29.2d,v10.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v13.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v10.2d
	umlal	v6.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v7.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+4]
	umlal	v8.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v9.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v10.2d,v29.2s,v2.s[0]
	umlal	v11.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v12.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v10.2d,#16
	umlal	v13.2d,v29.2s,v2.s[3]
	umlal	v6.2d,v29.2s,v3.s[0]
	ext	v10.16b,v10.16b,v10.16b,#8
	add	v10.2d,v10.2d,v15.2d
	umlal	v7.2d,v29.2s,v3.s[1]
	ushr	v10.2d,v10.2d,#16
	umlal	v8.2d,v29.2s,v3.s[2]
	umlal	v9.2d,v29.2s,v3.s[3]
	add	v16.2d,v11.2d,v10.2d
	ins	v11.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+4]
	umlal	v11.2d,v28.2s,v0.s[0]
	ld1	{v10.2d},[x6],#16
	umlal	v12.2d,v28.2s,v0.s[1]
	umlal	v13.2d,v28.2s,v0.s[2]
	shl	v29.2d,v11.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v6.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v11.2d
	umlal	v7.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v8.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+5]
	umlal	v9.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v10.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v11.2d,v29.2s,v2.s[0]
	umlal	v12.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v13.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v11.2d,#16
	umlal	v6.2d,v29.2s,v2.s[3]
	umlal	v7.2d,v29.2s,v3.s[0]
	ext	v11.16b,v11.16b,v11.16b,#8
	add	v11.2d,v11.2d,v15.2d
	umlal	v8.2d,v29.2s,v3.s[1]
	ushr	v11.2d,v11.2d,#16
	umlal	v9.2d,v29.2s,v3.s[2]
	umlal	v10.2d,v29.2s,v3.s[3]
	add	v16.2d,v12.2d,v11.2d
	ins	v12.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+5]
	umlal	v12.2d,v28.2s,v0.s[0]
	ld1	{v11.2d},[x6],#16
	umlal	v13.2d,v28.2s,v0.s[1]
	umlal	v6.2d,v28.2s,v0.s[2]
	shl	v29.2d,v12.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v7.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v12.2d
	umlal	v8.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v9.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+6]
	umlal	v10.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v11.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v12.2d,v29.2s,v2.s[0]
	umlal	v13.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v6.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v12.2d,#16
	umlal	v7.2d,v29.2s,v2.s[3]
	umlal	v8.2d,v29.2s,v3.s[0]
	ext	v12.16b,v12.16b,v12.16b,#8
	add	v12.2d,v12.2d,v15.2d
	umlal	v9.2d,v29.2s,v3.s[1]
	ushr	v12.2d,v12.2d,#16
	umlal	v10.2d,v29.2s,v3.s[2]
	umlal	v11.2d,v29.2s,v3.s[3]
	add	v16.2d,v13.2d,v12.2d
	ins	v13.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+6]
	umlal	v13.2d,v28.2s,v0.s[0]
	ld1	{v12.2d},[x6],#16
	umlal	v6.2d,v28.2s,v0.s[1]
	umlal	v7.2d,v28.2s,v0.s[2]
	shl	v29.2d,v13.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v8.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v13.2d
	umlal	v9.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v10.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+7]
	umlal	v11.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v12.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
	umlal	v13.2d,v29.2s,v2.s[0]
	ld1	{v0.4s,v1.4s},[x1],#32
	umlal	v6.2d,v29.2s,v2.s[1]
	umlal	v7.2d,v29.2s,v2.s[2]
	mov	v5.16b,v13.16b
	ushr	v5.2d,v5.2d,#16
	ext	v13.16b,v13.16b,v13.16b,#8
	umlal	v8.2d,v29.2s,v2.s[3]
	umlal	v9.2d,v29.2s,v3.s[0]
	add	v13.2d,v13.2d,v5.2d
	umlal	v10.2d,v29.2s,v3.s[1]
	ushr	v13.2d,v13.2d,#16
	eor	v15.16b,v15.16b,v15.16b
	ins	v13.d[1],v15.d[0]
	umlal	v11.2d,v29.2s,v3.s[2]
	umlal	v12.2d,v29.2s,v3.s[3]
	add	v6.2d,v6.2d,v13.2d
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+7]
	add	x10,sp,#8		// rewind
	sub	x8,x5,#8
	b	LNEON_8n_inner

.align	4
LNEON_8n_inner:
	subs	x8,x8,#8
	umlal	v6.2d,v28.2s,v0.s[0]
	ld1	{v13.2d},[x6]
	umlal	v7.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+0]
	umlal	v8.2d,v28.2s,v0.s[2]
	ld1	{v2.4s,v3.4s},[x3],#32
	umlal	v9.2d,v28.2s,v0.s[3]
	b.eq	LInner_jump
	add	x6,x6,#16	// don't advance in last iteration
LInner_jump:
	umlal	v10.2d,v28.2s,v1.s[0]
	umlal	v11.2d,v28.2s,v1.s[1]
	umlal	v12.2d,v28.2s,v1.s[2]
	umlal	v13.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+1]
	umlal	v6.2d,v29.2s,v2.s[0]
	umlal	v7.2d,v29.2s,v2.s[1]
	umlal	v8.2d,v29.2s,v2.s[2]
	umlal	v9.2d,v29.2s,v2.s[3]
	umlal	v10.2d,v29.2s,v3.s[0]
	umlal	v11.2d,v29.2s,v3.s[1]
	umlal	v12.2d,v29.2s,v3.s[2]
	umlal	v13.2d,v29.2s,v3.s[3]
	st1	{v6.2d},[x7],#16
	umlal	v7.2d,v28.2s,v0.s[0]
	ld1	{v6.2d},[x6]
	umlal	v8.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+1]
	umlal	v9.2d,v28.2s,v0.s[2]
	b.eq	LInner_jump1
	add	x6,x6,#16	// don't advance in last iteration
LInner_jump1:
	umlal	v10.2d,v28.2s,v0.s[3]
	umlal	v11.2d,v28.2s,v1.s[0]
	umlal	v12.2d,v28.2s,v1.s[1]
	umlal	v13.2d,v28.2s,v1.s[2]
	umlal	v6.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+2]
	umlal	v7.2d,v29.2s,v2.s[0]
	umlal	v8.2d,v29.2s,v2.s[1]
	umlal	v9.2d,v29.2s,v2.s[2]
	umlal	v10.2d,v29.2s,v2.s[3]
	umlal	v11.2d,v29.2s,v3.s[0]
	umlal	v12.2d,v29.2s,v3.s[1]
	umlal	v13.2d,v29.2s,v3.s[2]
	umlal	v6.2d,v29.2s,v3.s[3]
	st1	{v7.2d},[x7],#16
	umlal	v8.2d,v28.2s,v0.s[0]
	ld1	{v7.2d},[x6]
	umlal	v9.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+2]
	umlal	v10.2d,v28.2s,v0.s[2]
	b.eq	LInner_jump2
	add	x6,x6,#16	// don't advance in last iteration
LInner_jump2:
	umlal	v11.2d,v28.2s,v0.s[3]
	umlal	v12.2d,v28.2s,v1.s[0]
	umlal	v13.2d,v28.2s,v1.s[1]
	umlal	v6.2d,v28.2s,v1.s[2]
	umlal	v7.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+3]
	umlal	v8.2d,v29.2s,v2.s[0]
	umlal	v9.2d,v29.2s,v2.s[1]
	umlal	v10.2d,v29.2s,v2.s[2]
	umlal	v11.2d,v29.2s,v2.s[3]
	umlal	v12.2d,v29.2s,v3.s[0]
	umlal	v13.2d,v29.2s,v3.s[1]
	umlal	v6.2d,v29.2s,v3.s[2]
	umlal	v7.2d,v29.2s,v3.s[3]
	st1	{v8.2d},[x7],#16
	umlal	v9.2d,v28.2s,v0.s[0]
	ld1	{v8.2d},[x6]
	umlal	v10.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+3]
	umlal	v11.2d,v28.2s,v0.s[2]
	b.eq	LInner_jump3
	add	x6,x6,#16	// don't advance in last iteration
LInner_jump3:
	umlal	v12.2d,v28.2s,v0.s[3]
	umlal	v13.2d,v28.2s,v1.s[0]
	umlal	v6.2d,v28.2s,v1.s[1]
	umlal	v7.2d,v28.2s,v1.s[2]
	umlal	v8.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+4]
	umlal	v9.2d,v29.2s,v2.s[0]
	umlal	v10.2d,v29.2s,v2.s[1]
	umlal	v11.2d,v29.2s,v2.s[2]
	umlal	v12.2d,v29.2s,v2.s[3]
	umlal	v13.2d,v29.2s,v3.s[0]
	umlal	v6.2d,v29.2s,v3.s[1]
	umlal	v7.2d,v29.2s,v3.s[2]
	umlal	v8.2d,v29.2s,v3.s[3]
	st1	{v9.2d},[x7],#16
	umlal	v10.2d,v28.2s,v0.s[0]
	ld1	{v9.2d},[x6]
	umlal	v11.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+4]
	umlal	v12.2d,v28.2s,v0.s[2]
	b.eq	LInner_jump4
	add	x6,x6,#16	// don't advance in last iteration
LInner_jump4:
	umlal	v13.2d,v28.2s,v0.s[3]
	umlal	v6.2d,v28.2s,v1.s[0]
	umlal	v7.2d,v28.2s,v1.s[1]
	umlal	v8.2d,v28.2s,v1.s[2]
	umlal	v9.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+5]
	umlal	v10.2d,v29.2s,v2.s[0]
	umlal	v11.2d,v29.2s,v2.s[1]
	umlal	v12.2d,v29.2s,v2.s[2]
	umlal	v13.2d,v29.2s,v2.s[3]
	umlal	v6.2d,v29.2s,v3.s[0]
	umlal	v7.2d,v29.2s,v3.s[1]
	umlal	v8.2d,v29.2s,v3.s[2]
	umlal	v9.2d,v29.2s,v3.s[3]
	st1	{v10.2d},[x7],#16
	umlal	v11.2d,v28.2s,v0.s[0]
	ld1	{v10.2d},[x6]
	umlal	v12.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+5]
	umlal	v13.2d,v28.2s,v0.s[2]
	b.eq	LInner_jump5
	add	x6,x6,#16	// don't advance in last iteration
LInner_jump5:
	umlal	v6.2d,v28.2s,v0.s[3]
	umlal	v7.2d,v28.2s,v1.s[0]
	umlal	v8.2d,v28.2s,v1.s[1]
	umlal	v9.2d,v28.2s,v1.s[2]
	umlal	v10.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+6]
	umlal	v11.2d,v29.2s,v2.s[0]
	umlal	v12.2d,v29.2s,v2.s[1]
	umlal	v13.2d,v29.2s,v2.s[2]
	umlal	v6.2d,v29.2s,v2.s[3]
	umlal	v7.2d,v29.2s,v3.s[0]
	umlal	v8.2d,v29.2s,v3.s[1]
	umlal	v9.2d,v29.2s,v3.s[2]
	umlal	v10.2d,v29.2s,v3.s[3]
	st1	{v11.2d},[x7],#16
	umlal	v12.2d,v28.2s,v0.s[0]
	ld1	{v11.2d},[x6]
	umlal	v13.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+6]
	umlal	v6.2d,v28.2s,v0.s[2]
	b.eq	LInner_jump6
	add	x6,x6,#16	// don't advance in last iteration
LInner_jump6:
	umlal	v7.2d,v28.2s,v0.s[3]
	umlal	v8.2d,v28.2s,v1.s[0]
	umlal	v9.2d,v28.2s,v1.s[1]
	umlal	v10.2d,v28.2s,v1.s[2]
	umlal	v11.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+7]
	umlal	v12.2d,v29.2s,v2.s[0]
	umlal	v13.2d,v29.2s,v2.s[1]
	umlal	v6.2d,v29.2s,v2.s[2]
	umlal	v7.2d,v29.2s,v2.s[3]
	umlal	v8.2d,v29.2s,v3.s[0]
	umlal	v9.2d,v29.2s,v3.s[1]
	umlal	v10.2d,v29.2s,v3.s[2]
	umlal	v11.2d,v29.2s,v3.s[3]
	st1	{v12.2d},[x7],#16
	umlal	v13.2d,v28.2s,v0.s[0]
	ld1	{v12.2d},[x6]
	umlal	v6.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+7]
	umlal	v7.2d,v28.2s,v0.s[2]
	b.eq	LInner_jump7
	add	x6,x6,#16	// don't advance in last iteration
LInner_jump7:
	umlal	v8.2d,v28.2s,v0.s[3]
	umlal	v9.2d,v28.2s,v1.s[0]
	umlal	v10.2d,v28.2s,v1.s[1]
	umlal	v11.2d,v28.2s,v1.s[2]
	umlal	v12.2d,v28.2s,v1.s[3]
	b.ne	LInner_after_rewind8
	sub	x1,x1,x5,lsl#2	// rewind
LInner_after_rewind8:
	umlal	v13.2d,v29.2s,v2.s[0]
	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
	umlal	v6.2d,v29.2s,v2.s[1]
	ld1	{v0.4s,v1.4s},[x1],#32
	umlal	v7.2d,v29.2s,v2.s[2]
	add	x10,sp,#8		// rewind
	umlal	v8.2d,v29.2s,v2.s[3]
	umlal	v9.2d,v29.2s,v3.s[0]
	umlal	v10.2d,v29.2s,v3.s[1]
	umlal	v11.2d,v29.2s,v3.s[2]
	st1	{v13.2d},[x7],#16
	umlal	v12.2d,v29.2s,v3.s[3]

	bne	LNEON_8n_inner
	add	x6,sp,#128
	st1	{v6.2d,v7.2d},[x7],#32
	eor	v2.16b,v2.16b,v2.16b	// v2
	st1	{v8.2d,v9.2d},[x7],#32
	eor	v3.16b,v3.16b,v3.16b	// v3
	st1	{v10.2d,v11.2d},[x7],#32
	st1	{v12.2d},[x7]

	subs	x9,x9,#8
	ld1	{v6.2d,v7.2d},[x6],#32
	ld1	{v8.2d,v9.2d},[x6],#32
	ld1	{v10.2d,v11.2d},[x6],#32
	ld1	{v12.2d,v13.2d},[x6],#32

	b.eq	LInner_8n_jump_2steps
	sub	x3,x3,x5,lsl#2	// rewind
	b	LNEON_8n_outer

LInner_8n_jump_2steps:
	add	x7,sp,#128
	st1	{v2.2d,v3.2d}, [sp],#32	// start wiping stack frame
	mov	v5.16b,v6.16b
	ushr	v15.2d,v6.2d,#16
	ext	v6.16b,v6.16b,v6.16b,#8
	st1	{v2.2d,v3.2d}, [sp],#32
	add	v6.2d,v6.2d,v15.2d
	st1	{v2.2d,v3.2d}, [sp],#32
	ushr	v15.2d,v6.2d,#16
	st1	{v2.2d,v3.2d}, [sp],#32
	zip1	v6.4h,v5.4h,v6.4h
	ins	v15.d[1],v14.d[0]

	mov	x8,x5
	b	LNEON_tail_entry

.align	4
LNEON_tail:
	add	v6.2d,v6.2d,v15.2d
	mov	v5.16b,v6.16b
	ushr	v15.2d,v6.2d,#16
	ext	v6.16b,v6.16b,v6.16b,#8
	ld1	{v8.2d,v9.2d}, [x6],#32
	add	v6.2d,v6.2d,v15.2d
	ld1	{v10.2d,v11.2d}, [x6],#32
	ushr	v15.2d,v6.2d,#16
	ld1	{v12.2d,v13.2d}, [x6],#32
	zip1	v6.4h,v5.4h,v6.4h
	ins	v15.d[1],v14.d[0]

LNEON_tail_entry:
	add	v7.2d,v7.2d,v15.2d
	st1	{v6.s}[0], [x7],#4
	ushr	v15.2d,v7.2d,#16
	mov	v5.16b,v7.16b
	ext	v7.16b,v7.16b,v7.16b,#8
	add	v7.2d,v7.2d,v15.2d
	ushr	v15.2d,v7.2d,#16
	zip1	v7.4h,v5.4h,v7.4h
	ins	v15.d[1],v14.d[0]
	add	v8.2d,v8.2d,v15.2d
	st1	{v7.s}[0], [x7],#4
	ushr	v15.2d,v8.2d,#16
	mov	v5.16b,v8.16b
	ext	v8.16b,v8.16b,v8.16b,#8
	add	v8.2d,v8.2d,v15.2d
	ushr	v15.2d,v8.2d,#16
	zip1	v8.4h,v5.4h,v8.4h
	ins	v15.d[1],v14.d[0]
	add	v9.2d,v9.2d,v15.2d
	st1	{v8.s}[0], [x7],#4
	ushr	v15.2d,v9.2d,#16
	mov	v5.16b,v9.16b
	ext	v9.16b,v9.16b,v9.16b,#8
	add	v9.2d,v9.2d,v15.2d
	ushr	v15.2d,v9.2d,#16
	zip1	v9.4h,v5.4h,v9.4h
	ins	v15.d[1],v14.d[0]
	add	v10.2d,v10.2d,v15.2d
	st1	{v9.s}[0], [x7],#4
	ushr	v15.2d,v10.2d,#16
	mov	v5.16b,v10.16b
	ext	v10.16b,v10.16b,v10.16b,#8
	add	v10.2d,v10.2d,v15.2d
	ushr	v15.2d,v10.2d,#16
	zip1	v10.4h,v5.4h,v10.4h
	ins	v15.d[1],v14.d[0]
	add	v11.2d,v11.2d,v15.2d
	st1	{v10.s}[0], [x7],#4
	ushr	v15.2d,v11.2d,#16
	mov	v5.16b,v11.16b
	ext	v11.16b,v11.16b,v11.16b,#8
	add	v11.2d,v11.2d,v15.2d
	ushr	v15.2d,v11.2d,#16
	zip1	v11.4h,v5.4h,v11.4h
	ins	v15.d[1],v14.d[0]
	add	v12.2d,v12.2d,v15.2d
	st1	{v11.s}[0], [x7],#4
	ushr	v15.2d,v12.2d,#16
	mov	v5.16b,v12.16b
	ext	v12.16b,v12.16b,v12.16b,#8
	add	v12.2d,v12.2d,v15.2d
	ushr	v15.2d,v12.2d,#16
	zip1	v12.4h,v5.4h,v12.4h
	ins	v15.d[1],v14.d[0]
	add	v13.2d,v13.2d,v15.2d
	st1	{v12.s}[0], [x7],#4
	ushr	v15.2d,v13.2d,#16
	mov	v5.16b,v13.16b
	ext	v13.16b,v13.16b,v13.16b,#8
	add	v13.2d,v13.2d,v15.2d
	ushr	v15.2d,v13.2d,#16
	zip1	v13.4h,v5.4h,v13.4h
	ins	v15.d[1],v14.d[0]
	ld1	{v6.2d,v7.2d}, [x6],#32
	subs	x8,x8,#8
	st1	{v13.s}[0], [x7],#4
	bne	LNEON_tail

	st1	{v15.s}[0], [x7],#4	// top-most bit
	sub	x3,x3,x5,lsl#2		// rewind x3
	subs	x1,sp,#0			// clear carry flag
	add	x2,sp,x5,lsl#2

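	// Borrow-propagating subtraction tp-np over 32-bit words; the
	// carry flag survives across iterations because sub (unlike
	// subs) and the loads/stores leave the flags alone, and x17
	// measures the remaining distance to &tp[num].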
LNEON_sub:
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	ldp	w8,w9,[x3],#8
	ldp	w10,w11,[x3],#8
	sbcs	w8,w4,w8
	sbcs	w9,w5,w9
	sbcs	w10,w6,w10
	sbcs	w11,w7,w11
	sub	x17,x2,x1
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	cbnz	x17,LNEON_sub

	ldr	w10, [x1]		// load top-most bit
	mov	x11,sp
	eor	v0.16b,v0.16b,v0.16b
	sub	x11,x2,x11		// this is num*4
	eor	v1.16b,v1.16b,v1.16b
	mov	x1,sp
	sub	x0,x0,x11		// rewind x0
	mov	x3,x2		// second 3/4th of frame
	sbcs	w10,w10,wzr		// result is carry flag

LNEON_copy_n_zap:
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	ldp	w8,w9,[x0],#8
	ldp	w10,w11,[x0]
	sub	x0,x0,#8
	b.cs	LCopy_1
	mov	w8,w4
	mov	w9,w5
	mov	w10,w6
	mov	w11,w7
LCopy_1:
	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	sub	x1,x1,#32
	ldp	w8,w9,[x0],#8
	ldp	w10,w11,[x0]
	sub	x0,x0,#8
	b.cs	LCopy_2
	mov	w8, w4
	mov	w9, w5
	mov	w10, w6
	mov	w11, w7
LCopy_2:
	st1	{v0.2d,v1.2d}, [x1],#32		// wipe
	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
	sub	x17,x2,x1		// preserves carry
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	cbnz	x17,LNEON_copy_n_zap

	mov	sp,x16
	ldp	d14,d15,[sp,#64]
	ldp	d12,d13,[sp,#48]
	ldp	d10,d11,[sp,#32]
	ldp	d8,d9,[sp,#16]
	ldr	x29,[sp],#80
	ret	// bx lr


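// __bn_sqr8x_mont is reached when num is a multiple of 8; if the two
// multiplicands are distinct buffers it falls through to
// __bn_mul4x_mont. The squaring itself is done in two passes: first
// the off-diagonal products a[i]*a[j], i<j (see the schedule below),
// then the result is doubled, the diagonal a[i]*a[i] terms are added,
// and the whole thing is Montgomery-reduced 512 bits at a time.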
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
Lsqr8x_mont:
.long	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	Lsqr8x_zero_start

Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)
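	// In other words, the off-diagonal half of the square; an
	// illustrative C equivalent (not the exact schedule; t128
	// stands for an unsigned 128-bit type):
	//
	//	for (i = 0; i < num; i++)
	//		for (j = i+1; j < num; j++)
	//			t[i+j] += (t128)a[j] * a[i];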

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
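	// Each pass of Lsqr8x_mul below multiplies the current 8-word
	// window of a[] (x6-x13) by one preceding word a[i], fetched
	// via ldr x4,[x0,x27], and accumulates into the t[] window.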
Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldur	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_mul

.align	4
Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	Lsqr8x_outer_loop

.align	4
Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
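	// The doubling is folded into the pass below: for consecutive
	// limbs, extr xlo,xhi,xlo,#63 computes (xhi<<1)|(xlo>>63), the
	// next limb shifted left by one with the top bit of the
	// previous limb carried in; in C, t2[k] = (t[k]<<1)|(t[k-1]>>63).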
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
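	// One pass of the loop below folds one limb; eight passes
	// consume the whole 8-limb window. A sketch in C (illustrative
	// only; t is the running window, n0 = -n[0]^-1 mod 2^64):
	//
	//	for (j = 0; j < 8; j++) {
	//		m = t[0] * n0;		// mod 2^64
	//		t += m * n;		// low limb of t becomes zero
	//		t >>= 64;		// slide the window one limb
	//	}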
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,Lsqr8x8_post_condition

	ldur	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,Lsqr8x_tail_break

	ldur	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_tail

.align	4
Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	Lsqr8x_reduction

	// Final step. We check whether the result is larger than the
	// modulus, and if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus, see if it
	// borrowed, and conditionally copy the original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	Lsqr8x_done

.align	4
Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold result, x6-x13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.long	0xd50323bf		// autiasp
	ret


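// __bn_mul4x_mont serves num that is a multiple of 4, either directly
// or by fall-through from __bn_sqr8x_mont when ap!=bp. It interleaves
// multiplication by b[i] with Montgomery reduction, walking a[] and
// n[] four words at a time and stashing the per-iteration t[0]*n0
// multipliers on the stack for the tail passes.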
.align	5
__bn_mul4x_mont:
.long	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_reduction

	cbz	x10,Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewound x1
	cbz	x10,Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_1st_tail

.align	5
Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next t[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_tail

.align	4
Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	Loop_mul4x_reduction

.align	4
Lmul4x_post:
	// Final step. We check whether the result is larger than the
	// modulus, and if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus, see if it
	// borrowed, and conditionally copy the original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	Lmul4x_done

.align	4
Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.long	0xd50323bf		// autiasp
	ret

.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4