#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
# than twice (>2x) as fast. The most common rsa1024 sign is improved by
# a respectable 50%. It remains to be seen if loop unrolling and a
# dedicated squaring routine can provide further improvement...

# July 2011.
#
# Add dedicated squaring procedure. Performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually the same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

# Command-line handling and output plumbing.
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 output is selected by a nasm/masm/mingw64 flavour or by an .asm
# output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of this script's own path (including the trailing
# separator).  Only trust $1 when the match actually succeeded: after a
# failed match $1 still refers to the last successful one.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then in the shared
# perlasm directory.
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator,
# which is run with the same perl interpreter ($^X) and handed the flavour
# and output file name.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Decide whether the MULX/ADCX/ADOX code path may be emitted.  $addx stays
# false unless one of the probes below recognises an assembler/compiler
# that is new enough.  Each later probe runs only if the earlier ones did
# not already set $addx.
#
# Versions are compared as (major, minor) integer pairs rather than as
# decimal fractions, so that e.g. "2.9" is not treated as newer than "2.23"
# and "2.100" is not treated as older.

# GNU assembler, queried through the C compiler driver: needs >= 2.23.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9])\.([0-9]+)/) {
	$addx = ($1>2 || $2>=23);
}

# NASM (Win64 only): needs >= 2.10.
if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9])\.([0-9]+)/) {
	$addx = ($1>2 || $2>=10);
}

# Microsoft ml64 (Win64 only): needs major version >= 12.
if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

# clang / LLVM-based compiler: needs >= 3.3.  $1 is the matched banner
# text, $2 and $3 are the major and minor version numbers.
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	$addx = ($2>3 || ($2==3 && $3>=3));
}

# Register names interpolated into the assembly templates below.
# The first six carry the arguments of bn_mul_mont, as annotated; the
# remaining ones name the scratch registers used by the multiplication
# loops (partial-product words, loop counters and the two multipliers).
#
# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";	# low word of the running ap[j]*bp[i] term
$hi0="%r11";	# high word of the running ap[j]*bp[i] term
$hi1="%r13";	# carry word of the running np[j]*m1 term
$i="%r14";	# outer loop counter
$j="%r15";	# inner loop counter
$m0="%rbx";	# current multiplier word bp[i]
$m1="%rbp";	# reduction multiplier tp[0]*n0

# bn_mul_mont: public entry point plus the generic word-by-word code path.
#
# Dispatch at entry (num is first zero-extended from its low 32 bits and
# the caller's %rsp is kept in %rax):
#   - num not a multiple of 4, or num < 8     -> generic path (.Lmul_enter)
#   - otherwise, ap != bp                     -> .Lmul4x_enter
#   - otherwise, ap == bp and num % 8 == 0    -> .Lsqr8x_enter
#   - otherwise                               -> .Lmul4x_enter
# When $addx is set, OPENSSL_ia32cap_P+8 is loaded into %r11d before the
# jump; the code after .Lmul4x_enter tests it to pick the MULX path.
#
# Generic path:
#   - saves %rbx, %rbp, %r12-%r15, then carves out 8*(num+2) bytes below
#     them for the temporary vector tp[], rounding the address down to a
#     1024-byte boundary and touching the stack one 4096-byte page at a
#     time on the way down (see the __chkstk note in the template);
#   - tp[num] holds the top overflow bit, tp[num+1] the caller's %rsp;
#   - bp is moved to %r12 because mulq clobbers %rdx; the Perl variable
#     $bp is re-pointed accordingly between the heredocs, which also
#     affects every template that follows in this file;
#   - .L1st/.L1st_enter: first pass, tp = (ap*bp[0] + np*m1) / 2^64;
#   - .Louter/.Linner:   remaining passes, accumulating into tp;
#   - .Lsub:  rp = tp - np with the borrow propagated through sbb;
#   - .Lcopy: masked select between rp[i] and tp[i] driven by the final
#             borrow, overwriting tp[i] with num as it goes;
#   - epilogue restores the saved registers through %rsi and returns 1.
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		# From here on the templates address bp through %r12.
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	mov	$num,$j			# j=num

.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8(%rsp,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	mov	\$-1,%rbx
	xor	%rax,%rbx		# not %rax
	xor	$i,$i
	mov	$num,$j			# j=num

.Lcopy:					# conditional copy
	mov	($rp,$i,8),%rcx
	mov	(%rsp,$i,8),%rdx
	and	%rbx,%rcx
	and	%rax,%rdx
	mov	$num,(%rsp,$i,8)	# zap temporary vector
	or	%rcx,%rdx
	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
___
366e1051a39Sopenharmony_ci{{{
367e1051a39Sopenharmony_cimy @A=("%r10","%r11");
368e1051a39Sopenharmony_cimy @N=("%r13","%rdi");
369e1051a39Sopenharmony_ci$code.=<<___;
370e1051a39Sopenharmony_ci.type	bn_mul4x_mont,\@function,6
371e1051a39Sopenharmony_ci.align	16
372e1051a39Sopenharmony_cibn_mul4x_mont:
373e1051a39Sopenharmony_ci.cfi_startproc
374e1051a39Sopenharmony_ci	mov	${num}d,${num}d
375e1051a39Sopenharmony_ci	mov	%rsp,%rax
376e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rax
377e1051a39Sopenharmony_ci.Lmul4x_enter:
378e1051a39Sopenharmony_ci___
379e1051a39Sopenharmony_ci$code.=<<___ if ($addx);
380e1051a39Sopenharmony_ci	and	\$0x80100,%r11d
381e1051a39Sopenharmony_ci	cmp	\$0x80100,%r11d
382e1051a39Sopenharmony_ci	je	.Lmulx4x_enter
383e1051a39Sopenharmony_ci___
384e1051a39Sopenharmony_ci$code.=<<___;
385e1051a39Sopenharmony_ci	push	%rbx
386e1051a39Sopenharmony_ci.cfi_push	%rbx
387e1051a39Sopenharmony_ci	push	%rbp
388e1051a39Sopenharmony_ci.cfi_push	%rbp
389e1051a39Sopenharmony_ci	push	%r12
390e1051a39Sopenharmony_ci.cfi_push	%r12
391e1051a39Sopenharmony_ci	push	%r13
392e1051a39Sopenharmony_ci.cfi_push	%r13
393e1051a39Sopenharmony_ci	push	%r14
394e1051a39Sopenharmony_ci.cfi_push	%r14
395e1051a39Sopenharmony_ci	push	%r15
396e1051a39Sopenharmony_ci.cfi_push	%r15
397e1051a39Sopenharmony_ci
398e1051a39Sopenharmony_ci	neg	$num
399e1051a39Sopenharmony_ci	mov	%rsp,%r11
400e1051a39Sopenharmony_ci	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
401e1051a39Sopenharmony_ci	neg	$num			# restore
402e1051a39Sopenharmony_ci	and	\$-1024,%r10		# minimize TLB usage
403e1051a39Sopenharmony_ci
404e1051a39Sopenharmony_ci	sub	%r10,%r11
405e1051a39Sopenharmony_ci	and	\$-4096,%r11
406e1051a39Sopenharmony_ci	lea	(%r10,%r11),%rsp
407e1051a39Sopenharmony_ci	mov	(%rsp),%r11
408e1051a39Sopenharmony_ci	cmp	%r10,%rsp
409e1051a39Sopenharmony_ci	ja	.Lmul4x_page_walk
410e1051a39Sopenharmony_ci	jmp	.Lmul4x_page_walk_done
411e1051a39Sopenharmony_ci
412e1051a39Sopenharmony_ci.Lmul4x_page_walk:
413e1051a39Sopenharmony_ci	lea	-4096(%rsp),%rsp
414e1051a39Sopenharmony_ci	mov	(%rsp),%r11
415e1051a39Sopenharmony_ci	cmp	%r10,%rsp
416e1051a39Sopenharmony_ci	ja	.Lmul4x_page_walk
417e1051a39Sopenharmony_ci.Lmul4x_page_walk_done:
418e1051a39Sopenharmony_ci
419e1051a39Sopenharmony_ci	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
420e1051a39Sopenharmony_ci.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
421e1051a39Sopenharmony_ci.Lmul4x_body:
422e1051a39Sopenharmony_ci	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
423e1051a39Sopenharmony_ci	mov	%rdx,%r12		# reassign $bp
424e1051a39Sopenharmony_ci___
425e1051a39Sopenharmony_ci		$bp="%r12";
426e1051a39Sopenharmony_ci$code.=<<___;
427e1051a39Sopenharmony_ci	mov	($n0),$n0		# pull n0[0] value
428e1051a39Sopenharmony_ci	mov	($bp),$m0		# m0=bp[0]
429e1051a39Sopenharmony_ci	mov	($ap),%rax
430e1051a39Sopenharmony_ci
431e1051a39Sopenharmony_ci	xor	$i,$i			# i=0
432e1051a39Sopenharmony_ci	xor	$j,$j			# j=0
433e1051a39Sopenharmony_ci
434e1051a39Sopenharmony_ci	mov	$n0,$m1
435e1051a39Sopenharmony_ci	mulq	$m0			# ap[0]*bp[0]
436e1051a39Sopenharmony_ci	mov	%rax,$A[0]
437e1051a39Sopenharmony_ci	mov	($np),%rax
438e1051a39Sopenharmony_ci
439e1051a39Sopenharmony_ci	imulq	$A[0],$m1		# "tp[0]"*n0
440e1051a39Sopenharmony_ci	mov	%rdx,$A[1]
441e1051a39Sopenharmony_ci
442e1051a39Sopenharmony_ci	mulq	$m1			# np[0]*m1
443e1051a39Sopenharmony_ci	add	%rax,$A[0]		# discarded
444e1051a39Sopenharmony_ci	mov	8($ap),%rax
445e1051a39Sopenharmony_ci	adc	\$0,%rdx
446e1051a39Sopenharmony_ci	mov	%rdx,$N[1]
447e1051a39Sopenharmony_ci
448e1051a39Sopenharmony_ci	mulq	$m0
449e1051a39Sopenharmony_ci	add	%rax,$A[1]
450e1051a39Sopenharmony_ci	mov	8($np),%rax
451e1051a39Sopenharmony_ci	adc	\$0,%rdx
452e1051a39Sopenharmony_ci	mov	%rdx,$A[0]
453e1051a39Sopenharmony_ci
454e1051a39Sopenharmony_ci	mulq	$m1
455e1051a39Sopenharmony_ci	add	%rax,$N[1]
456e1051a39Sopenharmony_ci	mov	16($ap),%rax
457e1051a39Sopenharmony_ci	adc	\$0,%rdx
458e1051a39Sopenharmony_ci	add	$A[1],$N[1]
459e1051a39Sopenharmony_ci	lea	4($j),$j		# j++
460e1051a39Sopenharmony_ci	adc	\$0,%rdx
461e1051a39Sopenharmony_ci	mov	$N[1],(%rsp)
462e1051a39Sopenharmony_ci	mov	%rdx,$N[0]
463e1051a39Sopenharmony_ci	jmp	.L1st4x
464e1051a39Sopenharmony_ci.align	16
465e1051a39Sopenharmony_ci.L1st4x:
466e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[0]
467e1051a39Sopenharmony_ci	add	%rax,$A[0]
468e1051a39Sopenharmony_ci	mov	-16($np,$j,8),%rax
469e1051a39Sopenharmony_ci	adc	\$0,%rdx
470e1051a39Sopenharmony_ci	mov	%rdx,$A[1]
471e1051a39Sopenharmony_ci
472e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
473e1051a39Sopenharmony_ci	add	%rax,$N[0]
474e1051a39Sopenharmony_ci	mov	-8($ap,$j,8),%rax
475e1051a39Sopenharmony_ci	adc	\$0,%rdx
476e1051a39Sopenharmony_ci	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
477e1051a39Sopenharmony_ci	adc	\$0,%rdx
478e1051a39Sopenharmony_ci	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
479e1051a39Sopenharmony_ci	mov	%rdx,$N[1]
480e1051a39Sopenharmony_ci
481e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[0]
482e1051a39Sopenharmony_ci	add	%rax,$A[1]
483e1051a39Sopenharmony_ci	mov	-8($np,$j,8),%rax
484e1051a39Sopenharmony_ci	adc	\$0,%rdx
485e1051a39Sopenharmony_ci	mov	%rdx,$A[0]
486e1051a39Sopenharmony_ci
487e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
488e1051a39Sopenharmony_ci	add	%rax,$N[1]
489e1051a39Sopenharmony_ci	mov	($ap,$j,8),%rax
490e1051a39Sopenharmony_ci	adc	\$0,%rdx
491e1051a39Sopenharmony_ci	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
492e1051a39Sopenharmony_ci	adc	\$0,%rdx
493e1051a39Sopenharmony_ci	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
494e1051a39Sopenharmony_ci	mov	%rdx,$N[0]
495e1051a39Sopenharmony_ci
496e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[0]
497e1051a39Sopenharmony_ci	add	%rax,$A[0]
498e1051a39Sopenharmony_ci	mov	($np,$j,8),%rax
499e1051a39Sopenharmony_ci	adc	\$0,%rdx
500e1051a39Sopenharmony_ci	mov	%rdx,$A[1]
501e1051a39Sopenharmony_ci
502e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
503e1051a39Sopenharmony_ci	add	%rax,$N[0]
504e1051a39Sopenharmony_ci	mov	8($ap,$j,8),%rax
505e1051a39Sopenharmony_ci	adc	\$0,%rdx
506e1051a39Sopenharmony_ci	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
507e1051a39Sopenharmony_ci	adc	\$0,%rdx
508e1051a39Sopenharmony_ci	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
509e1051a39Sopenharmony_ci	mov	%rdx,$N[1]
510e1051a39Sopenharmony_ci
511e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[0]
512e1051a39Sopenharmony_ci	add	%rax,$A[1]
513e1051a39Sopenharmony_ci	mov	8($np,$j,8),%rax
514e1051a39Sopenharmony_ci	adc	\$0,%rdx
515e1051a39Sopenharmony_ci	lea	4($j),$j		# j++
516e1051a39Sopenharmony_ci	mov	%rdx,$A[0]
517e1051a39Sopenharmony_ci
518e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
519e1051a39Sopenharmony_ci	add	%rax,$N[1]
520e1051a39Sopenharmony_ci	mov	-16($ap,$j,8),%rax
521e1051a39Sopenharmony_ci	adc	\$0,%rdx
522e1051a39Sopenharmony_ci	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
523e1051a39Sopenharmony_ci	adc	\$0,%rdx
524e1051a39Sopenharmony_ci	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
525e1051a39Sopenharmony_ci	mov	%rdx,$N[0]
526e1051a39Sopenharmony_ci	cmp	$num,$j
527e1051a39Sopenharmony_ci	jb	.L1st4x
528e1051a39Sopenharmony_ci
529e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[0]
530e1051a39Sopenharmony_ci	add	%rax,$A[0]
531e1051a39Sopenharmony_ci	mov	-16($np,$j,8),%rax
532e1051a39Sopenharmony_ci	adc	\$0,%rdx
533e1051a39Sopenharmony_ci	mov	%rdx,$A[1]
534e1051a39Sopenharmony_ci
535e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
536e1051a39Sopenharmony_ci	add	%rax,$N[0]
537e1051a39Sopenharmony_ci	mov	-8($ap,$j,8),%rax
538e1051a39Sopenharmony_ci	adc	\$0,%rdx
539e1051a39Sopenharmony_ci	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
540e1051a39Sopenharmony_ci	adc	\$0,%rdx
541e1051a39Sopenharmony_ci	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
542e1051a39Sopenharmony_ci	mov	%rdx,$N[1]
543e1051a39Sopenharmony_ci
544e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[0]
545e1051a39Sopenharmony_ci	add	%rax,$A[1]
546e1051a39Sopenharmony_ci	mov	-8($np,$j,8),%rax
547e1051a39Sopenharmony_ci	adc	\$0,%rdx
548e1051a39Sopenharmony_ci	mov	%rdx,$A[0]
549e1051a39Sopenharmony_ci
550e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
551e1051a39Sopenharmony_ci	add	%rax,$N[1]
552e1051a39Sopenharmony_ci	mov	($ap),%rax		# ap[0]
553e1051a39Sopenharmony_ci	adc	\$0,%rdx
554e1051a39Sopenharmony_ci	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
555e1051a39Sopenharmony_ci	adc	\$0,%rdx
556e1051a39Sopenharmony_ci	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
557e1051a39Sopenharmony_ci	mov	%rdx,$N[0]
558e1051a39Sopenharmony_ci
559e1051a39Sopenharmony_ci	xor	$N[1],$N[1]
560e1051a39Sopenharmony_ci	add	$A[0],$N[0]
561e1051a39Sopenharmony_ci	adc	\$0,$N[1]
562e1051a39Sopenharmony_ci	mov	$N[0],-8(%rsp,$j,8)
563e1051a39Sopenharmony_ci	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
564e1051a39Sopenharmony_ci
565e1051a39Sopenharmony_ci	lea	1($i),$i		# i++
566e1051a39Sopenharmony_ci.align	4
567e1051a39Sopenharmony_ci.Louter4x:
568e1051a39Sopenharmony_ci	mov	($bp,$i,8),$m0		# m0=bp[i]
569e1051a39Sopenharmony_ci	xor	$j,$j			# j=0
570e1051a39Sopenharmony_ci	mov	(%rsp),$A[0]
571e1051a39Sopenharmony_ci	mov	$n0,$m1
572e1051a39Sopenharmony_ci	mulq	$m0			# ap[0]*bp[i]
573e1051a39Sopenharmony_ci	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
574e1051a39Sopenharmony_ci	mov	($np),%rax
575e1051a39Sopenharmony_ci	adc	\$0,%rdx
576e1051a39Sopenharmony_ci
577e1051a39Sopenharmony_ci	imulq	$A[0],$m1		# tp[0]*n0
578e1051a39Sopenharmony_ci	mov	%rdx,$A[1]
579e1051a39Sopenharmony_ci
580e1051a39Sopenharmony_ci	mulq	$m1			# np[0]*m1
581e1051a39Sopenharmony_ci	add	%rax,$A[0]		# "$N[0]", discarded
582e1051a39Sopenharmony_ci	mov	8($ap),%rax
583e1051a39Sopenharmony_ci	adc	\$0,%rdx
584e1051a39Sopenharmony_ci	mov	%rdx,$N[1]
585e1051a39Sopenharmony_ci
586e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[i]
587e1051a39Sopenharmony_ci	add	%rax,$A[1]
588e1051a39Sopenharmony_ci	mov	8($np),%rax
589e1051a39Sopenharmony_ci	adc	\$0,%rdx
590e1051a39Sopenharmony_ci	add	8(%rsp),$A[1]		# +tp[1]
591e1051a39Sopenharmony_ci	adc	\$0,%rdx
592e1051a39Sopenharmony_ci	mov	%rdx,$A[0]
593e1051a39Sopenharmony_ci
594e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
595e1051a39Sopenharmony_ci	add	%rax,$N[1]
596e1051a39Sopenharmony_ci	mov	16($ap),%rax
597e1051a39Sopenharmony_ci	adc	\$0,%rdx
598e1051a39Sopenharmony_ci	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
599e1051a39Sopenharmony_ci	lea	4($j),$j		# j+=2
600e1051a39Sopenharmony_ci	adc	\$0,%rdx
601e1051a39Sopenharmony_ci	mov	$N[1],(%rsp)		# tp[j-1]
602e1051a39Sopenharmony_ci	mov	%rdx,$N[0]
603e1051a39Sopenharmony_ci	jmp	.Linner4x
604e1051a39Sopenharmony_ci.align	16
605e1051a39Sopenharmony_ci.Linner4x:
606e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[i]
607e1051a39Sopenharmony_ci	add	%rax,$A[0]
608e1051a39Sopenharmony_ci	mov	-16($np,$j,8),%rax
609e1051a39Sopenharmony_ci	adc	\$0,%rdx
610e1051a39Sopenharmony_ci	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
611e1051a39Sopenharmony_ci	adc	\$0,%rdx
612e1051a39Sopenharmony_ci	mov	%rdx,$A[1]
613e1051a39Sopenharmony_ci
614e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
615e1051a39Sopenharmony_ci	add	%rax,$N[0]
616e1051a39Sopenharmony_ci	mov	-8($ap,$j,8),%rax
617e1051a39Sopenharmony_ci	adc	\$0,%rdx
618e1051a39Sopenharmony_ci	add	$A[0],$N[0]
619e1051a39Sopenharmony_ci	adc	\$0,%rdx
620e1051a39Sopenharmony_ci	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
621e1051a39Sopenharmony_ci	mov	%rdx,$N[1]
622e1051a39Sopenharmony_ci
623e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[i]
624e1051a39Sopenharmony_ci	add	%rax,$A[1]
625e1051a39Sopenharmony_ci	mov	-8($np,$j,8),%rax
626e1051a39Sopenharmony_ci	adc	\$0,%rdx
627e1051a39Sopenharmony_ci	add	-8(%rsp,$j,8),$A[1]
628e1051a39Sopenharmony_ci	adc	\$0,%rdx
629e1051a39Sopenharmony_ci	mov	%rdx,$A[0]
630e1051a39Sopenharmony_ci
631e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
632e1051a39Sopenharmony_ci	add	%rax,$N[1]
633e1051a39Sopenharmony_ci	mov	($ap,$j,8),%rax
634e1051a39Sopenharmony_ci	adc	\$0,%rdx
635e1051a39Sopenharmony_ci	add	$A[1],$N[1]
636e1051a39Sopenharmony_ci	adc	\$0,%rdx
637e1051a39Sopenharmony_ci	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
638e1051a39Sopenharmony_ci	mov	%rdx,$N[0]
639e1051a39Sopenharmony_ci
640e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[i]
641e1051a39Sopenharmony_ci	add	%rax,$A[0]
642e1051a39Sopenharmony_ci	mov	($np,$j,8),%rax
643e1051a39Sopenharmony_ci	adc	\$0,%rdx
644e1051a39Sopenharmony_ci	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
645e1051a39Sopenharmony_ci	adc	\$0,%rdx
646e1051a39Sopenharmony_ci	mov	%rdx,$A[1]
647e1051a39Sopenharmony_ci
648e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
649e1051a39Sopenharmony_ci	add	%rax,$N[0]
650e1051a39Sopenharmony_ci	mov	8($ap,$j,8),%rax
651e1051a39Sopenharmony_ci	adc	\$0,%rdx
652e1051a39Sopenharmony_ci	add	$A[0],$N[0]
653e1051a39Sopenharmony_ci	adc	\$0,%rdx
654e1051a39Sopenharmony_ci	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
655e1051a39Sopenharmony_ci	mov	%rdx,$N[1]
656e1051a39Sopenharmony_ci
657e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[i]
658e1051a39Sopenharmony_ci	add	%rax,$A[1]
659e1051a39Sopenharmony_ci	mov	8($np,$j,8),%rax
660e1051a39Sopenharmony_ci	adc	\$0,%rdx
661e1051a39Sopenharmony_ci	add	8(%rsp,$j,8),$A[1]
662e1051a39Sopenharmony_ci	adc	\$0,%rdx
663e1051a39Sopenharmony_ci	lea	4($j),$j		# j++
664e1051a39Sopenharmony_ci	mov	%rdx,$A[0]
665e1051a39Sopenharmony_ci
666e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
667e1051a39Sopenharmony_ci	add	%rax,$N[1]
668e1051a39Sopenharmony_ci	mov	-16($ap,$j,8),%rax
669e1051a39Sopenharmony_ci	adc	\$0,%rdx
670e1051a39Sopenharmony_ci	add	$A[1],$N[1]
671e1051a39Sopenharmony_ci	adc	\$0,%rdx
672e1051a39Sopenharmony_ci	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
673e1051a39Sopenharmony_ci	mov	%rdx,$N[0]
674e1051a39Sopenharmony_ci	cmp	$num,$j
675e1051a39Sopenharmony_ci	jb	.Linner4x
676e1051a39Sopenharmony_ci
677e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[i]
678e1051a39Sopenharmony_ci	add	%rax,$A[0]
679e1051a39Sopenharmony_ci	mov	-16($np,$j,8),%rax
680e1051a39Sopenharmony_ci	adc	\$0,%rdx
681e1051a39Sopenharmony_ci	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
682e1051a39Sopenharmony_ci	adc	\$0,%rdx
683e1051a39Sopenharmony_ci	mov	%rdx,$A[1]
684e1051a39Sopenharmony_ci
685e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
686e1051a39Sopenharmony_ci	add	%rax,$N[0]
687e1051a39Sopenharmony_ci	mov	-8($ap,$j,8),%rax
688e1051a39Sopenharmony_ci	adc	\$0,%rdx
689e1051a39Sopenharmony_ci	add	$A[0],$N[0]
690e1051a39Sopenharmony_ci	adc	\$0,%rdx
691e1051a39Sopenharmony_ci	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
692e1051a39Sopenharmony_ci	mov	%rdx,$N[1]
693e1051a39Sopenharmony_ci
694e1051a39Sopenharmony_ci	mulq	$m0			# ap[j]*bp[i]
695e1051a39Sopenharmony_ci	add	%rax,$A[1]
696e1051a39Sopenharmony_ci	mov	-8($np,$j,8),%rax
697e1051a39Sopenharmony_ci	adc	\$0,%rdx
698e1051a39Sopenharmony_ci	add	-8(%rsp,$j,8),$A[1]
699e1051a39Sopenharmony_ci	adc	\$0,%rdx
700e1051a39Sopenharmony_ci	lea	1($i),$i		# i++
701e1051a39Sopenharmony_ci	mov	%rdx,$A[0]
702e1051a39Sopenharmony_ci
703e1051a39Sopenharmony_ci	mulq	$m1			# np[j]*m1
704e1051a39Sopenharmony_ci	add	%rax,$N[1]
705e1051a39Sopenharmony_ci	mov	($ap),%rax		# ap[0]
706e1051a39Sopenharmony_ci	adc	\$0,%rdx
707e1051a39Sopenharmony_ci	add	$A[1],$N[1]
708e1051a39Sopenharmony_ci	adc	\$0,%rdx
709e1051a39Sopenharmony_ci	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
710e1051a39Sopenharmony_ci	mov	%rdx,$N[0]
711e1051a39Sopenharmony_ci
712e1051a39Sopenharmony_ci	xor	$N[1],$N[1]
713e1051a39Sopenharmony_ci	add	$A[0],$N[0]
714e1051a39Sopenharmony_ci	adc	\$0,$N[1]
715e1051a39Sopenharmony_ci	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
716e1051a39Sopenharmony_ci	adc	\$0,$N[1]
717e1051a39Sopenharmony_ci	mov	$N[0],-8(%rsp,$j,8)
718e1051a39Sopenharmony_ci	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
719e1051a39Sopenharmony_ci
720e1051a39Sopenharmony_ci	cmp	$num,$i
721e1051a39Sopenharmony_ci	jb	.Louter4x
722e1051a39Sopenharmony_ci___
723e1051a39Sopenharmony_ci{
724e1051a39Sopenharmony_cimy @ri=("%rax","%rdx",$m0,$m1);
725e1051a39Sopenharmony_ci$code.=<<___;
726e1051a39Sopenharmony_ci	mov	16(%rsp,$num,8),$rp	# restore $rp
727e1051a39Sopenharmony_ci	lea	-4($num),$j
728e1051a39Sopenharmony_ci	mov	0(%rsp),@ri[0]		# tp[0]
729e1051a39Sopenharmony_ci	mov	8(%rsp),@ri[1]		# tp[1]
730e1051a39Sopenharmony_ci	shr	\$2,$j			# j=num/4-1
731e1051a39Sopenharmony_ci	lea	(%rsp),$ap		# borrow ap for tp
732e1051a39Sopenharmony_ci	xor	$i,$i			# i=0 and clear CF!
733e1051a39Sopenharmony_ci
734e1051a39Sopenharmony_ci	sub	0($np),@ri[0]
735e1051a39Sopenharmony_ci	mov	16($ap),@ri[2]		# tp[2]
736e1051a39Sopenharmony_ci	mov	24($ap),@ri[3]		# tp[3]
737e1051a39Sopenharmony_ci	sbb	8($np),@ri[1]
738e1051a39Sopenharmony_ci
739e1051a39Sopenharmony_ci.Lsub4x:
740e1051a39Sopenharmony_ci	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
741e1051a39Sopenharmony_ci	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
742e1051a39Sopenharmony_ci	sbb	16($np,$i,8),@ri[2]
743e1051a39Sopenharmony_ci	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
744e1051a39Sopenharmony_ci	mov	40($ap,$i,8),@ri[1]
745e1051a39Sopenharmony_ci	sbb	24($np,$i,8),@ri[3]
746e1051a39Sopenharmony_ci	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
747e1051a39Sopenharmony_ci	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
748e1051a39Sopenharmony_ci	sbb	32($np,$i,8),@ri[0]
749e1051a39Sopenharmony_ci	mov	48($ap,$i,8),@ri[2]
750e1051a39Sopenharmony_ci	mov	56($ap,$i,8),@ri[3]
751e1051a39Sopenharmony_ci	sbb	40($np,$i,8),@ri[1]
752e1051a39Sopenharmony_ci	lea	4($i),$i		# i++
753e1051a39Sopenharmony_ci	dec	$j			# doesn't affect CF!
754e1051a39Sopenharmony_ci	jnz	.Lsub4x
755e1051a39Sopenharmony_ci
756e1051a39Sopenharmony_ci	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
757e1051a39Sopenharmony_ci	mov	32($ap,$i,8),@ri[0]	# load overflow bit
758e1051a39Sopenharmony_ci	sbb	16($np,$i,8),@ri[2]
759e1051a39Sopenharmony_ci	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
760e1051a39Sopenharmony_ci	sbb	24($np,$i,8),@ri[3]
761e1051a39Sopenharmony_ci	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
762e1051a39Sopenharmony_ci
763e1051a39Sopenharmony_ci	sbb	\$0,@ri[0]		# handle upmost overflow bit
764e1051a39Sopenharmony_ci	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
765e1051a39Sopenharmony_ci	pxor	%xmm0,%xmm0
766e1051a39Sopenharmony_ci	movq	@ri[0],%xmm4
767e1051a39Sopenharmony_ci	pcmpeqd	%xmm5,%xmm5
768e1051a39Sopenharmony_ci	pshufd	\$0,%xmm4,%xmm4
769e1051a39Sopenharmony_ci	mov	$num,$j
770e1051a39Sopenharmony_ci	pxor	%xmm4,%xmm5
771e1051a39Sopenharmony_ci	shr	\$2,$j			# j=num/4
772e1051a39Sopenharmony_ci	xor	%eax,%eax		# i=0
773e1051a39Sopenharmony_ci
774e1051a39Sopenharmony_ci	jmp	.Lcopy4x
775e1051a39Sopenharmony_ci.align	16
776e1051a39Sopenharmony_ci.Lcopy4x:				# conditional copy
777e1051a39Sopenharmony_ci	movdqa	(%rsp,%rax),%xmm1
778e1051a39Sopenharmony_ci	movdqu	($rp,%rax),%xmm2
779e1051a39Sopenharmony_ci	pand	%xmm4,%xmm1
780e1051a39Sopenharmony_ci	pand	%xmm5,%xmm2
781e1051a39Sopenharmony_ci	movdqa	16(%rsp,%rax),%xmm3
782e1051a39Sopenharmony_ci	movdqa	%xmm0,(%rsp,%rax)
783e1051a39Sopenharmony_ci	por	%xmm2,%xmm1
784e1051a39Sopenharmony_ci	movdqu	16($rp,%rax),%xmm2
785e1051a39Sopenharmony_ci	movdqu	%xmm1,($rp,%rax)
786e1051a39Sopenharmony_ci	pand	%xmm4,%xmm3
787e1051a39Sopenharmony_ci	pand	%xmm5,%xmm2
788e1051a39Sopenharmony_ci	movdqa	%xmm0,16(%rsp,%rax)
789e1051a39Sopenharmony_ci	por	%xmm2,%xmm3
790e1051a39Sopenharmony_ci	movdqu	%xmm3,16($rp,%rax)
791e1051a39Sopenharmony_ci	lea	32(%rax),%rax
792e1051a39Sopenharmony_ci	dec	$j
793e1051a39Sopenharmony_ci	jnz	.Lcopy4x
794e1051a39Sopenharmony_ci___
795e1051a39Sopenharmony_ci}
796e1051a39Sopenharmony_ci$code.=<<___;
797e1051a39Sopenharmony_ci	mov	8(%rsp,$num,8),%rsi	# restore %rsp
798e1051a39Sopenharmony_ci.cfi_def_cfa	%rsi, 8
799e1051a39Sopenharmony_ci	mov	\$1,%rax
800e1051a39Sopenharmony_ci	mov	-48(%rsi),%r15
801e1051a39Sopenharmony_ci.cfi_restore	%r15
802e1051a39Sopenharmony_ci	mov	-40(%rsi),%r14
803e1051a39Sopenharmony_ci.cfi_restore	%r14
804e1051a39Sopenharmony_ci	mov	-32(%rsi),%r13
805e1051a39Sopenharmony_ci.cfi_restore	%r13
806e1051a39Sopenharmony_ci	mov	-24(%rsi),%r12
807e1051a39Sopenharmony_ci.cfi_restore	%r12
808e1051a39Sopenharmony_ci	mov	-16(%rsi),%rbp
809e1051a39Sopenharmony_ci.cfi_restore	%rbp
810e1051a39Sopenharmony_ci	mov	-8(%rsi),%rbx
811e1051a39Sopenharmony_ci.cfi_restore	%rbx
812e1051a39Sopenharmony_ci	lea	(%rsi),%rsp
813e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
814e1051a39Sopenharmony_ci.Lmul4x_epilogue:
815e1051a39Sopenharmony_ci	ret
816e1051a39Sopenharmony_ci.cfi_endproc
817e1051a39Sopenharmony_ci.size	bn_mul4x_mont,.-bn_mul4x_mont
818e1051a39Sopenharmony_ci___
819e1051a39Sopenharmony_ci}}}
820e1051a39Sopenharmony_ci{{{
821e1051a39Sopenharmony_ci######################################################################
822e1051a39Sopenharmony_ci# void bn_sqr8x_mont(
823e1051a39Sopenharmony_cimy $rptr="%rdi";	# const BN_ULONG *rptr,
824e1051a39Sopenharmony_cimy $aptr="%rsi";	# const BN_ULONG *aptr,
825e1051a39Sopenharmony_cimy $bptr="%rdx";	# not used
826e1051a39Sopenharmony_cimy $nptr="%rcx";	# const BN_ULONG *nptr,
827e1051a39Sopenharmony_cimy $n0  ="%r8";		# const BN_ULONG *n0);
828e1051a39Sopenharmony_cimy $num ="%r9";		# int num, has to be divisible by 8
829e1051a39Sopenharmony_ci
830e1051a39Sopenharmony_cimy ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
831e1051a39Sopenharmony_cimy @A0=("%r10","%r11");
832e1051a39Sopenharmony_cimy @A1=("%r12","%r13");
833e1051a39Sopenharmony_cimy ($a0,$a1,$ai)=("%r14","%r15","%rbx");
834e1051a39Sopenharmony_ci
835e1051a39Sopenharmony_ci$code.=<<___	if ($addx);
836e1051a39Sopenharmony_ci.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
837e1051a39Sopenharmony_ci___
838e1051a39Sopenharmony_ci$code.=<<___;
839e1051a39Sopenharmony_ci.extern	bn_sqr8x_internal		# see x86_64-mont5 module
840e1051a39Sopenharmony_ci
841e1051a39Sopenharmony_ci.type	bn_sqr8x_mont,\@function,6
842e1051a39Sopenharmony_ci.align	32
843e1051a39Sopenharmony_cibn_sqr8x_mont:
844e1051a39Sopenharmony_ci.cfi_startproc
845e1051a39Sopenharmony_ci	mov	%rsp,%rax
846e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rax
847e1051a39Sopenharmony_ci.Lsqr8x_enter:
848e1051a39Sopenharmony_ci	push	%rbx
849e1051a39Sopenharmony_ci.cfi_push	%rbx
850e1051a39Sopenharmony_ci	push	%rbp
851e1051a39Sopenharmony_ci.cfi_push	%rbp
852e1051a39Sopenharmony_ci	push	%r12
853e1051a39Sopenharmony_ci.cfi_push	%r12
854e1051a39Sopenharmony_ci	push	%r13
855e1051a39Sopenharmony_ci.cfi_push	%r13
856e1051a39Sopenharmony_ci	push	%r14
857e1051a39Sopenharmony_ci.cfi_push	%r14
858e1051a39Sopenharmony_ci	push	%r15
859e1051a39Sopenharmony_ci.cfi_push	%r15
860e1051a39Sopenharmony_ci.Lsqr8x_prologue:
861e1051a39Sopenharmony_ci
862e1051a39Sopenharmony_ci	mov	${num}d,%r10d
863e1051a39Sopenharmony_ci	shl	\$3,${num}d		# convert $num to bytes
864e1051a39Sopenharmony_ci	shl	\$3+2,%r10		# 4*$num
865e1051a39Sopenharmony_ci	neg	$num
866e1051a39Sopenharmony_ci
867e1051a39Sopenharmony_ci	##############################################################
868e1051a39Sopenharmony_ci	# ensure that stack frame doesn't alias with $aptr modulo
869e1051a39Sopenharmony_ci	# 4096. this is done to allow memory disambiguation logic
870e1051a39Sopenharmony_ci	# do its job.
871e1051a39Sopenharmony_ci	#
872e1051a39Sopenharmony_ci	lea	-64(%rsp,$num,2),%r11
873e1051a39Sopenharmony_ci	mov	%rsp,%rbp
874e1051a39Sopenharmony_ci	mov	($n0),$n0		# *n0
875e1051a39Sopenharmony_ci	sub	$aptr,%r11
876e1051a39Sopenharmony_ci	and	\$4095,%r11
877e1051a39Sopenharmony_ci	cmp	%r11,%r10
878e1051a39Sopenharmony_ci	jb	.Lsqr8x_sp_alt
879e1051a39Sopenharmony_ci	sub	%r11,%rbp		# align with $aptr
880e1051a39Sopenharmony_ci	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
881e1051a39Sopenharmony_ci	jmp	.Lsqr8x_sp_done
882e1051a39Sopenharmony_ci
883e1051a39Sopenharmony_ci.align	32
884e1051a39Sopenharmony_ci.Lsqr8x_sp_alt:
885e1051a39Sopenharmony_ci	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
886e1051a39Sopenharmony_ci	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
887e1051a39Sopenharmony_ci	sub	%r10,%r11
888e1051a39Sopenharmony_ci	mov	\$0,%r10
889e1051a39Sopenharmony_ci	cmovc	%r10,%r11
890e1051a39Sopenharmony_ci	sub	%r11,%rbp
891e1051a39Sopenharmony_ci.Lsqr8x_sp_done:
892e1051a39Sopenharmony_ci	and	\$-64,%rbp
893e1051a39Sopenharmony_ci	mov	%rsp,%r11
894e1051a39Sopenharmony_ci	sub	%rbp,%r11
895e1051a39Sopenharmony_ci	and	\$-4096,%r11
896e1051a39Sopenharmony_ci	lea	(%rbp,%r11),%rsp
897e1051a39Sopenharmony_ci	mov	(%rsp),%r10
898e1051a39Sopenharmony_ci	cmp	%rbp,%rsp
899e1051a39Sopenharmony_ci	ja	.Lsqr8x_page_walk
900e1051a39Sopenharmony_ci	jmp	.Lsqr8x_page_walk_done
901e1051a39Sopenharmony_ci
902e1051a39Sopenharmony_ci.align	16
903e1051a39Sopenharmony_ci.Lsqr8x_page_walk:
904e1051a39Sopenharmony_ci	lea	-4096(%rsp),%rsp
905e1051a39Sopenharmony_ci	mov	(%rsp),%r10
906e1051a39Sopenharmony_ci	cmp	%rbp,%rsp
907e1051a39Sopenharmony_ci	ja	.Lsqr8x_page_walk
908e1051a39Sopenharmony_ci.Lsqr8x_page_walk_done:
909e1051a39Sopenharmony_ci
910e1051a39Sopenharmony_ci	mov	$num,%r10
911e1051a39Sopenharmony_ci	neg	$num
912e1051a39Sopenharmony_ci
913e1051a39Sopenharmony_ci	mov	$n0,  32(%rsp)
914e1051a39Sopenharmony_ci	mov	%rax, 40(%rsp)		# save original %rsp
915e1051a39Sopenharmony_ci.cfi_cfa_expression	%rsp+40,deref,+8
916e1051a39Sopenharmony_ci.Lsqr8x_body:
917e1051a39Sopenharmony_ci
918e1051a39Sopenharmony_ci	movq	$nptr, %xmm2		# save pointer to modulus
919e1051a39Sopenharmony_ci	pxor	%xmm0,%xmm0
920e1051a39Sopenharmony_ci	movq	$rptr,%xmm1		# save $rptr
921e1051a39Sopenharmony_ci	movq	%r10, %xmm3		# -$num
922e1051a39Sopenharmony_ci___
923e1051a39Sopenharmony_ci$code.=<<___ if ($addx);
924e1051a39Sopenharmony_ci	mov	OPENSSL_ia32cap_P+8(%rip),%eax
925e1051a39Sopenharmony_ci	and	\$0x80100,%eax
926e1051a39Sopenharmony_ci	cmp	\$0x80100,%eax
927e1051a39Sopenharmony_ci	jne	.Lsqr8x_nox
928e1051a39Sopenharmony_ci
929e1051a39Sopenharmony_ci	call	bn_sqrx8x_internal	# see x86_64-mont5 module
930e1051a39Sopenharmony_ci					# %rax	top-most carry
931e1051a39Sopenharmony_ci					# %rbp	nptr
932e1051a39Sopenharmony_ci					# %rcx	-8*num
933e1051a39Sopenharmony_ci					# %r8	end of tp[2*num]
934e1051a39Sopenharmony_ci	lea	(%r8,%rcx),%rbx
935e1051a39Sopenharmony_ci	mov	%rcx,$num
936e1051a39Sopenharmony_ci	mov	%rcx,%rdx
937e1051a39Sopenharmony_ci	movq	%xmm1,$rptr
938e1051a39Sopenharmony_ci	sar	\$3+2,%rcx		# %cf=0
939e1051a39Sopenharmony_ci	jmp	.Lsqr8x_sub
940e1051a39Sopenharmony_ci
941e1051a39Sopenharmony_ci.align	32
942e1051a39Sopenharmony_ci.Lsqr8x_nox:
943e1051a39Sopenharmony_ci___
944e1051a39Sopenharmony_ci$code.=<<___;
945e1051a39Sopenharmony_ci	call	bn_sqr8x_internal	# see x86_64-mont5 module
946e1051a39Sopenharmony_ci					# %rax	top-most carry
947e1051a39Sopenharmony_ci					# %rbp	nptr
948e1051a39Sopenharmony_ci					# %r8	-8*num
949e1051a39Sopenharmony_ci					# %rdi	end of tp[2*num]
950e1051a39Sopenharmony_ci	lea	(%rdi,$num),%rbx
951e1051a39Sopenharmony_ci	mov	$num,%rcx
952e1051a39Sopenharmony_ci	mov	$num,%rdx
953e1051a39Sopenharmony_ci	movq	%xmm1,$rptr
954e1051a39Sopenharmony_ci	sar	\$3+2,%rcx		# %cf=0
955e1051a39Sopenharmony_ci	jmp	.Lsqr8x_sub
956e1051a39Sopenharmony_ci
957e1051a39Sopenharmony_ci.align	32
958e1051a39Sopenharmony_ci.Lsqr8x_sub:
959e1051a39Sopenharmony_ci	mov	8*0(%rbx),%r12
960e1051a39Sopenharmony_ci	mov	8*1(%rbx),%r13
961e1051a39Sopenharmony_ci	mov	8*2(%rbx),%r14
962e1051a39Sopenharmony_ci	mov	8*3(%rbx),%r15
963e1051a39Sopenharmony_ci	lea	8*4(%rbx),%rbx
964e1051a39Sopenharmony_ci	sbb	8*0(%rbp),%r12
965e1051a39Sopenharmony_ci	sbb	8*1(%rbp),%r13
966e1051a39Sopenharmony_ci	sbb	8*2(%rbp),%r14
967e1051a39Sopenharmony_ci	sbb	8*3(%rbp),%r15
968e1051a39Sopenharmony_ci	lea	8*4(%rbp),%rbp
969e1051a39Sopenharmony_ci	mov	%r12,8*0($rptr)
970e1051a39Sopenharmony_ci	mov	%r13,8*1($rptr)
971e1051a39Sopenharmony_ci	mov	%r14,8*2($rptr)
972e1051a39Sopenharmony_ci	mov	%r15,8*3($rptr)
973e1051a39Sopenharmony_ci	lea	8*4($rptr),$rptr
974e1051a39Sopenharmony_ci	inc	%rcx			# preserves %cf
975e1051a39Sopenharmony_ci	jnz	.Lsqr8x_sub
976e1051a39Sopenharmony_ci
977e1051a39Sopenharmony_ci	sbb	\$0,%rax		# top-most carry
978e1051a39Sopenharmony_ci	lea	(%rbx,$num),%rbx	# rewind
979e1051a39Sopenharmony_ci	lea	($rptr,$num),$rptr	# rewind
980e1051a39Sopenharmony_ci
981e1051a39Sopenharmony_ci	movq	%rax,%xmm1
982e1051a39Sopenharmony_ci	pxor	%xmm0,%xmm0
983e1051a39Sopenharmony_ci	pshufd	\$0,%xmm1,%xmm1
984e1051a39Sopenharmony_ci	mov	40(%rsp),%rsi		# restore %rsp
985e1051a39Sopenharmony_ci.cfi_def_cfa	%rsi,8
986e1051a39Sopenharmony_ci	jmp	.Lsqr8x_cond_copy
987e1051a39Sopenharmony_ci
988e1051a39Sopenharmony_ci.align	32
989e1051a39Sopenharmony_ci.Lsqr8x_cond_copy:
990e1051a39Sopenharmony_ci	movdqa	16*0(%rbx),%xmm2
991e1051a39Sopenharmony_ci	movdqa	16*1(%rbx),%xmm3
992e1051a39Sopenharmony_ci	lea	16*2(%rbx),%rbx
993e1051a39Sopenharmony_ci	movdqu	16*0($rptr),%xmm4
994e1051a39Sopenharmony_ci	movdqu	16*1($rptr),%xmm5
995e1051a39Sopenharmony_ci	lea	16*2($rptr),$rptr
996e1051a39Sopenharmony_ci	movdqa	%xmm0,-16*2(%rbx)	# zero tp
997e1051a39Sopenharmony_ci	movdqa	%xmm0,-16*1(%rbx)
998e1051a39Sopenharmony_ci	movdqa	%xmm0,-16*2(%rbx,%rdx)
999e1051a39Sopenharmony_ci	movdqa	%xmm0,-16*1(%rbx,%rdx)
1000e1051a39Sopenharmony_ci	pcmpeqd	%xmm1,%xmm0
1001e1051a39Sopenharmony_ci	pand	%xmm1,%xmm2
1002e1051a39Sopenharmony_ci	pand	%xmm1,%xmm3
1003e1051a39Sopenharmony_ci	pand	%xmm0,%xmm4
1004e1051a39Sopenharmony_ci	pand	%xmm0,%xmm5
1005e1051a39Sopenharmony_ci	pxor	%xmm0,%xmm0
1006e1051a39Sopenharmony_ci	por	%xmm2,%xmm4
1007e1051a39Sopenharmony_ci	por	%xmm3,%xmm5
1008e1051a39Sopenharmony_ci	movdqu	%xmm4,-16*2($rptr)
1009e1051a39Sopenharmony_ci	movdqu	%xmm5,-16*1($rptr)
1010e1051a39Sopenharmony_ci	add	\$32,$num
1011e1051a39Sopenharmony_ci	jnz	.Lsqr8x_cond_copy
1012e1051a39Sopenharmony_ci
1013e1051a39Sopenharmony_ci	mov	\$1,%rax
1014e1051a39Sopenharmony_ci	mov	-48(%rsi),%r15
1015e1051a39Sopenharmony_ci.cfi_restore	%r15
1016e1051a39Sopenharmony_ci	mov	-40(%rsi),%r14
1017e1051a39Sopenharmony_ci.cfi_restore	%r14
1018e1051a39Sopenharmony_ci	mov	-32(%rsi),%r13
1019e1051a39Sopenharmony_ci.cfi_restore	%r13
1020e1051a39Sopenharmony_ci	mov	-24(%rsi),%r12
1021e1051a39Sopenharmony_ci.cfi_restore	%r12
1022e1051a39Sopenharmony_ci	mov	-16(%rsi),%rbp
1023e1051a39Sopenharmony_ci.cfi_restore	%rbp
1024e1051a39Sopenharmony_ci	mov	-8(%rsi),%rbx
1025e1051a39Sopenharmony_ci.cfi_restore	%rbx
1026e1051a39Sopenharmony_ci	lea	(%rsi),%rsp
1027e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
1028e1051a39Sopenharmony_ci.Lsqr8x_epilogue:
1029e1051a39Sopenharmony_ci	ret
1030e1051a39Sopenharmony_ci.cfi_endproc
1031e1051a39Sopenharmony_ci.size	bn_sqr8x_mont,.-bn_sqr8x_mont
1032e1051a39Sopenharmony_ci___
1033e1051a39Sopenharmony_ci}}}
1034e1051a39Sopenharmony_ci
1035e1051a39Sopenharmony_ciif ($addx) {{{
# Generator for bn_mulx4x_mont, a Montgomery multiplication routine built on
# mulx/adcx/adox. The whole routine is emitted only when $addx is true
# ($addx is set outside this view). The emitted code processes four limbs
# per loop iteration and keeps two independent carry chains alive at once:
# adcx propagates through CF and adox through OF, as the "cf=0"/"of=0"
# in-line comments track.
1036e1051a39Sopenharmony_cimy $bp="%rdx";	# original value
1037e1051a39Sopenharmony_ci
# Prologue and frame set-up. $num, $n0 and $rp used in this heredoc are the
# variables visible from the enclosing scope (declared outside this view).
#   - %rax keeps the caller's %rsp; six callee-saved registers are pushed;
#   - $num is converted to bytes and the frame (locals + tmp[num+1]) is
#     carved out below %rsp, rounded down to a 128-byte boundary;
#   - the page-walk loop lowers %rsp in 4096-byte steps, reading one word per
#     step, until it reaches the computed frame address;
#   - the locals listed under "Stack layout" are filled in; the inner-loop
#     counter stored at 48(%rsp) is (byte length >> 5) - 1, i.e. one less
#     than the number of four-limb groups, because the first group of each
#     row is handled before the loop is entered.
1038e1051a39Sopenharmony_ci$code.=<<___;
1039e1051a39Sopenharmony_ci.type	bn_mulx4x_mont,\@function,6
1040e1051a39Sopenharmony_ci.align	32
1041e1051a39Sopenharmony_cibn_mulx4x_mont:
1042e1051a39Sopenharmony_ci.cfi_startproc
1043e1051a39Sopenharmony_ci	mov	%rsp,%rax
1044e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rax
1045e1051a39Sopenharmony_ci.Lmulx4x_enter:
1046e1051a39Sopenharmony_ci	push	%rbx
1047e1051a39Sopenharmony_ci.cfi_push	%rbx
1048e1051a39Sopenharmony_ci	push	%rbp
1049e1051a39Sopenharmony_ci.cfi_push	%rbp
1050e1051a39Sopenharmony_ci	push	%r12
1051e1051a39Sopenharmony_ci.cfi_push	%r12
1052e1051a39Sopenharmony_ci	push	%r13
1053e1051a39Sopenharmony_ci.cfi_push	%r13
1054e1051a39Sopenharmony_ci	push	%r14
1055e1051a39Sopenharmony_ci.cfi_push	%r14
1056e1051a39Sopenharmony_ci	push	%r15
1057e1051a39Sopenharmony_ci.cfi_push	%r15
1058e1051a39Sopenharmony_ci.Lmulx4x_prologue:
1059e1051a39Sopenharmony_ci
1060e1051a39Sopenharmony_ci	shl	\$3,${num}d		# convert $num to bytes
1061e1051a39Sopenharmony_ci	xor	%r10,%r10
1062e1051a39Sopenharmony_ci	sub	$num,%r10		# -$num
1063e1051a39Sopenharmony_ci	mov	($n0),$n0		# *n0
1064e1051a39Sopenharmony_ci	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
1065e1051a39Sopenharmony_ci	and	\$-128,%rbp
1066e1051a39Sopenharmony_ci	mov	%rsp,%r11
1067e1051a39Sopenharmony_ci	sub	%rbp,%r11
1068e1051a39Sopenharmony_ci	and	\$-4096,%r11
1069e1051a39Sopenharmony_ci	lea	(%rbp,%r11),%rsp
1070e1051a39Sopenharmony_ci	mov	(%rsp),%r10
1071e1051a39Sopenharmony_ci	cmp	%rbp,%rsp
1072e1051a39Sopenharmony_ci	ja	.Lmulx4x_page_walk
1073e1051a39Sopenharmony_ci	jmp	.Lmulx4x_page_walk_done
1074e1051a39Sopenharmony_ci
1075e1051a39Sopenharmony_ci.align	16
1076e1051a39Sopenharmony_ci.Lmulx4x_page_walk:
1077e1051a39Sopenharmony_ci	lea	-4096(%rsp),%rsp
1078e1051a39Sopenharmony_ci	mov	(%rsp),%r10
1079e1051a39Sopenharmony_ci	cmp	%rbp,%rsp
1080e1051a39Sopenharmony_ci	ja	.Lmulx4x_page_walk
1081e1051a39Sopenharmony_ci.Lmulx4x_page_walk_done:
1082e1051a39Sopenharmony_ci
1083e1051a39Sopenharmony_ci	lea	($bp,$num),%r10
1084e1051a39Sopenharmony_ci	##############################################################
1085e1051a39Sopenharmony_ci	# Stack layout
1086e1051a39Sopenharmony_ci	# +0	num
1087e1051a39Sopenharmony_ci	# +8	off-loaded &b[i]
1088e1051a39Sopenharmony_ci	# +16	end of b[num]
1089e1051a39Sopenharmony_ci	# +24	saved n0
1090e1051a39Sopenharmony_ci	# +32	saved rp
1091e1051a39Sopenharmony_ci	# +40	saved %rsp
1092e1051a39Sopenharmony_ci	# +48	inner counter
1093e1051a39Sopenharmony_ci	# +56
1094e1051a39Sopenharmony_ci	# +64	tmp[num+1]
1095e1051a39Sopenharmony_ci	#
1096e1051a39Sopenharmony_ci	mov	$num,0(%rsp)		# save $num
1097e1051a39Sopenharmony_ci	shr	\$5,$num
1098e1051a39Sopenharmony_ci	mov	%r10,16(%rsp)		# end of b[num]
1099e1051a39Sopenharmony_ci	sub	\$1,$num
1100e1051a39Sopenharmony_ci	mov	$n0, 24(%rsp)		# save *n0
1101e1051a39Sopenharmony_ci	mov	$rp, 32(%rsp)		# save $rp
1102e1051a39Sopenharmony_ci	mov	%rax,40(%rsp)		# save original %rsp
1103e1051a39Sopenharmony_ci.cfi_cfa_expression	%rsp+40,deref,+8
1104e1051a39Sopenharmony_ci	mov	$num,48(%rsp)		# inner counter
1105e1051a39Sopenharmony_ci	jmp	.Lmulx4x_body
1106e1051a39Sopenharmony_ci
1107e1051a39Sopenharmony_ci.align	32
1108e1051a39Sopenharmony_ci.Lmulx4x_body:
1109e1051a39Sopenharmony_ci___
# Register re-mapping for the body. These lexicals shadow any same-named
# variables from the enclosing scope for the rest of this block; in
# particular $num now names %rax, not the register used in the prologue
# heredoc above. $rptr shares %rdi with $bptr: the b[] pointer is parked at
# 8(%rsp) whenever %rdi is borrowed, and %rdi becomes the result pointer
# only after the last row has been processed.
1110e1051a39Sopenharmony_cimy ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
1111e1051a39Sopenharmony_ci   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
1112e1051a39Sopenharmony_cimy $rptr=$bptr;
# Main body, in emitted order:
#   - first row: tp = a[]*b[0] + m*n[], where m = low word of a[0]*b[0]
#     multiplied by the saved n0 (imulq 24(%rsp)); .Lmulx4x_1st handles the
#     remaining four-limb groups. %rdx alternates between b[i] ($bi) and m
#     ($mi) because mulx takes its implicit multiplicand from %rdx;
#   - .Lmulx4x_outer / .Lmulx4x_inner: same computation for each further
#     b[i], additionally accumulating the previous tp row; the outer loop
#     ends when the b pointer reaches the end-of-b address at 16(%rsp);
#   - after each row the pending carries are combined and the top-most carry
#     is kept in %r15 as 0 or all-ones (sbb %r15,%r15) and stored after the
#     row;
#   - .Lmulx4x_sub: subtract the modulus from tp four limbs per iteration,
#     writing the difference to $rptr; dec keeps the borrow in CF intact;
#   - .Lmulx4x_cond_copy: 32 bytes per iteration, result = (tp AND mask) OR
#     (stored difference AND (mask == 0)) using pand/por, zeroing tp as it
#     is read; the word following the copied area is then cleared as well
#     (%rdx is zero when the loop exits);
#   - epilogue: %rax = 1, callee-saved registers are reloaded relative to the
#     saved original %rsp (held in %rsi), then %rsp is restored.
# The .byte sequence in the first row is a hand-encoded mulx, as its
# trailing comment states.
1113e1051a39Sopenharmony_ci$code.=<<___;
1114e1051a39Sopenharmony_ci	lea	8($bp),$bptr
1115e1051a39Sopenharmony_ci	mov	($bp),%rdx		# b[0], $bp==%rdx actually
1116e1051a39Sopenharmony_ci	lea	64+32(%rsp),$tptr
1117e1051a39Sopenharmony_ci	mov	%rdx,$bi
1118e1051a39Sopenharmony_ci
1119e1051a39Sopenharmony_ci	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
1120e1051a39Sopenharmony_ci	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
1121e1051a39Sopenharmony_ci	add	%rax,%r11
1122e1051a39Sopenharmony_ci	mov	$bptr,8(%rsp)		# off-load &b[i]
1123e1051a39Sopenharmony_ci	mulx	2*8($aptr),%r12,%r13	# ...
1124e1051a39Sopenharmony_ci	adc	%r14,%r12
1125e1051a39Sopenharmony_ci	adc	\$0,%r13
1126e1051a39Sopenharmony_ci
1127e1051a39Sopenharmony_ci	mov	$mi,$bptr		# borrow $bptr
1128e1051a39Sopenharmony_ci	imulq	24(%rsp),$mi		# "t[0]"*n0
1129e1051a39Sopenharmony_ci	xor	$zero,$zero		# cf=0, of=0
1130e1051a39Sopenharmony_ci
1131e1051a39Sopenharmony_ci	mulx	3*8($aptr),%rax,%r14
1132e1051a39Sopenharmony_ci	 mov	$mi,%rdx
1133e1051a39Sopenharmony_ci	lea	4*8($aptr),$aptr
1134e1051a39Sopenharmony_ci	adcx	%rax,%r13
1135e1051a39Sopenharmony_ci	adcx	$zero,%r14		# cf=0
1136e1051a39Sopenharmony_ci
1137e1051a39Sopenharmony_ci	mulx	0*8($nptr),%rax,%r10
1138e1051a39Sopenharmony_ci	adcx	%rax,$bptr		# discarded
1139e1051a39Sopenharmony_ci	adox	%r11,%r10
1140e1051a39Sopenharmony_ci	mulx	1*8($nptr),%rax,%r11
1141e1051a39Sopenharmony_ci	adcx	%rax,%r10
1142e1051a39Sopenharmony_ci	adox	%r12,%r11
1143e1051a39Sopenharmony_ci	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
1144e1051a39Sopenharmony_ci	mov	48(%rsp),$bptr		# counter value
1145e1051a39Sopenharmony_ci	mov	%r10,-4*8($tptr)
1146e1051a39Sopenharmony_ci	adcx	%rax,%r11
1147e1051a39Sopenharmony_ci	adox	%r13,%r12
1148e1051a39Sopenharmony_ci	mulx	3*8($nptr),%rax,%r15
1149e1051a39Sopenharmony_ci	 mov	$bi,%rdx
1150e1051a39Sopenharmony_ci	mov	%r11,-3*8($tptr)
1151e1051a39Sopenharmony_ci	adcx	%rax,%r12
1152e1051a39Sopenharmony_ci	adox	$zero,%r15		# of=0
1153e1051a39Sopenharmony_ci	lea	4*8($nptr),$nptr
1154e1051a39Sopenharmony_ci	mov	%r12,-2*8($tptr)
1155e1051a39Sopenharmony_ci
1156e1051a39Sopenharmony_ci	jmp	.Lmulx4x_1st
1157e1051a39Sopenharmony_ci
1158e1051a39Sopenharmony_ci.align	32
1159e1051a39Sopenharmony_ci.Lmulx4x_1st:
1160e1051a39Sopenharmony_ci	adcx	$zero,%r15		# cf=0, modulo-scheduled
1161e1051a39Sopenharmony_ci	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
1162e1051a39Sopenharmony_ci	adcx	%r14,%r10
1163e1051a39Sopenharmony_ci	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
1164e1051a39Sopenharmony_ci	adcx	%rax,%r11
1165e1051a39Sopenharmony_ci	mulx	2*8($aptr),%r12,%rax	# ...
1166e1051a39Sopenharmony_ci	adcx	%r14,%r12
1167e1051a39Sopenharmony_ci	mulx	3*8($aptr),%r13,%r14
1168e1051a39Sopenharmony_ci	 .byte	0x67,0x67
1169e1051a39Sopenharmony_ci	 mov	$mi,%rdx
1170e1051a39Sopenharmony_ci	adcx	%rax,%r13
1171e1051a39Sopenharmony_ci	adcx	$zero,%r14		# cf=0
1172e1051a39Sopenharmony_ci	lea	4*8($aptr),$aptr
1173e1051a39Sopenharmony_ci	lea	4*8($tptr),$tptr
1174e1051a39Sopenharmony_ci
1175e1051a39Sopenharmony_ci	adox	%r15,%r10
1176e1051a39Sopenharmony_ci	mulx	0*8($nptr),%rax,%r15
1177e1051a39Sopenharmony_ci	adcx	%rax,%r10
1178e1051a39Sopenharmony_ci	adox	%r15,%r11
1179e1051a39Sopenharmony_ci	mulx	1*8($nptr),%rax,%r15
1180e1051a39Sopenharmony_ci	adcx	%rax,%r11
1181e1051a39Sopenharmony_ci	adox	%r15,%r12
1182e1051a39Sopenharmony_ci	mulx	2*8($nptr),%rax,%r15
1183e1051a39Sopenharmony_ci	mov	%r10,-5*8($tptr)
1184e1051a39Sopenharmony_ci	adcx	%rax,%r12
1185e1051a39Sopenharmony_ci	mov	%r11,-4*8($tptr)
1186e1051a39Sopenharmony_ci	adox	%r15,%r13
1187e1051a39Sopenharmony_ci	mulx	3*8($nptr),%rax,%r15
1188e1051a39Sopenharmony_ci	 mov	$bi,%rdx
1189e1051a39Sopenharmony_ci	mov	%r12,-3*8($tptr)
1190e1051a39Sopenharmony_ci	adcx	%rax,%r13
1191e1051a39Sopenharmony_ci	adox	$zero,%r15
1192e1051a39Sopenharmony_ci	lea	4*8($nptr),$nptr
1193e1051a39Sopenharmony_ci	mov	%r13,-2*8($tptr)
1194e1051a39Sopenharmony_ci
1195e1051a39Sopenharmony_ci	dec	$bptr			# of=0, pass cf
1196e1051a39Sopenharmony_ci	jnz	.Lmulx4x_1st
1197e1051a39Sopenharmony_ci
1198e1051a39Sopenharmony_ci	mov	0(%rsp),$num		# load num
1199e1051a39Sopenharmony_ci	mov	8(%rsp),$bptr		# re-load &b[i]
1200e1051a39Sopenharmony_ci	adc	$zero,%r15		# modulo-scheduled
1201e1051a39Sopenharmony_ci	add	%r15,%r14
1202e1051a39Sopenharmony_ci	sbb	%r15,%r15		# top-most carry
1203e1051a39Sopenharmony_ci	mov	%r14,-1*8($tptr)
1204e1051a39Sopenharmony_ci	jmp	.Lmulx4x_outer
1205e1051a39Sopenharmony_ci
1206e1051a39Sopenharmony_ci.align	32
1207e1051a39Sopenharmony_ci.Lmulx4x_outer:
1208e1051a39Sopenharmony_ci	mov	($bptr),%rdx		# b[i]
1209e1051a39Sopenharmony_ci	lea	8($bptr),$bptr		# b++
1210e1051a39Sopenharmony_ci	sub	$num,$aptr		# rewind $aptr
1211e1051a39Sopenharmony_ci	mov	%r15,($tptr)		# save top-most carry
1212e1051a39Sopenharmony_ci	lea	64+4*8(%rsp),$tptr
1213e1051a39Sopenharmony_ci	sub	$num,$nptr		# rewind $nptr
1214e1051a39Sopenharmony_ci
1215e1051a39Sopenharmony_ci	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
1216e1051a39Sopenharmony_ci	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
1217e1051a39Sopenharmony_ci	mov	%rdx,$bi
1218e1051a39Sopenharmony_ci	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
1219e1051a39Sopenharmony_ci	adox	-4*8($tptr),$mi
1220e1051a39Sopenharmony_ci	adcx	%r14,%r11
1221e1051a39Sopenharmony_ci	mulx	2*8($aptr),%r15,%r13	# ...
1222e1051a39Sopenharmony_ci	adox	-3*8($tptr),%r11
1223e1051a39Sopenharmony_ci	adcx	%r15,%r12
1224e1051a39Sopenharmony_ci	adox	-2*8($tptr),%r12
1225e1051a39Sopenharmony_ci	adcx	$zero,%r13
1226e1051a39Sopenharmony_ci	adox	$zero,%r13
1227e1051a39Sopenharmony_ci
1228e1051a39Sopenharmony_ci	mov	$bptr,8(%rsp)		# off-load &b[i]
1229e1051a39Sopenharmony_ci	mov	$mi,%r15
1230e1051a39Sopenharmony_ci	imulq	24(%rsp),$mi		# "t[0]"*n0
1231e1051a39Sopenharmony_ci	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
1232e1051a39Sopenharmony_ci
1233e1051a39Sopenharmony_ci	mulx	3*8($aptr),%rax,%r14
1234e1051a39Sopenharmony_ci	 mov	$mi,%rdx
1235e1051a39Sopenharmony_ci	adcx	%rax,%r13
1236e1051a39Sopenharmony_ci	adox	-1*8($tptr),%r13
1237e1051a39Sopenharmony_ci	adcx	$zero,%r14
1238e1051a39Sopenharmony_ci	lea	4*8($aptr),$aptr
1239e1051a39Sopenharmony_ci	adox	$zero,%r14
1240e1051a39Sopenharmony_ci
1241e1051a39Sopenharmony_ci	mulx	0*8($nptr),%rax,%r10
1242e1051a39Sopenharmony_ci	adcx	%rax,%r15		# discarded
1243e1051a39Sopenharmony_ci	adox	%r11,%r10
1244e1051a39Sopenharmony_ci	mulx	1*8($nptr),%rax,%r11
1245e1051a39Sopenharmony_ci	adcx	%rax,%r10
1246e1051a39Sopenharmony_ci	adox	%r12,%r11
1247e1051a39Sopenharmony_ci	mulx	2*8($nptr),%rax,%r12
1248e1051a39Sopenharmony_ci	mov	%r10,-4*8($tptr)
1249e1051a39Sopenharmony_ci	adcx	%rax,%r11
1250e1051a39Sopenharmony_ci	adox	%r13,%r12
1251e1051a39Sopenharmony_ci	mulx	3*8($nptr),%rax,%r15
1252e1051a39Sopenharmony_ci	 mov	$bi,%rdx
1253e1051a39Sopenharmony_ci	mov	%r11,-3*8($tptr)
1254e1051a39Sopenharmony_ci	lea	4*8($nptr),$nptr
1255e1051a39Sopenharmony_ci	adcx	%rax,%r12
1256e1051a39Sopenharmony_ci	adox	$zero,%r15		# of=0
1257e1051a39Sopenharmony_ci	mov	48(%rsp),$bptr		# counter value
1258e1051a39Sopenharmony_ci	mov	%r12,-2*8($tptr)
1259e1051a39Sopenharmony_ci
1260e1051a39Sopenharmony_ci	jmp	.Lmulx4x_inner
1261e1051a39Sopenharmony_ci
1262e1051a39Sopenharmony_ci.align	32
1263e1051a39Sopenharmony_ci.Lmulx4x_inner:
1264e1051a39Sopenharmony_ci	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
1265e1051a39Sopenharmony_ci	adcx	$zero,%r15		# cf=0, modulo-scheduled
1266e1051a39Sopenharmony_ci	adox	%r14,%r10
1267e1051a39Sopenharmony_ci	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
1268e1051a39Sopenharmony_ci	adcx	0*8($tptr),%r10
1269e1051a39Sopenharmony_ci	adox	%rax,%r11
1270e1051a39Sopenharmony_ci	mulx	2*8($aptr),%r12,%rax	# ...
1271e1051a39Sopenharmony_ci	adcx	1*8($tptr),%r11
1272e1051a39Sopenharmony_ci	adox	%r14,%r12
1273e1051a39Sopenharmony_ci	mulx	3*8($aptr),%r13,%r14
1274e1051a39Sopenharmony_ci	 mov	$mi,%rdx
1275e1051a39Sopenharmony_ci	adcx	2*8($tptr),%r12
1276e1051a39Sopenharmony_ci	adox	%rax,%r13
1277e1051a39Sopenharmony_ci	adcx	3*8($tptr),%r13
1278e1051a39Sopenharmony_ci	adox	$zero,%r14		# of=0
1279e1051a39Sopenharmony_ci	lea	4*8($aptr),$aptr
1280e1051a39Sopenharmony_ci	lea	4*8($tptr),$tptr
1281e1051a39Sopenharmony_ci	adcx	$zero,%r14		# cf=0
1282e1051a39Sopenharmony_ci
1283e1051a39Sopenharmony_ci	adox	%r15,%r10
1284e1051a39Sopenharmony_ci	mulx	0*8($nptr),%rax,%r15
1285e1051a39Sopenharmony_ci	adcx	%rax,%r10
1286e1051a39Sopenharmony_ci	adox	%r15,%r11
1287e1051a39Sopenharmony_ci	mulx	1*8($nptr),%rax,%r15
1288e1051a39Sopenharmony_ci	adcx	%rax,%r11
1289e1051a39Sopenharmony_ci	adox	%r15,%r12
1290e1051a39Sopenharmony_ci	mulx	2*8($nptr),%rax,%r15
1291e1051a39Sopenharmony_ci	mov	%r10,-5*8($tptr)
1292e1051a39Sopenharmony_ci	adcx	%rax,%r12
1293e1051a39Sopenharmony_ci	adox	%r15,%r13
1294e1051a39Sopenharmony_ci	mulx	3*8($nptr),%rax,%r15
1295e1051a39Sopenharmony_ci	 mov	$bi,%rdx
1296e1051a39Sopenharmony_ci	mov	%r11,-4*8($tptr)
1297e1051a39Sopenharmony_ci	mov	%r12,-3*8($tptr)
1298e1051a39Sopenharmony_ci	adcx	%rax,%r13
1299e1051a39Sopenharmony_ci	adox	$zero,%r15
1300e1051a39Sopenharmony_ci	lea	4*8($nptr),$nptr
1301e1051a39Sopenharmony_ci	mov	%r13,-2*8($tptr)
1302e1051a39Sopenharmony_ci
1303e1051a39Sopenharmony_ci	dec	$bptr			# of=0, pass cf
1304e1051a39Sopenharmony_ci	jnz	.Lmulx4x_inner
1305e1051a39Sopenharmony_ci
1306e1051a39Sopenharmony_ci	mov	0(%rsp),$num		# load num
1307e1051a39Sopenharmony_ci	mov	8(%rsp),$bptr		# re-load &b[i]
1308e1051a39Sopenharmony_ci	adc	$zero,%r15		# modulo-scheduled
1309e1051a39Sopenharmony_ci	sub	0*8($tptr),$zero	# pull top-most carry
1310e1051a39Sopenharmony_ci	adc	%r15,%r14
1311e1051a39Sopenharmony_ci	sbb	%r15,%r15		# top-most carry
1312e1051a39Sopenharmony_ci	mov	%r14,-1*8($tptr)
1313e1051a39Sopenharmony_ci
1314e1051a39Sopenharmony_ci	cmp	16(%rsp),$bptr
1315e1051a39Sopenharmony_ci	jne	.Lmulx4x_outer
1316e1051a39Sopenharmony_ci
1317e1051a39Sopenharmony_ci	lea	64(%rsp),$tptr
1318e1051a39Sopenharmony_ci	sub	$num,$nptr		# rewind $nptr
1319e1051a39Sopenharmony_ci	neg	%r15
1320e1051a39Sopenharmony_ci	mov	$num,%rdx
1321e1051a39Sopenharmony_ci	shr	\$3+2,$num		# %cf=0
1322e1051a39Sopenharmony_ci	mov	32(%rsp),$rptr		# restore rp
1323e1051a39Sopenharmony_ci	jmp	.Lmulx4x_sub
1324e1051a39Sopenharmony_ci
1325e1051a39Sopenharmony_ci.align	32
1326e1051a39Sopenharmony_ci.Lmulx4x_sub:
1327e1051a39Sopenharmony_ci	mov	8*0($tptr),%r11
1328e1051a39Sopenharmony_ci	mov	8*1($tptr),%r12
1329e1051a39Sopenharmony_ci	mov	8*2($tptr),%r13
1330e1051a39Sopenharmony_ci	mov	8*3($tptr),%r14
1331e1051a39Sopenharmony_ci	lea	8*4($tptr),$tptr
1332e1051a39Sopenharmony_ci	sbb	8*0($nptr),%r11
1333e1051a39Sopenharmony_ci	sbb	8*1($nptr),%r12
1334e1051a39Sopenharmony_ci	sbb	8*2($nptr),%r13
1335e1051a39Sopenharmony_ci	sbb	8*3($nptr),%r14
1336e1051a39Sopenharmony_ci	lea	8*4($nptr),$nptr
1337e1051a39Sopenharmony_ci	mov	%r11,8*0($rptr)
1338e1051a39Sopenharmony_ci	mov	%r12,8*1($rptr)
1339e1051a39Sopenharmony_ci	mov	%r13,8*2($rptr)
1340e1051a39Sopenharmony_ci	mov	%r14,8*3($rptr)
1341e1051a39Sopenharmony_ci	lea	8*4($rptr),$rptr
1342e1051a39Sopenharmony_ci	dec	$num			# preserves %cf
1343e1051a39Sopenharmony_ci	jnz	.Lmulx4x_sub
1344e1051a39Sopenharmony_ci
1345e1051a39Sopenharmony_ci	sbb	\$0,%r15		# top-most carry
1346e1051a39Sopenharmony_ci	lea	64(%rsp),$tptr
1347e1051a39Sopenharmony_ci	sub	%rdx,$rptr		# rewind
1348e1051a39Sopenharmony_ci
1349e1051a39Sopenharmony_ci	movq	%r15,%xmm1
1350e1051a39Sopenharmony_ci	pxor	%xmm0,%xmm0
1351e1051a39Sopenharmony_ci	pshufd	\$0,%xmm1,%xmm1
1352e1051a39Sopenharmony_ci	mov	40(%rsp),%rsi		# restore %rsp
1353e1051a39Sopenharmony_ci.cfi_def_cfa	%rsi,8
1354e1051a39Sopenharmony_ci	jmp	.Lmulx4x_cond_copy
1355e1051a39Sopenharmony_ci
1356e1051a39Sopenharmony_ci.align	32
1357e1051a39Sopenharmony_ci.Lmulx4x_cond_copy:
1358e1051a39Sopenharmony_ci	movdqa	16*0($tptr),%xmm2
1359e1051a39Sopenharmony_ci	movdqa	16*1($tptr),%xmm3
1360e1051a39Sopenharmony_ci	lea	16*2($tptr),$tptr
1361e1051a39Sopenharmony_ci	movdqu	16*0($rptr),%xmm4
1362e1051a39Sopenharmony_ci	movdqu	16*1($rptr),%xmm5
1363e1051a39Sopenharmony_ci	lea	16*2($rptr),$rptr
1364e1051a39Sopenharmony_ci	movdqa	%xmm0,-16*2($tptr)	# zero tp
1365e1051a39Sopenharmony_ci	movdqa	%xmm0,-16*1($tptr)
1366e1051a39Sopenharmony_ci	pcmpeqd	%xmm1,%xmm0
1367e1051a39Sopenharmony_ci	pand	%xmm1,%xmm2
1368e1051a39Sopenharmony_ci	pand	%xmm1,%xmm3
1369e1051a39Sopenharmony_ci	pand	%xmm0,%xmm4
1370e1051a39Sopenharmony_ci	pand	%xmm0,%xmm5
1371e1051a39Sopenharmony_ci	pxor	%xmm0,%xmm0
1372e1051a39Sopenharmony_ci	por	%xmm2,%xmm4
1373e1051a39Sopenharmony_ci	por	%xmm3,%xmm5
1374e1051a39Sopenharmony_ci	movdqu	%xmm4,-16*2($rptr)
1375e1051a39Sopenharmony_ci	movdqu	%xmm5,-16*1($rptr)
1376e1051a39Sopenharmony_ci	sub	\$32,%rdx
1377e1051a39Sopenharmony_ci	jnz	.Lmulx4x_cond_copy
1378e1051a39Sopenharmony_ci
1379e1051a39Sopenharmony_ci	mov	%rdx,($tptr)
1380e1051a39Sopenharmony_ci
1381e1051a39Sopenharmony_ci	mov	\$1,%rax
1382e1051a39Sopenharmony_ci	mov	-48(%rsi),%r15
1383e1051a39Sopenharmony_ci.cfi_restore	%r15
1384e1051a39Sopenharmony_ci	mov	-40(%rsi),%r14
1385e1051a39Sopenharmony_ci.cfi_restore	%r14
1386e1051a39Sopenharmony_ci	mov	-32(%rsi),%r13
1387e1051a39Sopenharmony_ci.cfi_restore	%r13
1388e1051a39Sopenharmony_ci	mov	-24(%rsi),%r12
1389e1051a39Sopenharmony_ci.cfi_restore	%r12
1390e1051a39Sopenharmony_ci	mov	-16(%rsi),%rbp
1391e1051a39Sopenharmony_ci.cfi_restore	%rbp
1392e1051a39Sopenharmony_ci	mov	-8(%rsi),%rbx
1393e1051a39Sopenharmony_ci.cfi_restore	%rbx
1394e1051a39Sopenharmony_ci	lea	(%rsi),%rsp
1395e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
1396e1051a39Sopenharmony_ci.Lmulx4x_epilogue:
1397e1051a39Sopenharmony_ci	ret
1398e1051a39Sopenharmony_ci.cfi_endproc
1399e1051a39Sopenharmony_ci.size	bn_mulx4x_mont,.-bn_mulx4x_mont
1400e1051a39Sopenharmony_ci___
1401e1051a39Sopenharmony_ci}}}
# Append the identification string that is embedded in the generated
# object, then align whatever follows to a 16-byte boundary.  The
# backslash stops Perl from interpolating "@openssl" as an array inside
# the here-document.
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___
1406e1051a39Sopenharmony_ci
# Win64 structured exception handling (SEH) support, generated only when
# $win64 is set: two language-specific handlers plus the .pdata/.xdata
# tables that attach them to the Montgomery routines.
#
# Both handlers have the signature
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# and work the same way: compare context->Rip with the label RVAs stored
# in the function's HandlerData[] (rebased on disp->ImageBase) to find out
# how far the interrupted function got, pick the matching stack pointer
# value, write the saved registers back into *context, copy *context to
# disp->ContextRecord, call RtlVirtualUnwind and return
# ExceptionContinueSearch.
#
#   mul_handler  HandlerData[] = { end-of-prologue label, epilogue label }
#   sqr_handler  HandlerData[] = { prologue label, body label,
#                                  epilogue label }
#
# mul_handler has no tail of its own; it jumps to .Lcommon_pop_regs and
# .Lcommon_seh_tail, which are located inside sqr_handler.
if ($win64) {
# Registers that carry the four handler arguments.  Only $context and
# $disp are referenced by the handlers below.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# NOTE: text inside the here-documents is interpolated by Perl, so the
# assembly comments must not contain stray dollar or at signs.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	# save every register this handler modifies, plus the flags, and
	# reserve 64 bytes for the outgoing RtlVirtualUnwind arguments
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	# Rip lies between the two labels: the saved stack pointer is
	# fetched from the frame, indexed by the value at context offset 192
	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	# same register save area and scratch space as mul_handler
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_prologue
	jb	.Lcommon_seh_tail

	# between the prologue and body labels %rax still holds
	# context->Rax, which is used as the base for the register reload
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# body label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

	# shared by both handlers: reload the six registers stored just
	# below the address in %rax and publish them in the context
.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	# shared by both handlers: %rax is the value reported as
	# context->Rsp; Rdi and Rsi are reloaded from 8(%rax) and 16(%rax)
.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	# copy 154 quadwords from context to disp->ContextRecord
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	# RtlVirtualUnwind takes eight arguments: four in registers,
	# four in the scratch area reserved on entry
	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

# one (begin, end, unwind info) RVA triple per covered function
.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
# The ADX/BMI2 routine only gets a .pdata entry when $addx is set.
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
# Unwind info records.  Each starts with the four header bytes 9,0,0,0
# (per the Win64 UNWIND_INFO layout: version 1 with UNW_FLAG_EHANDLER,
# no prolog bytes, no unwind codes, no frame register), followed by the
# handler RVA and the label RVAs the handler reads as HandlerData[].
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue		# HandlerData[]
.align	8
___
# bn_mulx4x_mont supplies three labels and therefore uses sqr_handler.
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
___
}
1592e1051a39Sopenharmony_ci
# Emit the assembly text accumulated in $code.  The close is checked so
# that a write error surfacing at the final flush aborts the script
# instead of silently leaving truncated output.
print $code;
close STDOUT or die "error closing STDOUT: $!";