1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci
17e1051a39Sopenharmony_ci# October 2005
18e1051a39Sopenharmony_ci#
19e1051a39Sopenharmony_ci# This is a "teaser" code, as it can be improved in several ways...
20e1051a39Sopenharmony_ci# First of all non-SSE2 path should be implemented (yes, for now it
21e1051a39Sopenharmony_ci# performs Montgomery multiplication/convolution only on SSE2-capable
22e1051a39Sopenharmony_ci# CPUs such as P4, others fall down to original code). Then inner loop
23e1051a39Sopenharmony_ci# can be unrolled and modulo-scheduled to improve ILP and possibly
24e1051a39Sopenharmony_ci# moved to 128-bit XMM register bank (though it would require input
25e1051a39Sopenharmony_ci# rearrangement and/or increase bus bandwidth utilization). Dedicated
26e1051a39Sopenharmony_ci# squaring procedure should give further performance improvement...
27e1051a39Sopenharmony_ci# Yet, for being draft, the code improves rsa512 *sign* benchmark by
28e1051a39Sopenharmony_ci# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
29e1051a39Sopenharmony_ci
30e1051a39Sopenharmony_ci# December 2006
31e1051a39Sopenharmony_ci#
32e1051a39Sopenharmony_ci# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
33e1051a39Sopenharmony_ci# Integer-only code [being equipped with dedicated squaring procedure]
34e1051a39Sopenharmony_ci# gives ~40% on rsa512 sign benchmark...
35e1051a39Sopenharmony_ci
# Script setup.  The directory part of $0 is captured into $dir and both
# $dir and $dir/../../perlasm are appended to @INC so that x86asm.pl (which
# supplies the &mov/&lea/&label/... helpers used below) can be loaded.
36e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
37e1051a39Sopenharmony_cipush(@INC,"${dir}","${dir}../../perlasm");
38e1051a39Sopenharmony_cirequire "x86asm.pl";
39e1051a39Sopenharmony_ci
# The last command-line argument is popped off @ARGV and, when true, used as
# the output file name: STDOUT is reopened onto it so everything the helpers
# print lands in that file.
# NOTE(review): this is a two-argument open whose result is not checked, so a
# failure to open the file is not reported here.
40e1051a39Sopenharmony_ci$output = pop and open STDOUT,">$output";
41e1051a39Sopenharmony_ci
# The first remaining argument is handed to asm_init (defined in x86asm.pl).
42e1051a39Sopenharmony_ci&asm_init($ARGV[0]);
43e1051a39Sopenharmony_ci
# $sse2 becomes 1 when any remaining argument contains -DOPENSSL_IA32_SSE2;
# it decides at generation time whether the SSE2 code path is emitted.
44e1051a39Sopenharmony_ci$sse2=0;
45e1051a39Sopenharmony_cifor (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
46e1051a39Sopenharmony_ci
# The capability vector is only referenced by the SSE2 path, so it is only
# declared as an external label when that path is generated.
47e1051a39Sopenharmony_ci&external_label("OPENSSL_ia32cap_P") if ($sse2);
48e1051a39Sopenharmony_ci
# Start of the generated bn_mul_mont routine.  This part emits the argument
# check, carves a scratch frame plus the temporary vector tp out of the
# stack, touches every page of that area, and copies the arguments into the
# frame.  Per the comments on the argument-block loads below, the arguments
# are, in order: rp, ap, bp, np, n0, num.
49e1051a39Sopenharmony_ci&function_begin("bn_mul_mont");
50e1051a39Sopenharmony_ci
# Symbolic register names.  Several names deliberately share one physical
# register, so only one of each pair is live at any given time.
51e1051a39Sopenharmony_ci$i="edx";
52e1051a39Sopenharmony_ci$j="ecx";
53e1051a39Sopenharmony_ci$ap="esi";	$tp="esi";		# overlapping variables!!!
54e1051a39Sopenharmony_ci$rp="edi";	$bp="edi";		# overlapping variables!!!
55e1051a39Sopenharmony_ci$np="ebp";
56e1051a39Sopenharmony_ci$num="ebx";
57e1051a39Sopenharmony_ci
# Eight dword slots at the bottom of the new stack area; tp[] starts right
# after them at offset $frame from esp.
58e1051a39Sopenharmony_ci$_num=&DWP(4*0,"esp");			# stack top layout
59e1051a39Sopenharmony_ci$_rp=&DWP(4*1,"esp");
60e1051a39Sopenharmony_ci$_ap=&DWP(4*2,"esp");
61e1051a39Sopenharmony_ci$_bp=&DWP(4*3,"esp");
62e1051a39Sopenharmony_ci$_np=&DWP(4*4,"esp");
63e1051a39Sopenharmony_ci$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
64e1051a39Sopenharmony_ci$_sp=&DWP(4*6,"esp");
65e1051a39Sopenharmony_ci$_bpend=&DWP(4*7,"esp");
66e1051a39Sopenharmony_ci$frame=32;				# size of above frame rounded up to 16n
67e1051a39Sopenharmony_ci
	# Bail out with eax=0 when num < 4; the jump lands on "just_leave",
	# directly in front of the function epilogue.
68e1051a39Sopenharmony_ci	&xor	("eax","eax");
69e1051a39Sopenharmony_ci	&mov	("edi",&wparam(5));	# int num
70e1051a39Sopenharmony_ci	&cmp	("edi",4);
71e1051a39Sopenharmony_ci	&jl	(&label("just_leave"));
72e1051a39Sopenharmony_ci
	# NOTE(review): the second instruction below is a lea, so edx
	# presumably receives the address of the ap argument slot rather than
	# the value of ap itself, despite the "load ap" remark - confirm
	# against x86asm.pl's wparam().
73e1051a39Sopenharmony_ci	&lea	("esi",&wparam(0));	# put aside pointer to argument block
74e1051a39Sopenharmony_ci	&lea	("edx",&wparam(1));	# load ap
	# edi = num+2 is negated only to form the address below, then negated
	# back so that edi still holds num+2 afterwards.
75e1051a39Sopenharmony_ci	&add	("edi",2);		# extra two words on top of tp
76e1051a39Sopenharmony_ci	&neg	("edi");
77e1051a39Sopenharmony_ci	&lea	("ebp",&DWP(-$frame,"esp","edi",4));	# future alloca($frame+4*(num+2))
78e1051a39Sopenharmony_ci	&neg	("edi");
79e1051a39Sopenharmony_ci
80e1051a39Sopenharmony_ci	# minimize cache contention by arranging 2K window between stack
81e1051a39Sopenharmony_ci	# pointer and ap argument [np is also position sensitive vector,
82e1051a39Sopenharmony_ci	# but it's assumed to be near ap, as it's allocated at ~same
83e1051a39Sopenharmony_ci	# time].
84e1051a39Sopenharmony_ci	&mov	("eax","ebp");
85e1051a39Sopenharmony_ci	&sub	("eax","edx");
86e1051a39Sopenharmony_ci	&and	("eax",2047);
87e1051a39Sopenharmony_ci	&sub	("ebp","eax");		# this aligns sp and ap modulo 2048
88e1051a39Sopenharmony_ci
89e1051a39Sopenharmony_ci	&xor	("edx","ebp");
90e1051a39Sopenharmony_ci	&and	("edx",2048);
91e1051a39Sopenharmony_ci	&xor	("edx",2048);
92e1051a39Sopenharmony_ci	&sub	("ebp","edx");		# this splits them apart modulo 4096
93e1051a39Sopenharmony_ci
94e1051a39Sopenharmony_ci	&and	("ebp",-64);		# align to cache line
95e1051a39Sopenharmony_ci
96e1051a39Sopenharmony_ci	# An OS-agnostic version of __chkstk.
97e1051a39Sopenharmony_ci	#
98e1051a39Sopenharmony_ci	# Some OSes (Windows) insist on stack being "wired" to
99e1051a39Sopenharmony_ci	# physical memory in strictly sequential manner, i.e. if stack
100e1051a39Sopenharmony_ci	# allocation spans two pages, then reference to farthest one can
101e1051a39Sopenharmony_ci	# be punishable by SEGV. But page walking can do good even on
102e1051a39Sopenharmony_ci	# other OSes, because it guarantees that villain thread hits
103e1051a39Sopenharmony_ci	# the guard page before it can make damage to innocent one...
	#
	# eax = (esp - ebp) rounded down to a multiple of 4096, so the first
	# probe at ebp+eax lies less than one page below the old esp.  esp is
	# then lowered 4096 bytes at a time, reading one dword at each stop,
	# until it is no longer above ebp; since the distance is a multiple
	# of 4096 the walk ends with esp equal to ebp.
104e1051a39Sopenharmony_ci	&mov	("eax","esp");
105e1051a39Sopenharmony_ci	&sub	("eax","ebp");
106e1051a39Sopenharmony_ci	&and	("eax",-4096);
107e1051a39Sopenharmony_ci	&mov	("edx","esp");		# saved stack pointer!
108e1051a39Sopenharmony_ci	&lea	("esp",&DWP(0,"ebp","eax"));
109e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,"esp"));
110e1051a39Sopenharmony_ci	&cmp	("esp","ebp");
111e1051a39Sopenharmony_ci	&ja	(&label("page_walk"));
112e1051a39Sopenharmony_ci	&jmp	(&label("page_walk_done"));
113e1051a39Sopenharmony_ci
114e1051a39Sopenharmony_ci&set_label("page_walk",16);
115e1051a39Sopenharmony_ci	&lea	("esp",&DWP(-4096,"esp"));
116e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,"esp"));
117e1051a39Sopenharmony_ci	&cmp	("esp","ebp");
118e1051a39Sopenharmony_ci	&ja	(&label("page_walk"));
119e1051a39Sopenharmony_ci&set_label("page_walk_done");
120e1051a39Sopenharmony_ci
	# esi still points at the argument block saved before esp was moved.
	# It is overwritten last, first with the n0 pointer and then with the
	# word n0 points to.
121e1051a39Sopenharmony_ci	################################# load argument block...
122e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
123e1051a39Sopenharmony_ci	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
124e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
125e1051a39Sopenharmony_ci	&mov	("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
126e1051a39Sopenharmony_ci	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
127e1051a39Sopenharmony_ci	#&mov	("edi",&DWP(5*4,"esi"));# int num
128e1051a39Sopenharmony_ci
129e1051a39Sopenharmony_ci	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
130e1051a39Sopenharmony_ci	&mov	($_rp,"eax");		# ... save a copy of argument block
131e1051a39Sopenharmony_ci	&mov	($_ap,"ebx");
132e1051a39Sopenharmony_ci	&mov	($_bp,"ecx");
133e1051a39Sopenharmony_ci	&mov	($_np,"ebp");
134e1051a39Sopenharmony_ci	&mov	($_n0,"esi");
	# edi holds num+2 at this point, so edi-3 is num-1.  From here on the
	# $num register holds num-1, not num.
135e1051a39Sopenharmony_ci	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
136e1051a39Sopenharmony_ci	#&mov	($_num,$num);		# redundant as $num is not reused
137e1051a39Sopenharmony_ci	&mov	($_sp,"edx");		# saved stack pointer!
138e1051a39Sopenharmony_ci
# SSE2 code path, emitted only when -DOPENSSL_IA32_SSE2 was passed to this
# script.  At run time the generated code tests bit 26 of the first dword of
# OPENSSL_ia32cap_P and branches to "non_sse2" (the integer code emitted
# after this block) when that bit is clear.  All arithmetic is done in the
# MMX register bank, using pmuludq products with 64-bit running sums whose
# upper halves carry over to the next word.
139e1051a39Sopenharmony_ciif($sse2) {
140e1051a39Sopenharmony_ci$acc0="mm0";	# mmx register bank layout
141e1051a39Sopenharmony_ci$acc1="mm1";
142e1051a39Sopenharmony_ci$car0="mm2";
143e1051a39Sopenharmony_ci$car1="mm3";
144e1051a39Sopenharmony_ci$mul0="mm4";
145e1051a39Sopenharmony_ci$mul1="mm5";
146e1051a39Sopenharmony_ci$temp="mm6";
147e1051a39Sopenharmony_ci$mask="mm7";
148e1051a39Sopenharmony_ci
149e1051a39Sopenharmony_ci	&picmeup("eax","OPENSSL_ia32cap_P");
150e1051a39Sopenharmony_ci	&bt	(&DWP(0,"eax"),26);
151e1051a39Sopenharmony_ci	&jnc	(&label("non_sse2"));
152e1051a39Sopenharmony_ci
153e1051a39Sopenharmony_ci	&mov	("eax",-1);
154e1051a39Sopenharmony_ci	&movd	($mask,"eax");		# mask 32 lower bits
155e1051a39Sopenharmony_ci
156e1051a39Sopenharmony_ci	&mov	($ap,$_ap);		# load input pointers
157e1051a39Sopenharmony_ci	&mov	($bp,$_bp);
158e1051a39Sopenharmony_ci	&mov	($np,$_np);
159e1051a39Sopenharmony_ci
160e1051a39Sopenharmony_ci	&xor	($i,$i);		# i=0
161e1051a39Sopenharmony_ci	&xor	($j,$j);		# j=0
162e1051a39Sopenharmony_ci
	# First pass (i=0).  $mul0 holds bp[0]; $mul1 becomes the low word
	# of ap[0]*bp[0] multiplied by n0 (called m1 in the loop comments).
	# Nothing is read from tp[] during this pass.
163e1051a39Sopenharmony_ci	&movd	($mul0,&DWP(0,$bp));		# bp[0]
164e1051a39Sopenharmony_ci	&movd	($mul1,&DWP(0,$ap));		# ap[0]
165e1051a39Sopenharmony_ci	&movd	($car1,&DWP(0,$np));		# np[0]
166e1051a39Sopenharmony_ci
167e1051a39Sopenharmony_ci	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
168e1051a39Sopenharmony_ci	&movq	($car0,$mul1);
169e1051a39Sopenharmony_ci	&movq	($acc0,$mul1);			# I wish movd worked for
170e1051a39Sopenharmony_ci	&pand	($acc0,$mask);			# inter-register transfers
171e1051a39Sopenharmony_ci
172e1051a39Sopenharmony_ci	&pmuludq($mul1,$_n0q);			# *=n0
173e1051a39Sopenharmony_ci
174e1051a39Sopenharmony_ci	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
175e1051a39Sopenharmony_ci	&paddq	($car1,$acc0);
176e1051a39Sopenharmony_ci
177e1051a39Sopenharmony_ci	&movd	($acc1,&DWP(4,$np));		# np[1]
178e1051a39Sopenharmony_ci	&movd	($acc0,&DWP(4,$ap));		# ap[1]
179e1051a39Sopenharmony_ci
180e1051a39Sopenharmony_ci	&psrlq	($car0,32);
181e1051a39Sopenharmony_ci	&psrlq	($car1,32);
182e1051a39Sopenharmony_ci
183e1051a39Sopenharmony_ci	&inc	($j);				# j++
	# Loads of ap[j+1]/np[j+1] for the next iteration are issued inside
	# the current one; the final word is handled after the loop.
184e1051a39Sopenharmony_ci&set_label("1st",16);
185e1051a39Sopenharmony_ci	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
186e1051a39Sopenharmony_ci	&pmuludq($acc1,$mul1);			# np[j]*m1
187e1051a39Sopenharmony_ci	&paddq	($car0,$acc0);			# +=c0
188e1051a39Sopenharmony_ci	&paddq	($car1,$acc1);			# +=c1
189e1051a39Sopenharmony_ci
190e1051a39Sopenharmony_ci	&movq	($acc0,$car0);
191e1051a39Sopenharmony_ci	&pand	($acc0,$mask);
192e1051a39Sopenharmony_ci	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
193e1051a39Sopenharmony_ci	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
194e1051a39Sopenharmony_ci	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
195e1051a39Sopenharmony_ci	&psrlq	($car0,32);
196e1051a39Sopenharmony_ci	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
197e1051a39Sopenharmony_ci	&psrlq	($car1,32);
198e1051a39Sopenharmony_ci
199e1051a39Sopenharmony_ci	&lea	($j,&DWP(1,$j));
200e1051a39Sopenharmony_ci	&cmp	($j,$num);
201e1051a39Sopenharmony_ci	&jl	(&label("1st"));
202e1051a39Sopenharmony_ci
	# Tail of the first pass: last word, then the two top words of tp
	# are stored together as one quadword.
203e1051a39Sopenharmony_ci	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
204e1051a39Sopenharmony_ci	&pmuludq($acc1,$mul1);			# np[num-1]*m1
205e1051a39Sopenharmony_ci	&paddq	($car0,$acc0);			# +=c0
206e1051a39Sopenharmony_ci	&paddq	($car1,$acc1);			# +=c1
207e1051a39Sopenharmony_ci
208e1051a39Sopenharmony_ci	&movq	($acc0,$car0);
209e1051a39Sopenharmony_ci	&pand	($acc0,$mask);
210e1051a39Sopenharmony_ci	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
211e1051a39Sopenharmony_ci	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
212e1051a39Sopenharmony_ci
213e1051a39Sopenharmony_ci	&psrlq	($car0,32);
214e1051a39Sopenharmony_ci	&psrlq	($car1,32);
215e1051a39Sopenharmony_ci
216e1051a39Sopenharmony_ci	&paddq	($car1,$car0);
217e1051a39Sopenharmony_ci	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
218e1051a39Sopenharmony_ci
219e1051a39Sopenharmony_ci	&inc	($i);				# i++
	# Remaining passes: every trip through "outer" handles one bp[i] and,
	# unlike the first pass, also adds in the previous contents of tp[].
220e1051a39Sopenharmony_ci&set_label("outer");
221e1051a39Sopenharmony_ci	&xor	($j,$j);			# j=0
222e1051a39Sopenharmony_ci
223e1051a39Sopenharmony_ci	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
224e1051a39Sopenharmony_ci	&movd	($mul1,&DWP(0,$ap));		# ap[0]
225e1051a39Sopenharmony_ci	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
226e1051a39Sopenharmony_ci	&movd	($car1,&DWP(0,$np));		# np[0]
227e1051a39Sopenharmony_ci	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
228e1051a39Sopenharmony_ci
229e1051a39Sopenharmony_ci	&paddq	($mul1,$temp);			# +=tp[0]
230e1051a39Sopenharmony_ci	&movq	($acc0,$mul1);
231e1051a39Sopenharmony_ci	&movq	($car0,$mul1);
232e1051a39Sopenharmony_ci	&pand	($acc0,$mask);
233e1051a39Sopenharmony_ci
234e1051a39Sopenharmony_ci	&pmuludq($mul1,$_n0q);			# *=n0
235e1051a39Sopenharmony_ci
236e1051a39Sopenharmony_ci	&pmuludq($car1,$mul1);
237e1051a39Sopenharmony_ci	&paddq	($car1,$acc0);
238e1051a39Sopenharmony_ci
239e1051a39Sopenharmony_ci	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
240e1051a39Sopenharmony_ci	&movd	($acc1,&DWP(4,$np));		# np[1]
241e1051a39Sopenharmony_ci	&movd	($acc0,&DWP(4,$ap));		# ap[1]
242e1051a39Sopenharmony_ci
243e1051a39Sopenharmony_ci	&psrlq	($car0,32);
244e1051a39Sopenharmony_ci	&psrlq	($car1,32);
245e1051a39Sopenharmony_ci	&paddq	($car0,$temp);			# +=tp[1]
246e1051a39Sopenharmony_ci
247e1051a39Sopenharmony_ci	&inc	($j);				# j++
	# In this loop $num serves as the down-counter (the loop ends when it
	# reaches zero); it is reloaded from $j right after the loop.
248e1051a39Sopenharmony_ci	&dec	($num);
249e1051a39Sopenharmony_ci&set_label("inner");
250e1051a39Sopenharmony_ci	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
251e1051a39Sopenharmony_ci	&pmuludq($acc1,$mul1);			# np[j]*m1
252e1051a39Sopenharmony_ci	&paddq	($car0,$acc0);			# +=c0
253e1051a39Sopenharmony_ci	&paddq	($car1,$acc1);			# +=c1
254e1051a39Sopenharmony_ci
255e1051a39Sopenharmony_ci	&movq	($acc0,$car0);
256e1051a39Sopenharmony_ci	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
257e1051a39Sopenharmony_ci	&pand	($acc0,$mask);
258e1051a39Sopenharmony_ci	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
259e1051a39Sopenharmony_ci	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
260e1051a39Sopenharmony_ci	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
261e1051a39Sopenharmony_ci	&psrlq	($car0,32);
262e1051a39Sopenharmony_ci	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
263e1051a39Sopenharmony_ci	&psrlq	($car1,32);
264e1051a39Sopenharmony_ci	&paddq	($car0,$temp);			# +=tp[j+1]
265e1051a39Sopenharmony_ci
	# The lea does not touch the flags, so jnz tests the result of dec.
266e1051a39Sopenharmony_ci	&dec	($num);
267e1051a39Sopenharmony_ci	&lea	($j,&DWP(1,$j));		# j++
268e1051a39Sopenharmony_ci	&jnz	(&label("inner"));
269e1051a39Sopenharmony_ci
	# $j now equals the value $num had on entry to "outer"; put it back.
270e1051a39Sopenharmony_ci	&mov	($num,$j);
271e1051a39Sopenharmony_ci	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
272e1051a39Sopenharmony_ci	&pmuludq($acc1,$mul1);			# np[num-1]*m1
273e1051a39Sopenharmony_ci	&paddq	($car0,$acc0);			# +=c0
274e1051a39Sopenharmony_ci	&paddq	($car1,$acc1);			# +=c1
275e1051a39Sopenharmony_ci
276e1051a39Sopenharmony_ci	&movq	($acc0,$car0);
277e1051a39Sopenharmony_ci	&pand	($acc0,$mask);
278e1051a39Sopenharmony_ci	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
279e1051a39Sopenharmony_ci	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
280e1051a39Sopenharmony_ci	&psrlq	($car0,32);
281e1051a39Sopenharmony_ci	&psrlq	($car1,32);
282e1051a39Sopenharmony_ci
283e1051a39Sopenharmony_ci	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
284e1051a39Sopenharmony_ci	&paddq	($car1,$car0);
285e1051a39Sopenharmony_ci	&paddq	($car1,$temp);
286e1051a39Sopenharmony_ci	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
287e1051a39Sopenharmony_ci
	# $num holds num-1, so the loop continues while i <= num-1.
288e1051a39Sopenharmony_ci	&lea	($i,&DWP(1,$i));		# i++
289e1051a39Sopenharmony_ci	&cmp	($i,$num);
290e1051a39Sopenharmony_ci	&jle	(&label("outer"));
291e1051a39Sopenharmony_ci
292e1051a39Sopenharmony_ci	&emms	();				# done with mmx bank
293e1051a39Sopenharmony_ci	&jmp	(&label("common_tail"));
294e1051a39Sopenharmony_ci
	# Target of the run-time capability check above; execution continues
	# with whatever is emitted after this Perl block.
295e1051a39Sopenharmony_ci&set_label("non_sse2",16);
296e1051a39Sopenharmony_ci}
297e1051a39Sopenharmony_ci
# Integer-only code path.  The "if (0)" arm is permanently disabled at
# generation time; it would emit a stub that restores esp, clears eax and
# leaves.  The "else" arm emits the general multiplication loops ("mull",
# "1stmadd", "2ndmadd") and a dedicated squaring routine ("bn_sqr_mont",
# "sqr", "3rdmadd", "sqradd", "sqrlast").  Both end by jumping to
# "common_tail".
298e1051a39Sopenharmony_ciif (0) {
299e1051a39Sopenharmony_ci	&mov	("esp",$_sp);
300e1051a39Sopenharmony_ci	&xor	("eax","eax");	# signal "not fast enough [yet]"
301e1051a39Sopenharmony_ci	&jmp	(&label("just_leave"));
302e1051a39Sopenharmony_ci	# While the below code provides competitive performance for
303e1051a39Sopenharmony_ci	# all key lengths on modern Intel cores, it's still more
304e1051a39Sopenharmony_ci	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
305e1051a39Sopenharmony_ci	# means compared to the original integer-only assembler.
306e1051a39Sopenharmony_ci	# 512-bit RSA sign is better by ~40%, but that's about all
307e1051a39Sopenharmony_ci	# one can say about all CPUs...
308e1051a39Sopenharmony_ci} else {
309e1051a39Sopenharmony_ci$inp="esi";	# integer path uses these registers differently
310e1051a39Sopenharmony_ci$word="edi";
311e1051a39Sopenharmony_ci$carry="ebp";
312e1051a39Sopenharmony_ci
	# Choose between multiplication and squaring.  $num holds num-1, so
	# $carry = (num & 1) | (ap - bp), which is zero only when num is even
	# and ap == bp.  The mov between the "or" and the "jz" leaves the
	# flags alone, so the jump tests the result of the "or".
	# eax = bp + 4 + 4*(num-1), i.e. the address just past bp[num-1].
313e1051a39Sopenharmony_ci	&mov	($inp,$_ap);
314e1051a39Sopenharmony_ci	&lea	($carry,&DWP(1,$num));
315e1051a39Sopenharmony_ci	&mov	($word,$_bp);
316e1051a39Sopenharmony_ci	&xor	($j,$j);				# j=0
317e1051a39Sopenharmony_ci	&mov	("edx",$inp);
318e1051a39Sopenharmony_ci	&and	($carry,1);				# see if num is even
319e1051a39Sopenharmony_ci	&sub	("edx",$word);				# see if ap==bp
320e1051a39Sopenharmony_ci	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
321e1051a39Sopenharmony_ci	&or	($carry,"edx");
322e1051a39Sopenharmony_ci	&mov	($word,&DWP(0,$word));			# bp[0]
323e1051a39Sopenharmony_ci	&jz	(&label("bn_sqr_mont"));
324e1051a39Sopenharmony_ci	&mov	($_bpend,"eax");
325e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,$inp));
326e1051a39Sopenharmony_ci	&xor	("edx","edx");
327e1051a39Sopenharmony_ci
	# First multiplication pass: tp[] = ap[] * bp[0].  The cmp sets the
	# flags for jl; the store in between does not disturb them.
328e1051a39Sopenharmony_ci&set_label("mull",16);
329e1051a39Sopenharmony_ci	&mov	($carry,"edx");
330e1051a39Sopenharmony_ci	&mul	($word);				# ap[j]*bp[0]
331e1051a39Sopenharmony_ci	&add	($carry,"eax");
332e1051a39Sopenharmony_ci	&lea	($j,&DWP(1,$j));
333e1051a39Sopenharmony_ci	&adc	("edx",0);
334e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
335e1051a39Sopenharmony_ci	&cmp	($j,$num);
336e1051a39Sopenharmony_ci	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
337e1051a39Sopenharmony_ci	&jl	(&label("mull"));
338e1051a39Sopenharmony_ci
	# Last word of the pass.  The lines with an extra leading space are
	# interleaved set-up for the reduction that follows: $word becomes
	# n0*tp[0] and $inp is switched from ap to np.
339e1051a39Sopenharmony_ci	&mov	($carry,"edx");
340e1051a39Sopenharmony_ci	&mul	($word);				# ap[num-1]*bp[0]
341e1051a39Sopenharmony_ci	 &mov	($word,$_n0);
342e1051a39Sopenharmony_ci	&add	("eax",$carry);
343e1051a39Sopenharmony_ci	 &mov	($inp,$_np);
344e1051a39Sopenharmony_ci	&adc	("edx",0);
345e1051a39Sopenharmony_ci	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
346e1051a39Sopenharmony_ci
347e1051a39Sopenharmony_ci	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
348e1051a39Sopenharmony_ci	&xor	($j,$j);
349e1051a39Sopenharmony_ci	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
350e1051a39Sopenharmony_ci	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
351e1051a39Sopenharmony_ci
	# Only the carry out of np[0]*m + tp[0] is needed: eax is reloaded
	# with np[1] between the add and the adc, and the mov keeps CF.
352e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,$inp));			# np[0]
353e1051a39Sopenharmony_ci	&mul	($word);				# np[0]*m
354e1051a39Sopenharmony_ci	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
355e1051a39Sopenharmony_ci	&mov	("eax",&DWP(4,$inp));			# np[1]
356e1051a39Sopenharmony_ci	&adc	("edx",0);
357e1051a39Sopenharmony_ci	&inc	($j);
358e1051a39Sopenharmony_ci
359e1051a39Sopenharmony_ci	&jmp	(&label("2ndmadd"));
360e1051a39Sopenharmony_ci
	# Multiply-accumulate pass for bp[i], i >= 1: tp[] += ap[] * bp[i].
361e1051a39Sopenharmony_ci&set_label("1stmadd",16);
362e1051a39Sopenharmony_ci	&mov	($carry,"edx");
363e1051a39Sopenharmony_ci	&mul	($word);				# ap[j]*bp[i]
364e1051a39Sopenharmony_ci	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
365e1051a39Sopenharmony_ci	&lea	($j,&DWP(1,$j));
366e1051a39Sopenharmony_ci	&adc	("edx",0);
367e1051a39Sopenharmony_ci	&add	($carry,"eax");
368e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
369e1051a39Sopenharmony_ci	&adc	("edx",0);
370e1051a39Sopenharmony_ci	&cmp	($j,$num);
371e1051a39Sopenharmony_ci	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
372e1051a39Sopenharmony_ci	&jl	(&label("1stmadd"));
373e1051a39Sopenharmony_ci
374e1051a39Sopenharmony_ci	&mov	($carry,"edx");
375e1051a39Sopenharmony_ci	&mul	($word);				# ap[num-1]*bp[i]
376e1051a39Sopenharmony_ci	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
377e1051a39Sopenharmony_ci	 &mov	($word,$_n0);
378e1051a39Sopenharmony_ci	&adc	("edx",0);
379e1051a39Sopenharmony_ci	 &mov	($inp,$_np);
380e1051a39Sopenharmony_ci	&add	($carry,"eax");
381e1051a39Sopenharmony_ci	&adc	("edx",0);
382e1051a39Sopenharmony_ci	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
383e1051a39Sopenharmony_ci
	# $j is cleared and then picks up the carry out of tp[num], which is
	# stored as tp[num+1].
384e1051a39Sopenharmony_ci	&xor	($j,$j);
385e1051a39Sopenharmony_ci	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
386e1051a39Sopenharmony_ci	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
387e1051a39Sopenharmony_ci	&adc	($j,0);
388e1051a39Sopenharmony_ci	 &mov	("eax",&DWP(0,$inp));			# np[0]
389e1051a39Sopenharmony_ci	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
390e1051a39Sopenharmony_ci	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
391e1051a39Sopenharmony_ci
392e1051a39Sopenharmony_ci	&mul	($word);				# np[0]*m
393e1051a39Sopenharmony_ci	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
394e1051a39Sopenharmony_ci	&mov	("eax",&DWP(4,$inp));			# np[1]
395e1051a39Sopenharmony_ci	&adc	("edx",0);
396e1051a39Sopenharmony_ci	&mov	($j,1);
397e1051a39Sopenharmony_ci
	# Reduction pass: tp[] += np[] * m, with each result stored one word
	# lower than the word it was computed from.
398e1051a39Sopenharmony_ci&set_label("2ndmadd",16);
399e1051a39Sopenharmony_ci	&mov	($carry,"edx");
400e1051a39Sopenharmony_ci	&mul	($word);				# np[j]*m
401e1051a39Sopenharmony_ci	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
402e1051a39Sopenharmony_ci	&lea	($j,&DWP(1,$j));
403e1051a39Sopenharmony_ci	&adc	("edx",0);
404e1051a39Sopenharmony_ci	&add	($carry,"eax");
405e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
406e1051a39Sopenharmony_ci	&adc	("edx",0);
407e1051a39Sopenharmony_ci	&cmp	($j,$num);
408e1051a39Sopenharmony_ci	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
409e1051a39Sopenharmony_ci	&jl	(&label("2ndmadd"));
410e1051a39Sopenharmony_ci
411e1051a39Sopenharmony_ci	&mov	($carry,"edx");
412e1051a39Sopenharmony_ci	&mul	($word);				# np[j]*m
413e1051a39Sopenharmony_ci	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
414e1051a39Sopenharmony_ci	&adc	("edx",0);
415e1051a39Sopenharmony_ci	&add	($carry,"eax");
416e1051a39Sopenharmony_ci	&adc	("edx",0);
417e1051a39Sopenharmony_ci	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
418e1051a39Sopenharmony_ci
	# Fold the top words and advance the bp pointer kept in the $_bp
	# slot; the outer loop ends when it reaches the saved end address.
	# The stores after the cmp do not alter the flags tested by je.
419e1051a39Sopenharmony_ci	&xor	("eax","eax");
420e1051a39Sopenharmony_ci	 &mov	($j,$_bp);				# &bp[i]
421e1051a39Sopenharmony_ci	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
422e1051a39Sopenharmony_ci	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
423e1051a39Sopenharmony_ci	 &lea	($j,&DWP(4,$j));
424e1051a39Sopenharmony_ci	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
425e1051a39Sopenharmony_ci	 &cmp	($j,$_bpend);
426e1051a39Sopenharmony_ci	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
427e1051a39Sopenharmony_ci	&je	(&label("common_tail"));
428e1051a39Sopenharmony_ci
	# Set up the next outer iteration and re-enter the multiply pass.
429e1051a39Sopenharmony_ci	&mov	($word,&DWP(0,$j));			# bp[i+1]
430e1051a39Sopenharmony_ci	&mov	($inp,$_ap);
431e1051a39Sopenharmony_ci	&mov	($_bp,$j);				# &bp[++i]
432e1051a39Sopenharmony_ci	&xor	($j,$j);
433e1051a39Sopenharmony_ci	&xor	("edx","edx");
434e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,$inp));
435e1051a39Sopenharmony_ci	&jmp	(&label("1stmadd"));
436e1051a39Sopenharmony_ci
	# Dedicated squaring path, entered only when ap == bp and num is
	# even.  $sbit is an alias for the $num register, so the $num value
	# is parked in the $_num slot first.  The $_bp slot is reused to hold
	# the outer index i instead of a pointer.
437e1051a39Sopenharmony_ci&set_label("bn_sqr_mont",16);
438e1051a39Sopenharmony_ci$sbit=$num;
439e1051a39Sopenharmony_ci	&mov	($_num,$num);
440e1051a39Sopenharmony_ci	&mov	($_bp,$j);				# i=0
441e1051a39Sopenharmony_ci
442e1051a39Sopenharmony_ci	&mov	("eax",$word);				# ap[0]
443e1051a39Sopenharmony_ci	&mul	($word);				# ap[0]*ap[0]
444e1051a39Sopenharmony_ci	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
445e1051a39Sopenharmony_ci	&mov	($sbit,"edx");
446e1051a39Sopenharmony_ci	&shr	("edx",1);
447e1051a39Sopenharmony_ci	&and	($sbit,1);
448e1051a39Sopenharmony_ci	&inc	($j);
	# Each product ap[j]*ap[0] is doubled while being stored: the lea
	# computes 2*eax + $sbit, and the bit shifted out of eax becomes the
	# next $sbit.
449e1051a39Sopenharmony_ci&set_label("sqr",16);
450e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
451e1051a39Sopenharmony_ci	&mov	($carry,"edx");
452e1051a39Sopenharmony_ci	&mul	($word);				# ap[j]*ap[0]
453e1051a39Sopenharmony_ci	&add	("eax",$carry);
454e1051a39Sopenharmony_ci	&lea	($j,&DWP(1,$j));
455e1051a39Sopenharmony_ci	&adc	("edx",0);
456e1051a39Sopenharmony_ci	&lea	($carry,&DWP(0,$sbit,"eax",2));
457e1051a39Sopenharmony_ci	&shr	("eax",31);
458e1051a39Sopenharmony_ci	&cmp	($j,$_num);
459e1051a39Sopenharmony_ci	&mov	($sbit,"eax");
460e1051a39Sopenharmony_ci	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
461e1051a39Sopenharmony_ci	&jl	(&label("sqr"));
462e1051a39Sopenharmony_ci
463e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
464e1051a39Sopenharmony_ci	&mov	($carry,"edx");
465e1051a39Sopenharmony_ci	&mul	($word);				# ap[num-1]*ap[0]
466e1051a39Sopenharmony_ci	&add	("eax",$carry);
467e1051a39Sopenharmony_ci	 &mov	($word,$_n0);
468e1051a39Sopenharmony_ci	&adc	("edx",0);
469e1051a39Sopenharmony_ci	 &mov	($inp,$_np);
470e1051a39Sopenharmony_ci	&lea	($carry,&DWP(0,$sbit,"eax",2));
471e1051a39Sopenharmony_ci	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
472e1051a39Sopenharmony_ci	&shr	("eax",31);
473e1051a39Sopenharmony_ci	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
474e1051a39Sopenharmony_ci
475e1051a39Sopenharmony_ci	&lea	($carry,&DWP(0,"eax","edx",2));
476e1051a39Sopenharmony_ci	 &mov	("eax",&DWP(0,$inp));			# np[0]
477e1051a39Sopenharmony_ci	&shr	("edx",31);
478e1051a39Sopenharmony_ci	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
479e1051a39Sopenharmony_ci	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
480e1051a39Sopenharmony_ci
	# $num (clobbered through $sbit above) is restored from $j here.
481e1051a39Sopenharmony_ci	&mul	($word);				# np[0]*m
482e1051a39Sopenharmony_ci	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
483e1051a39Sopenharmony_ci	&mov	($num,$j);
484e1051a39Sopenharmony_ci	&adc	("edx",0);
485e1051a39Sopenharmony_ci	&mov	("eax",&DWP(4,$inp));			# np[1]
486e1051a39Sopenharmony_ci	&mov	($j,1);
487e1051a39Sopenharmony_ci
	# Reduction pass of the squaring path.  It handles two words of np
	# per iteration ($j advances by 2).
488e1051a39Sopenharmony_ci&set_label("3rdmadd",16);
489e1051a39Sopenharmony_ci	&mov	($carry,"edx");
490e1051a39Sopenharmony_ci	&mul	($word);				# np[j]*m
491e1051a39Sopenharmony_ci	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
492e1051a39Sopenharmony_ci	&adc	("edx",0);
493e1051a39Sopenharmony_ci	&add	($carry,"eax");
494e1051a39Sopenharmony_ci	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
495e1051a39Sopenharmony_ci	&adc	("edx",0);
496e1051a39Sopenharmony_ci	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
497e1051a39Sopenharmony_ci
498e1051a39Sopenharmony_ci	&mov	($carry,"edx");
499e1051a39Sopenharmony_ci	&mul	($word);				# np[j+1]*m
500e1051a39Sopenharmony_ci	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
501e1051a39Sopenharmony_ci	&lea	($j,&DWP(2,$j));
502e1051a39Sopenharmony_ci	&adc	("edx",0);
503e1051a39Sopenharmony_ci	&add	($carry,"eax");
504e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
505e1051a39Sopenharmony_ci	&adc	("edx",0);
506e1051a39Sopenharmony_ci	&cmp	($j,$num);
507e1051a39Sopenharmony_ci	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
508e1051a39Sopenharmony_ci	&jl	(&label("3rdmadd"));
509e1051a39Sopenharmony_ci
510e1051a39Sopenharmony_ci	&mov	($carry,"edx");
511e1051a39Sopenharmony_ci	&mul	($word);				# np[j]*m
512e1051a39Sopenharmony_ci	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
513e1051a39Sopenharmony_ci	&adc	("edx",0);
514e1051a39Sopenharmony_ci	&add	($carry,"eax");
515e1051a39Sopenharmony_ci	&adc	("edx",0);
516e1051a39Sopenharmony_ci	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
517e1051a39Sopenharmony_ci
	# Fold the top words, then stop once the outer index i (kept in the
	# $_bp slot) equals $num.
518e1051a39Sopenharmony_ci	&mov	($j,$_bp);				# i
519e1051a39Sopenharmony_ci	&xor	("eax","eax");
520e1051a39Sopenharmony_ci	&mov	($inp,$_ap);
521e1051a39Sopenharmony_ci	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
522e1051a39Sopenharmony_ci	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
523e1051a39Sopenharmony_ci	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
524e1051a39Sopenharmony_ci	&cmp	($j,$num);
525e1051a39Sopenharmony_ci	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
526e1051a39Sopenharmony_ci	&je	(&label("common_tail"));
527e1051a39Sopenharmony_ci
	# Next outer iteration: the word at ap + 4 + 4*(old i) is loaded
	# before i is incremented, then squared and added into tp[i].  When
	# the incremented i already equals $num there are no cross products
	# left, so control skips to "sqrlast"; the lea between the cmp and
	# the je does not change the flags.
528e1051a39Sopenharmony_ci	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
529e1051a39Sopenharmony_ci	&lea	($j,&DWP(1,$j));
530e1051a39Sopenharmony_ci	&mov	("eax",$word);
531e1051a39Sopenharmony_ci	&mov	($_bp,$j);				# ++i
532e1051a39Sopenharmony_ci	&mul	($word);				# ap[i]*ap[i]
533e1051a39Sopenharmony_ci	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
534e1051a39Sopenharmony_ci	&adc	("edx",0);
535e1051a39Sopenharmony_ci	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
536e1051a39Sopenharmony_ci	&xor	($carry,$carry);
537e1051a39Sopenharmony_ci	&cmp	($j,$num);
538e1051a39Sopenharmony_ci	&lea	($j,&DWP(1,$j));
539e1051a39Sopenharmony_ci	&je	(&label("sqrlast"));
540e1051a39Sopenharmony_ci
541e1051a39Sopenharmony_ci	&mov	($sbit,"edx");				# zaps $num
542e1051a39Sopenharmony_ci	&shr	("edx",1);
543e1051a39Sopenharmony_ci	&and	($sbit,1);
	# Doubled cross products ap[j]*ap[i] are added into tp[j]; because
	# $num is zapped, the loop bound is read from the $_num slot.
544e1051a39Sopenharmony_ci&set_label("sqradd",16);
545e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
546e1051a39Sopenharmony_ci	&mov	($carry,"edx");
547e1051a39Sopenharmony_ci	&mul	($word);				# ap[j]*ap[i]
548e1051a39Sopenharmony_ci	&add	("eax",$carry);
549e1051a39Sopenharmony_ci	&lea	($carry,&DWP(0,"eax","eax"));
550e1051a39Sopenharmony_ci	&adc	("edx",0);
551e1051a39Sopenharmony_ci	&shr	("eax",31);
552e1051a39Sopenharmony_ci	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
553e1051a39Sopenharmony_ci	&lea	($j,&DWP(1,$j));
554e1051a39Sopenharmony_ci	&adc	("eax",0);
555e1051a39Sopenharmony_ci	&add	($carry,$sbit);
556e1051a39Sopenharmony_ci	&adc	("eax",0);
557e1051a39Sopenharmony_ci	&cmp	($j,$_num);
558e1051a39Sopenharmony_ci	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
559e1051a39Sopenharmony_ci	&mov	($sbit,"eax");
560e1051a39Sopenharmony_ci	&jle	(&label("sqradd"));
561e1051a39Sopenharmony_ci
562e1051a39Sopenharmony_ci	&mov	($carry,"edx");
563e1051a39Sopenharmony_ci	&add	("edx","edx");
564e1051a39Sopenharmony_ci	&shr	($carry,31);
565e1051a39Sopenharmony_ci	&add	("edx",$sbit);
566e1051a39Sopenharmony_ci	&adc	($carry,0);
	# Set up the reduction for this iteration and rejoin "3rdmadd";
	# $num is rebuilt as $j-1.
567e1051a39Sopenharmony_ci&set_label("sqrlast");
568e1051a39Sopenharmony_ci	&mov	($word,$_n0);
569e1051a39Sopenharmony_ci	&mov	($inp,$_np);
570e1051a39Sopenharmony_ci	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
571e1051a39Sopenharmony_ci
572e1051a39Sopenharmony_ci	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
573e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,$inp));			# np[0]
574e1051a39Sopenharmony_ci	&adc	($carry,0);
575e1051a39Sopenharmony_ci	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
576e1051a39Sopenharmony_ci	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
577e1051a39Sopenharmony_ci
578e1051a39Sopenharmony_ci	&mul	($word);				# np[0]*m
579e1051a39Sopenharmony_ci	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
580e1051a39Sopenharmony_ci	&lea	($num,&DWP(-1,$j));
581e1051a39Sopenharmony_ci	&adc	("edx",0);
582e1051a39Sopenharmony_ci	&mov	($j,1);
583e1051a39Sopenharmony_ci	&mov	("eax",&DWP(4,$inp));			# np[1]
584e1051a39Sopenharmony_ci
585e1051a39Sopenharmony_ci	&jmp	(&label("3rdmadd"));
586e1051a39Sopenharmony_ci}
587e1051a39Sopenharmony_ci
# Shared epilogue for the SSE2 and integer paths.  It computes rp = tp - np
# with borrow, then picks per word between tp and the subtracted value using
# AND/OR masks instead of a branch, overwriting tp[] as it goes.  Finally it
# restores the caller's stack pointer and returns with eax=1.
588e1051a39Sopenharmony_ci&set_label("common_tail",16);
589e1051a39Sopenharmony_ci	&mov	($np,$_np);			# load modulus pointer
590e1051a39Sopenharmony_ci	&mov	($rp,$_rp);			# load result pointer
591e1051a39Sopenharmony_ci	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
592e1051a39Sopenharmony_ci
593e1051a39Sopenharmony_ci	&mov	("eax",&DWP(0,$tp));		# tp[0]
594e1051a39Sopenharmony_ci	&mov	($j,$num);			# j=num-1
595e1051a39Sopenharmony_ci	&xor	($i,$i);			# i=0 and clear CF!
596e1051a39Sopenharmony_ci
	# CF carries the borrow from one sbb to the next; only flag-neutral
	# instructions (mov, lea) and dec, which leaves CF alone, sit in
	# between.  jge tests the sign of $j after the dec, so the loop body
	# runs for i = 0 .. num-1 and leaves eax = tp[num] and $j = -1.
597e1051a39Sopenharmony_ci&set_label("sub",16);
598e1051a39Sopenharmony_ci	&sbb	("eax",&DWP(0,$np,$i,4));
599e1051a39Sopenharmony_ci	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
600e1051a39Sopenharmony_ci	&dec	($j);				# doesn't affect CF!
601e1051a39Sopenharmony_ci	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
602e1051a39Sopenharmony_ci	&lea	($i,&DWP(1,$i));		# i++
603e1051a39Sopenharmony_ci	&jge	(&label("sub"));
604e1051a39Sopenharmony_ci
	# eax = tp[num] minus the final borrow, edx = its bitwise complement.
	# Both are used as AND masks below.
	# NOTE(review): this presumes eax ends up as either 0 or all ones -
	# confirm against the bound on tp[num].
605e1051a39Sopenharmony_ci	&sbb	("eax",0);			# handle upmost overflow bit
606e1051a39Sopenharmony_ci	&mov	("edx",-1);
607e1051a39Sopenharmony_ci	&xor	("edx","eax");
	# The jump targets the label that immediately follows; presumably it
	# is there to skip the padding of the 16-byte aligned label.
608e1051a39Sopenharmony_ci	&jmp	(&label("copy"));
609e1051a39Sopenharmony_ci
	# For k = num-1 down to 0: rp[k] = (tp[k] & eax) | (rp[k] & edx), and
	# tp[k] is overwritten with $j.  $tp and $np are used as scratch
	# registers here, which is safe because tp[] is addressed through esp.
610e1051a39Sopenharmony_ci&set_label("copy",16);				# conditional copy
611e1051a39Sopenharmony_ci	&mov	($tp,&DWP($frame,"esp",$num,4));
612e1051a39Sopenharmony_ci	&mov	($np,&DWP(0,$rp,$num,4));
613e1051a39Sopenharmony_ci	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
614e1051a39Sopenharmony_ci	&and	($tp,"eax");
615e1051a39Sopenharmony_ci	&and	($np,"edx");
616e1051a39Sopenharmony_ci	&or	($np,$tp);
617e1051a39Sopenharmony_ci	&mov	(&DWP(0,$rp,$num,4),$np);
618e1051a39Sopenharmony_ci	&dec	($num);
619e1051a39Sopenharmony_ci	&jge	(&label("copy"));
620e1051a39Sopenharmony_ci
621e1051a39Sopenharmony_ci	&mov	("esp",$_sp);		# pull saved stack pointer
622e1051a39Sopenharmony_ci	&mov	("eax",1);
	# Also the target of the early num < 4 exit, which arrives with eax=0
	# and with esp never having been moved.
623e1051a39Sopenharmony_ci&set_label("just_leave");
624e1051a39Sopenharmony_ci&function_end("bn_mul_mont");
625e1051a39Sopenharmony_ci
# Trailer: emit an identification string into the generated output, let
# x86asm.pl finish it, and close STDOUT, dying if the close reports an error.
626e1051a39Sopenharmony_ci&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
627e1051a39Sopenharmony_ci
628e1051a39Sopenharmony_ci&asm_finish();
629e1051a39Sopenharmony_ci
630e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";
631