#! /usr/bin/env perl
# Copyright 1995-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci
# Driver: locate the perlasm support code, redirect STDOUT to the requested
# output file, then emit every bignum routine in turn.

# Directory part of this script's path (including the trailing separator).
# The capture is only used when the match succeeded, so a stale $1 can never
# end up in @INC; a bare script name yields the empty string.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, when present and true, names the output
# file.  Three-argument open keeps characters in the name from being parsed
# as part of the mode, and a failure aborts the run instead of silently
# leaving the generated code on the original STDOUT.
$output = pop;
if ($output) {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

&asm_init($ARGV[0]);

# Emit the run-time dispatched SSE2 code paths only when the build passes
# -DOPENSSL_IA32_SSE2 on the command line.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Each generator takes the name of the routine it emits.
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# Buffered write errors only surface when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";
34e1051a39Sopenharmony_ci
# Emit the x86 routine $name (bn_mul_add_words).
#
# The generated routine takes four stack arguments - wparam(0)=r, wparam(1)=a,
# wparam(2)=num, wparam(3)=w - and performs r[i] += a[i]*w over num 32-bit
# words with carry propagation; the final carry word is returned in eax.
#
# When the file-level $sse2 flag is set, a run-time dispatched MMX/SSE2 path
# is emitted in front of the scalar one.
#
# All register-name variables are lexical so that nothing leaks into the
# other generators (and sort's special $a is left alone).  The emitter calls
# keep their "&" sigil on purpose: several of them (&and, &xor, &push, &pop,
# &sub, &dec) are named like Perl builtins.
sub bn_mul_add_words
	{
	my ($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	if ($sse2) {
		# Register roles on the SSE2 path.
		my ($rp,$ap,$num)=("eax","edx","ecx");

		# Run-time dispatch: bit 26 of the first capability word selects
		# this path; otherwise jump to the scalar code below.
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("maw_non_sse2"));

		&mov($rp,&wparam(0));
		&mov($ap,&wparam(1));
		&mov($num,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		&jmp(&label("maw_sse2_entry"));

		# Eight words per iteration.  The instruction order below is the
		# hand-scheduled order and is emitted exactly as listed.
	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$rp,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$ap,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$ap,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$ap,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$ap,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$rp,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$rp,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$rp,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$rp,"",0),"mm1");
		&movd("mm2",&DWP(16,$ap,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$ap,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$ap,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$rp,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$ap,"",0));	# mm3 = a[7]
		&add($ap,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$rp,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$rp,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$rp,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$rp,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$rp,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$rp,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$rp,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$rp,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$rp,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$rp,"",0),"mm1");
		&lea($rp,&DWP(32,$rp));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($num,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		&test($num,0xfffffff8);		# at least 8 words left?
		&jnz(&label("maw_sse2_unrolled"));

		# One word per iteration for the remaining (fewer than 8) words.
		# NOTE(review): this loop is entered without a zero check, so
		# num == 0 is not handled on this path - presumably callers
		# guarantee num > 0; confirm.
	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$ap));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$rp));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($ap,&DWP(4,$ap));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$rp),"mm1");	# r[i] = carry_low
		&sub($num,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($rp,&DWP(4,$rp));
		&jnz(&label("maw_sse2_loop"));	# flags still those of the sub
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register roles on the scalar path; eax/edx receive the mul result.
	my ($ap,$w,$rp,$carry,$num)=("ebx","ebp","edi","esi","ecx");

	# One scalar step at byte offset $off:
	#   edx:eax = a[off]*w + carry + r[off];  r[off] = eax;  carry = edx.
	# With $count_down set (tail rounds) the remaining-word counter is
	# decremented before the stores and the step ends with a jump to
	# maw_end once it reaches zero (mov leaves the flags of dec intact).
	my $emit_word = sub {
		my ($off,$count_down)=@_;

		&mov("eax",&DWP($off,$ap));	# *a
		&mul($w);			# *a * w
		&add("eax",$carry);		# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		&add("eax",&DWP($off,$rp));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		&dec($num) if ($count_down);
		&mov(&DWP($off,$rp),"eax");	# *r= L(t);
		&mov($carry,"edx");		# c=  H(t);
		&jz(&label("maw_end")) if ($count_down);
	};

	&xor($carry,$carry);		# clear carry
	&mov($rp,&wparam(0));		#

	&mov($num,&wparam(2));		#
	&mov($ap,&wparam(1));		#

	&and($num,0xfffffff8);		# num rounded down to a multiple of 8
	&mov($w,&wparam(3));		#

	# The pushed value is only removed again by the pop before the epilogue.
	# Neither the mov above nor this push changes the flags, so the jz
	# below still tests the result of the and.
	&push($num);			# Up the stack for a tmp variable

	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	for (my $off=0; $off<32; $off+=4)
		{
		&comment("Round $off");
		$emit_word->($off,0);
		}

	&comment("");
	&sub($num,8);
	&lea($ap,&DWP(32,$ap));		# lea keeps the flags of the sub
	&lea($rp,&DWP(32,$rp));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov($num,&wparam(2));		# get num
	&and($num,7);			# words left over (0..7)
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	&set_label("maw_finish2",1);
	# At most seven tail words; the last round needs no counter check.
	for my $i (0..6)
		{
		&comment("Tail Round $i");
		$emit_word->($i*4,$i != 6);
		}
	&set_label("maw_end",0);
	&mov("eax",$carry);		# return the final carry

	&pop($num);			# clear variable from

	&function_end($name);
	}
220e1051a39Sopenharmony_ci
# Emit the x86 routine $name (bn_mul_words).
#
# The generated routine takes four stack arguments - wparam(0)=r, wparam(1)=a,
# wparam(2)=num, wparam(3)=w - and performs r[i] = low32(a[i]*w + carry) over
# num 32-bit words, carrying the high half into the next word; the final
# carry word is returned in eax.
#
# When the file-level $sse2 flag is set, a run-time dispatched MMX/SSE2 loop
# is emitted in front of the scalar code.  Register-name variables are
# lexical so nothing leaks into the other generators; the "&" sigil on the
# emitter calls is required because several share names with Perl builtins.
sub bn_mul_words
	{
	my ($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	if ($sse2) {
		# Register roles on the SSE2 path.
		my ($rp,$ap,$num)=("eax","edx","ecx");

		# Run-time dispatch on bit 26 of the first capability word.
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("mw_non_sse2"));

		&mov($rp,&wparam(0));
		&mov($ap,&wparam(1));
		&mov($num,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0

		# NOTE(review): the loop is entered without a zero check, so
		# num == 0 is not handled on this path - presumably callers
		# guarantee num > 0; confirm.
	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$ap));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($ap,&DWP(4,$ap));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$rp),"mm1");	# r[i] = carry_low
		&sub($num,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($rp,&DWP(4,$rp));
		&jnz(&label("mw_sse2_loop"));	# flags still those of the sub

		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register roles on the scalar path; eax/edx receive the mul result.
	my ($ap,$w,$rp,$carry,$num)=("ebx","ecx","edi","esi","ebp");

	# One scalar step at byte offset $off:
	#   edx:eax = a[off]*w + carry;  r[off] = eax;  carry = edx.
	# With $count_down set (tail rounds) the step additionally decrements
	# the remaining-word counter and leaves for mw_end when it hits zero.
	my $emit_word = sub {
		my ($off,$count_down)=@_;

		&mov("eax",&DWP($off,$ap,"",0));	# *a
		&mul($w);				# *a * w
		&add("eax",$carry);			# L(t)+=c
		&adc("edx",0);				# H(t)+=carry
		&mov(&DWP($off,$rp,"",0),"eax");	# *r= L(t);
		&mov($carry,"edx");			# c=  H(t);
		if ($count_down)
			{
			&dec($num);
			&jz(&label("mw_end"));
			}
	};

	&xor($carry,$carry);		# clear carry
	&mov($rp,&wparam(0));		#
	&mov($ap,&wparam(1));		#
	&mov($num,&wparam(2));		#
	&mov($w,&wparam(3));		#

	&and($num,0xfffffff8);		# num rounded down to a multiple of 8
	&jz(&label("mw_finish"));

	&set_label("mw_loop",0);
	for (my $off=0; $off<32; $off+=4)
		{
		&comment("Round $off");
		$emit_word->($off,0);
		}

	&comment("");
	&add($ap,32);
	&add($rp,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));		# get num
	&and($num,7);			# words left over (0..7)
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	&set_label("mw_finish2",1);
	# At most seven tail words; the last round needs no counter check.
	for my $i (0..6)
		{
		&comment("Tail Round $i");
		$emit_word->($i*4,$i != 6);
		}
	&set_label("mw_end",0);
	&mov("eax",$carry);		# return the final carry

	&function_end($name);
	}
331e1051a39Sopenharmony_ci
# Emit the x86 routine $name (bn_sqr_words).
#
# The generated routine takes three stack arguments - wparam(0)=r,
# wparam(1)=a, wparam(2)=num - and stores the 64-bit square of each 32-bit
# word a[i] into the word pair r[2*i], r[2*i+1] (low half first).
#
# When the file-level $sse2 flag is set, a run-time dispatched MMX/SSE2 loop
# is emitted in front of the scalar code.  Register-name variables are
# lexical so nothing leaks into the other generators; the "&" sigil on the
# emitter calls is required because several share names with Perl builtins.
sub bn_sqr_words
	{
	my ($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	if ($sse2) {
		# Register roles on the SSE2 path.
		my ($rp,$ap,$num)=("eax","edx","ecx");

		# Run-time dispatch on bit 26 of the first capability word.
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("sqr_non_sse2"));

		&mov($rp,&wparam(0));
		&mov($ap,&wparam(1));
		&mov($num,&wparam(2));

		# NOTE(review): the loop is entered without a zero check, so
		# num == 0 is not handled on this path - presumably callers
		# guarantee num > 0; confirm.
	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$ap));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($ap,&DWP(4,$ap));		# a++
		&movq(&QWP(0,$rp),"mm0");	# r[i] = a[i]*a[i]
		&sub($num,1);
		&lea($rp,&DWP(8,$rp));		# r += 2
		&jnz(&label("sqr_sse2_loop"));	# flags still those of the sub

		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register roles on the scalar path; eax/edx receive the mul result.
	my ($rp,$ap,$num)=("esi","edi","ebx");

	# One scalar step for the input word at byte offset $off: the result
	# pair lands at byte offsets 2*$off and 2*$off+4 of r.  With
	# $count_down set (tail rounds) the remaining-word counter is
	# decremented between the two stores and the step ends with a jump to
	# sw_end once it reaches zero (mov leaves the flags of dec intact).
	my $emit_word = sub {
		my ($off,$count_down)=@_;

		&mov("eax",&DWP($off,$ap,"",0));	# *a
		&mul("eax");				# *a * *a
		&mov(&DWP($off*2,$rp,"",0),"eax");	# low half
		&dec($num) if ($count_down);
		&mov(&DWP($off*2+4,$rp,"",0),"edx");	# high half
		&jz(&label("sw_end")) if ($count_down);
	};

	&mov($rp,&wparam(0));		#
	&mov($ap,&wparam(1));		#
	&mov($num,&wparam(2));		#

	&and($num,0xfffffff8);		# num rounded down to a multiple of 8
	&jz(&label("sw_finish"));

	&set_label("sw_loop",0);
	for (my $off=0; $off<32; $off+=4)
		{
		&comment("Round $off");
		$emit_word->($off,0);
		}

	&comment("");
	&add($ap,32);
	&add($rp,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));		# get num
	&and($num,7);			# words left over (0..7)
	&jz(&label("sw_end"));

	# At most seven tail words; the last round needs no counter check.
	for my $i (0..6)
		{
		&comment("Tail Round $i");
		$emit_word->($i*4,$i != 6);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}
420e1051a39Sopenharmony_ci
# Emit the x86 routine $name (bn_div_words).
#
# The generated routine loads wparam(0) into edx and wparam(1) into eax,
# then divides the 64-bit value edx:eax by wparam(2) with a single "div".
# The quotient is left in eax as the return value; no registers are saved
# because only eax, ecx and edx are touched.
#
# Because "div" faults when the quotient does not fit in 32 bits, callers
# of the generated routine must keep wparam(0) smaller than wparam(2).
#
# $name is a lexical here, so the emitters called below cannot see or
# depend on a dynamically scoped global of that name.
sub bn_div_words
	{
	my ($name)=@_;

	&function_begin_B($name,"");
	&mov("edx",&wparam(0));		# high word of the dividend
	&mov("eax",&wparam(1));		# low word of the dividend
	&mov("ecx",&wparam(2));		# divisor
	&div("ecx");
	&ret();
	&function_end_B($name);
	}
433e1051a39Sopenharmony_ci
# Emit the x86 routine $name (bn_add_words).
#
# The generated routine takes four stack arguments - wparam(0)=r, wparam(1)=a,
# wparam(2)=b, wparam(3)=num - and performs r[i] = a[i] + b[i] + carry over
# num 32-bit words.  The carry out of the last word is left in eax, which is
# why no explicit return-value move is emitted before the epilogue.
#
# Register-name variables are lexical so nothing leaks into the other
# generators and sort's special $a/$b are left alone; the "&" sigil on the
# emitter calls is required because several share names with Perl builtins.
sub bn_add_words
	{
	my ($name)=@_;

	&function_begin($name,"");

	&comment("");
	my ($ap,$bp,$carry,$rp)=("esi","edi","eax","ebx");
	my ($tmp1,$tmp2,$num)=("ecx","edx","ebp");

	# One step at byte offset $off: r[off] = a[off] + carry + b[off].
	# The carry register is reloaded with "mov ...,0" rather than xor
	# because mov leaves CF from the preceding add untouched, so the adc
	# that follows captures it; the second adc folds in the carry of the
	# second add.  With $count_down set (tail rounds) the remaining-word
	# counter is decremented before the store and the step ends with a
	# jump to aw_end once it reaches zero.
	my $emit_word = sub {
		my ($off,$count_down)=@_;

		&mov($tmp1,&DWP($off,$ap,"",0));	# *a
		&mov($tmp2,&DWP($off,$bp,"",0));	# *b
		&add($tmp1,$carry);
		&mov($carry,0);
		&adc($carry,$carry);
		&add($tmp1,$tmp2);
		&adc($carry,0);
		&dec($num) if ($count_down);
		&mov(&DWP($off,$rp,"",0),$tmp1);	# *r
		&jz(&label("aw_end")) if ($count_down);
	};

	&mov($rp,&wparam(0));		# get r
	&mov($ap,&wparam(1));		# get a
	&mov($bp,&wparam(2));		# get b
	&mov($num,&wparam(3));		# get num
	&xor($carry,$carry);		# clear carry
	&and($num,0xfffffff8);		# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for my $i (0..7)
		{
		&comment("Round $i");
		$emit_word->($i*4,0);
		}

	&comment("");
	&add($ap,32);
	&add($bp,32);
	&add($rp,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));		# get num
	&and($num,7);			# words left over (0..7)
	&jz(&label("aw_end"));

	# At most seven tail words; the last round needs no counter check.
	for my $i (0..6)
		{
		&comment("Tail Round $i");
		$emit_word->($i*4,$i != 6);
		}
	&set_label("aw_end",0);

	# The carry already lives in eax, the return register.

	&function_end($name);
	}
505e1051a39Sopenharmony_ci
# Emit the x86 routine $name (bn_sub_words).
#
# The generated routine takes four stack arguments - wparam(0)=r, wparam(1)=a,
# wparam(2)=b, wparam(3)=num - and performs r[i] = a[i] - b[i] - borrow over
# num 32-bit words.  The borrow out of the last word is left in eax, which
# is why no explicit return-value move is emitted before the epilogue.
#
# The label names deliberately keep the "aw_" prefix of the original: they
# are passed verbatim to the label emitters.  Register-name variables are
# lexical so nothing leaks into the other generators and sort's special
# $a/$b are left alone; the "&" sigil on the emitter calls is required
# because several share names with Perl builtins.
sub bn_sub_words
	{
	my ($name)=@_;

	&function_begin($name,"");

	&comment("");
	my ($ap,$bp,$borrow,$rp)=("esi","edi","eax","ebx");
	my ($tmp1,$tmp2,$num)=("ecx","edx","ebp");

	# One step at byte offset $off: r[off] = a[off] - borrow - b[off].
	# The borrow register is reloaded with "mov ...,0" rather than xor
	# because mov leaves CF from the preceding sub untouched, so the adc
	# that follows captures it; the second adc folds in the borrow of the
	# second sub.  With $count_down set (tail rounds) the remaining-word
	# counter is decremented before the store and the step ends with a
	# jump to aw_end once it reaches zero.
	my $emit_word = sub {
		my ($off,$count_down)=@_;

		&mov($tmp1,&DWP($off,$ap,"",0));	# *a
		&mov($tmp2,&DWP($off,$bp,"",0));	# *b
		&sub($tmp1,$borrow);
		&mov($borrow,0);
		&adc($borrow,$borrow);
		&sub($tmp1,$tmp2);
		&adc($borrow,0);
		&dec($num) if ($count_down);
		&mov(&DWP($off,$rp,"",0),$tmp1);	# *r
		&jz(&label("aw_end")) if ($count_down);
	};

	&mov($rp,&wparam(0));		# get r
	&mov($ap,&wparam(1));		# get a
	&mov($bp,&wparam(2));		# get b
	&mov($num,&wparam(3));		# get num
	&xor($borrow,$borrow);		# clear carry
	&and($num,0xfffffff8);		# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for my $i (0..7)
		{
		&comment("Round $i");
		$emit_word->($i*4,0);
		}

	&comment("");
	&add($ap,32);
	&add($bp,32);
	&add($rp,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));		# get num
	&and($num,7);			# words left over (0..7)
	&jz(&label("aw_end"));

	# At most seven tail words; the last round needs no counter check.
	for my $i (0..6)
		{
		&comment("Tail Round $i");
		$emit_word->($i*4,$i != 6);
		}
	&set_label("aw_end",0);

	# The borrow already lives in eax, the return register.

	&function_end($name);
	}
577e1051a39Sopenharmony_ci
578e1051a39Sopenharmony_cisub bn_sub_part_words
579e1051a39Sopenharmony_ci	{
580e1051a39Sopenharmony_ci	local($name)=@_;
581e1051a39Sopenharmony_ci
582e1051a39Sopenharmony_ci	&function_begin($name,"");
583e1051a39Sopenharmony_ci
584e1051a39Sopenharmony_ci	&comment("");
585e1051a39Sopenharmony_ci	$a="esi";
586e1051a39Sopenharmony_ci	$b="edi";
587e1051a39Sopenharmony_ci	$c="eax";
588e1051a39Sopenharmony_ci	$r="ebx";
589e1051a39Sopenharmony_ci	$tmp1="ecx";
590e1051a39Sopenharmony_ci	$tmp2="edx";
591e1051a39Sopenharmony_ci	$num="ebp";
592e1051a39Sopenharmony_ci
593e1051a39Sopenharmony_ci	&mov($r,&wparam(0));	# get r
594e1051a39Sopenharmony_ci	 &mov($a,&wparam(1));	# get a
595e1051a39Sopenharmony_ci	&mov($b,&wparam(2));	# get b
596e1051a39Sopenharmony_ci	 &mov($num,&wparam(3));	# get num
597e1051a39Sopenharmony_ci	&xor($c,$c);		# clear carry
598e1051a39Sopenharmony_ci	 &and($num,0xfffffff8);	# num / 8
599e1051a39Sopenharmony_ci
600e1051a39Sopenharmony_ci	&jz(&label("aw_finish"));
601e1051a39Sopenharmony_ci
602e1051a39Sopenharmony_ci	&set_label("aw_loop",0);
603e1051a39Sopenharmony_ci	for ($i=0; $i<8; $i++)
604e1051a39Sopenharmony_ci		{
605e1051a39Sopenharmony_ci		&comment("Round $i");
606e1051a39Sopenharmony_ci
607e1051a39Sopenharmony_ci		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
608e1051a39Sopenharmony_ci		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
609e1051a39Sopenharmony_ci		&sub($tmp1,$c);
610e1051a39Sopenharmony_ci		 &mov($c,0);
611e1051a39Sopenharmony_ci		&adc($c,$c);
612e1051a39Sopenharmony_ci		 &sub($tmp1,$tmp2);
613e1051a39Sopenharmony_ci		&adc($c,0);
614e1051a39Sopenharmony_ci		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
615e1051a39Sopenharmony_ci		}
616e1051a39Sopenharmony_ci
617e1051a39Sopenharmony_ci	&comment("");
618e1051a39Sopenharmony_ci	&add($a,32);
619e1051a39Sopenharmony_ci	 &add($b,32);
620e1051a39Sopenharmony_ci	&add($r,32);
621e1051a39Sopenharmony_ci	 &sub($num,8);
622e1051a39Sopenharmony_ci	&jnz(&label("aw_loop"));
623e1051a39Sopenharmony_ci
624e1051a39Sopenharmony_ci	&set_label("aw_finish",0);
625e1051a39Sopenharmony_ci	&mov($num,&wparam(3));	# get num
626e1051a39Sopenharmony_ci	&and($num,7);
627e1051a39Sopenharmony_ci	 &jz(&label("aw_end"));
628e1051a39Sopenharmony_ci
629e1051a39Sopenharmony_ci	for ($i=0; $i<7; $i++)
630e1051a39Sopenharmony_ci		{
631e1051a39Sopenharmony_ci		&comment("Tail Round $i");
632e1051a39Sopenharmony_ci		&mov($tmp1,&DWP(0,$a,"",0));	# *a
633e1051a39Sopenharmony_ci		 &mov($tmp2,&DWP(0,$b,"",0));# *b
634e1051a39Sopenharmony_ci		&sub($tmp1,$c);
635e1051a39Sopenharmony_ci		 &mov($c,0);
636e1051a39Sopenharmony_ci		&adc($c,$c);
637e1051a39Sopenharmony_ci		 &sub($tmp1,$tmp2);
638e1051a39Sopenharmony_ci		&adc($c,0);
639e1051a39Sopenharmony_ci		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
640e1051a39Sopenharmony_ci		&add($a, 4);
641e1051a39Sopenharmony_ci		&add($b, 4);
642e1051a39Sopenharmony_ci		&add($r, 4);
643e1051a39Sopenharmony_ci		 &dec($num) if ($i != 6);
644e1051a39Sopenharmony_ci		 &jz(&label("aw_end")) if ($i != 6);
645e1051a39Sopenharmony_ci		}
646e1051a39Sopenharmony_ci	&set_label("aw_end",0);
647e1051a39Sopenharmony_ci
648e1051a39Sopenharmony_ci	&cmp(&wparam(4),0);
649e1051a39Sopenharmony_ci	&je(&label("pw_end"));
650e1051a39Sopenharmony_ci
651e1051a39Sopenharmony_ci	&mov($num,&wparam(4));	# get dl
652e1051a39Sopenharmony_ci	&cmp($num,0);
653e1051a39Sopenharmony_ci	&je(&label("pw_end"));
654e1051a39Sopenharmony_ci	&jge(&label("pw_pos"));
655e1051a39Sopenharmony_ci
656e1051a39Sopenharmony_ci	&comment("pw_neg");
657e1051a39Sopenharmony_ci	&mov($tmp2,0);
658e1051a39Sopenharmony_ci	&sub($tmp2,$num);
659e1051a39Sopenharmony_ci	&mov($num,$tmp2);
660e1051a39Sopenharmony_ci	&and($num,0xfffffff8);	# num / 8
661e1051a39Sopenharmony_ci	&jz(&label("pw_neg_finish"));
662e1051a39Sopenharmony_ci
663e1051a39Sopenharmony_ci	&set_label("pw_neg_loop",0);
664e1051a39Sopenharmony_ci	for ($i=0; $i<8; $i++)
665e1051a39Sopenharmony_ci	{
666e1051a39Sopenharmony_ci	    &comment("dl<0 Round $i");
667e1051a39Sopenharmony_ci
668e1051a39Sopenharmony_ci	    &mov($tmp1,0);
669e1051a39Sopenharmony_ci	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
670e1051a39Sopenharmony_ci	    &sub($tmp1,$c);
671e1051a39Sopenharmony_ci	    &mov($c,0);
672e1051a39Sopenharmony_ci	    &adc($c,$c);
673e1051a39Sopenharmony_ci	    &sub($tmp1,$tmp2);
674e1051a39Sopenharmony_ci	    &adc($c,0);
675e1051a39Sopenharmony_ci	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
676e1051a39Sopenharmony_ci	}
677e1051a39Sopenharmony_ci
678e1051a39Sopenharmony_ci	&comment("");
679e1051a39Sopenharmony_ci	&add($b,32);
680e1051a39Sopenharmony_ci	&add($r,32);
681e1051a39Sopenharmony_ci	&sub($num,8);
682e1051a39Sopenharmony_ci	&jnz(&label("pw_neg_loop"));
683e1051a39Sopenharmony_ci
684e1051a39Sopenharmony_ci	&set_label("pw_neg_finish",0);
685e1051a39Sopenharmony_ci	&mov($tmp2,&wparam(4));	# get dl
686e1051a39Sopenharmony_ci	&mov($num,0);
687e1051a39Sopenharmony_ci	&sub($num,$tmp2);
688e1051a39Sopenharmony_ci	&and($num,7);
689e1051a39Sopenharmony_ci	&jz(&label("pw_end"));
690e1051a39Sopenharmony_ci
691e1051a39Sopenharmony_ci	for ($i=0; $i<7; $i++)
692e1051a39Sopenharmony_ci	{
693e1051a39Sopenharmony_ci	    &comment("dl<0 Tail Round $i");
694e1051a39Sopenharmony_ci	    &mov($tmp1,0);
695e1051a39Sopenharmony_ci	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
696e1051a39Sopenharmony_ci	    &sub($tmp1,$c);
697e1051a39Sopenharmony_ci	    &mov($c,0);
698e1051a39Sopenharmony_ci	    &adc($c,$c);
699e1051a39Sopenharmony_ci	    &sub($tmp1,$tmp2);
700e1051a39Sopenharmony_ci	    &adc($c,0);
701e1051a39Sopenharmony_ci	    &dec($num) if ($i != 6);
702e1051a39Sopenharmony_ci	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
703e1051a39Sopenharmony_ci	    &jz(&label("pw_end")) if ($i != 6);
704e1051a39Sopenharmony_ci	}
705e1051a39Sopenharmony_ci
706e1051a39Sopenharmony_ci	&jmp(&label("pw_end"));
707e1051a39Sopenharmony_ci
708e1051a39Sopenharmony_ci	&set_label("pw_pos",0);
709e1051a39Sopenharmony_ci
710e1051a39Sopenharmony_ci	&and($num,0xfffffff8);	# num / 8
711e1051a39Sopenharmony_ci	&jz(&label("pw_pos_finish"));
712e1051a39Sopenharmony_ci
713e1051a39Sopenharmony_ci	&set_label("pw_pos_loop",0);
714e1051a39Sopenharmony_ci
715e1051a39Sopenharmony_ci	for ($i=0; $i<8; $i++)
716e1051a39Sopenharmony_ci	{
717e1051a39Sopenharmony_ci	    &comment("dl>0 Round $i");
718e1051a39Sopenharmony_ci
719e1051a39Sopenharmony_ci	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
720e1051a39Sopenharmony_ci	    &sub($tmp1,$c);
721e1051a39Sopenharmony_ci	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
722e1051a39Sopenharmony_ci	    &jnc(&label("pw_nc".$i));
723e1051a39Sopenharmony_ci	}
724e1051a39Sopenharmony_ci
725e1051a39Sopenharmony_ci	&comment("");
726e1051a39Sopenharmony_ci	&add($a,32);
727e1051a39Sopenharmony_ci	&add($r,32);
728e1051a39Sopenharmony_ci	&sub($num,8);
729e1051a39Sopenharmony_ci	&jnz(&label("pw_pos_loop"));
730e1051a39Sopenharmony_ci
731e1051a39Sopenharmony_ci	&set_label("pw_pos_finish",0);
732e1051a39Sopenharmony_ci	&mov($num,&wparam(4));	# get dl
733e1051a39Sopenharmony_ci	&and($num,7);
734e1051a39Sopenharmony_ci	&jz(&label("pw_end"));
735e1051a39Sopenharmony_ci
736e1051a39Sopenharmony_ci	for ($i=0; $i<7; $i++)
737e1051a39Sopenharmony_ci	{
738e1051a39Sopenharmony_ci	    &comment("dl>0 Tail Round $i");
739e1051a39Sopenharmony_ci	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
740e1051a39Sopenharmony_ci	    &sub($tmp1,$c);
741e1051a39Sopenharmony_ci	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
742e1051a39Sopenharmony_ci	    &jnc(&label("pw_tail_nc".$i));
743e1051a39Sopenharmony_ci	    &dec($num) if ($i != 6);
744e1051a39Sopenharmony_ci	    &jz(&label("pw_end")) if ($i != 6);
745e1051a39Sopenharmony_ci	}
746e1051a39Sopenharmony_ci	&mov($c,1);
747e1051a39Sopenharmony_ci	&jmp(&label("pw_end"));
748e1051a39Sopenharmony_ci
749e1051a39Sopenharmony_ci	&set_label("pw_nc_loop",0);
750e1051a39Sopenharmony_ci	for ($i=0; $i<8; $i++)
751e1051a39Sopenharmony_ci	{
752e1051a39Sopenharmony_ci	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
753e1051a39Sopenharmony_ci	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
754e1051a39Sopenharmony_ci	    &set_label("pw_nc".$i,0);
755e1051a39Sopenharmony_ci	}
756e1051a39Sopenharmony_ci
757e1051a39Sopenharmony_ci	&comment("");
758e1051a39Sopenharmony_ci	&add($a,32);
759e1051a39Sopenharmony_ci	&add($r,32);
760e1051a39Sopenharmony_ci	&sub($num,8);
761e1051a39Sopenharmony_ci	&jnz(&label("pw_nc_loop"));
762e1051a39Sopenharmony_ci
763e1051a39Sopenharmony_ci	&mov($num,&wparam(4));	# get dl
764e1051a39Sopenharmony_ci	&and($num,7);
765e1051a39Sopenharmony_ci	&jz(&label("pw_nc_end"));
766e1051a39Sopenharmony_ci
767e1051a39Sopenharmony_ci	for ($i=0; $i<7; $i++)
768e1051a39Sopenharmony_ci	{
769e1051a39Sopenharmony_ci	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
770e1051a39Sopenharmony_ci	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
771e1051a39Sopenharmony_ci	    &set_label("pw_tail_nc".$i,0);
772e1051a39Sopenharmony_ci	    &dec($num) if ($i != 6);
773e1051a39Sopenharmony_ci	    &jz(&label("pw_nc_end")) if ($i != 6);
774e1051a39Sopenharmony_ci	}
775e1051a39Sopenharmony_ci
776e1051a39Sopenharmony_ci	&set_label("pw_nc_end",0);
777e1051a39Sopenharmony_ci	&mov($c,0);
778e1051a39Sopenharmony_ci
779e1051a39Sopenharmony_ci	&set_label("pw_end",0);
780e1051a39Sopenharmony_ci
781e1051a39Sopenharmony_ci#	&mov("eax",$c);		# $c is "eax"
782e1051a39Sopenharmony_ci
783e1051a39Sopenharmony_ci	&function_end($name);
784e1051a39Sopenharmony_ci	}
785