#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2015
#
# ChaCha20 for x86.
#
# Performance in cycles per byte out of large buffer.
#
#		1xIALU/gcc	4xSSSE3
# Pentium	17.5/+80%
# PIII		14.2/+60%
# P4		18.6/+84%
# Core2		9.56/+89%	4.83
# Westmere	9.50/+45%	3.35
# Sandy Bridge	10.5/+47%	3.20
# Haswell	8.15/+50%	2.83
# Skylake	7.53/+22%	2.75
# Silvermont	17.4/+36%	8.35
# Goldmont	13.4/+40%	4.36
# Sledgehammer	10.2/+54%
# Bulldozer	13.4/+50%	4.38(*)
#
# (*)	Bulldozer actually executes the 4xXOP code path, which delivers 3.55.

# Locate the directory this script lives in so that x86asm.pl can be found
# either next to it or in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  The two-argument
# open mode is kept on purpose so that existing invocations behave exactly
# as before; the only change is that a failed open is now fatal instead of
# silently discarding all generated assembly.
$output = pop;
if ($output) {
	open STDOUT,">$output" or die "can't open $output: $!";
}

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# $xmm enables the SIMD code paths at all; $ymm additionally enables the
# paths that need an AVX-capable assembler (probed below).
$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

# GNU assembler, queried through the compiler driver.  $gasver is also
# consulted later in the file.
$ymm=1 if ($xmm &&
		`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
			=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
		($gasver=$1)>=2.19);	# first version supporting AVX

# NASM, only for the win32n flavour.
$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
		`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
		$1>=2.03);	# first version supporting AVX

# MASM (ml), only for the win32 flavour.
$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
		`ml 2>&1` =~ /Version ([0-9]+)\./ &&
		$1>=10);	# first version supporting AVX

# clang/LLVM integrated assembler.
$ymm=1 if ($xmm && !$ymm &&
		`$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ &&
		$2>=3.0);	# first version supporting AVX

# Register names used by the integer-only code path.  b, c and d each have
# an alternate register ("_" suffix); QUARTERROUND swaps each pair after
# every invocation.
$a="eax";
($b,$b_)=("ebx","ebp");
($c,$c_)=("ecx","esi");
($d,$d_)=("edx","edi");

# QUARTERROUND($ai,$bi,$ci,$di,$i)
#
# Emit the integer-only instruction sequence for one ChaCha quarter-round
# (rotations by 16, 12, 8 and 7) on state words $ai,$bi,$ci,$di.
#
#   $ai,$bi,$ci,$di  indices (0..15) of the state words, which live in
#                    4-byte stack slots at 4*index(%esp);
#   $i               position (0..7) of this quarter-round inside the
#                    double round; it selects which neighbouring words
#                    are stored and preloaded.
#
# The current values are expected in $a,$b,$c,$d.  While they are being
# processed, results of the previous quarter-round are written back from
# the alternate registers and the inputs of the next one are loaded into
# them; the register name pairs are swapped at the end so that the next
# invocation finds its operands under the usual names.  The leading
# "add a,b" is not emitted here: the caller emits it before the first
# invocation and each invocation with $i<7 emits it for its successor.
# For $i==7 the loop counter is reloaded from 128(%esp) instead.
#
# The instruction order is deliberate interleaving; do not reorder.
sub QUARTERROUND {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	# At the boundaries between the even and the odd round the
	# previous/next indices follow the diagonal pattern in the table
	# above rather than a plain +/-1 step within the row.
	if ($i==0) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&add	($a,$b);			# see elsewhere
	&xor	($d,$a);
	 &mov	(&DWP(4*$cp,"esp"),$c_)		if ($ai>0 && $ai<3);
	&rol	($d,16);
	 &mov	(&DWP(4*$bp,"esp"),$b_)		if ($i!=0);
	&add	($c,$d);
	 &mov	($c_,&DWP(4*$cn,"esp"))		if ($ai>0 && $ai<3);
	&xor	($b,$c);
	 &mov	($d_,&DWP(4*$dn,"esp"))		if ($di!=$dn);
	&rol	($b,12);
	 &mov	($b_,&DWP(4*$bn,"esp"))		if ($i<7);
	 &mov	($b_,&DWP(128,"esp"))		if ($i==7);	# loop counter
	&add	($a,$b);
	&xor	($d,$a);
	&mov	(&DWP(4*$ai,"esp"),$a);
	&rol	($d,8);
	&mov	($a,&DWP(4*$an,"esp"));
	&add	($c,$d);
	&mov	(&DWP(4*$di,"esp"),$d)		if ($di!=$dn);
	&mov	($d_,$d)			if ($di==$dn);
	&xor	($b,$c);
	 &add	($a,$b_)			if ($i<7);	# elsewhere
	&rol	($b,7);

	# Rotate the register aliases for the next invocation.
	($b,$b_)=($b_,$b);
	($c,$c_)=($c_,$c);
	($d,$d_)=($d_,$d);
}

&static_label("ssse3_shortcut");
&static_label("xop_shortcut");
&static_label("ssse3_data");
&static_label("pic_point");

# ChaCha20_ctr32: integer-only implementation and SIMD dispatcher.
#
# Arguments as used below: wparam(0) output pointer, wparam(1) input
# pointer, wparam(2) length in bytes, wparam(3) key, wparam(4) counter
# and nonce.
#
# Stack frame after stack_push(33) (presumably 33 32-bit words - the
# offsets used below reach 128):
#     0.. 63(%esp)  working state, word n at 4*n
#    64..127(%esp)  copy of the input block; only words 4..15 (key,
#                   counter, nonce) are kept there, words 0..3 are the
#                   sigma constants and are supplied as immediates
#       128(%esp)   round loop counter
&function_begin("ChaCha20_ctr32");
	&xor	("eax","eax");
	&cmp	("eax",&wparam(2));		# len==0?
	&je	(&label("no_data"));
if ($xmm) {
	# Check CPU capabilities and branch to the SSSE3 code when they
	# are present; eax keeps the address of pic_point for that path.
	&call	(&label("pic_point"));
&set_label("pic_point");
	&blindpop("eax");
	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
	&test	(&DWP(0,"ebp"),1<<24);		# test FXSR bit
	&jz	(&label("x86"));
	&test	(&DWP(4,"ebp"),1<<9);		# test SSSE3 bit
	&jz	(&label("x86"));
	&jmp	(&label("ssse3_shortcut"));
&set_label("x86");
}
	&mov	("esi",&wparam(3));		# key
	&mov	("edi",&wparam(4));		# counter and nonce

	&stack_push(33);

	&mov	("eax",&DWP(4*0,"esi"));	# copy key
	&mov	("ebx",&DWP(4*1,"esi"));
	&mov	("ecx",&DWP(4*2,"esi"));
	&mov	("edx",&DWP(4*3,"esi"));
	&mov	(&DWP(64+4*4,"esp"),"eax");
	&mov	(&DWP(64+4*5,"esp"),"ebx");
	&mov	(&DWP(64+4*6,"esp"),"ecx");
	&mov	(&DWP(64+4*7,"esp"),"edx");
	&mov	("eax",&DWP(4*4,"esi"));
	&mov	("ebx",&DWP(4*5,"esi"));
	&mov	("ecx",&DWP(4*6,"esi"));
	&mov	("edx",&DWP(4*7,"esi"));
	&mov	(&DWP(64+4*8,"esp"),"eax");
	&mov	(&DWP(64+4*9,"esp"),"ebx");
	&mov	(&DWP(64+4*10,"esp"),"ecx");
	&mov	(&DWP(64+4*11,"esp"),"edx");
	&mov	("eax",&DWP(4*0,"edi"));	# copy counter and nonce
	&mov	("ebx",&DWP(4*1,"edi"));
	&mov	("ecx",&DWP(4*2,"edi"));
	&mov	("edx",&DWP(4*3,"edi"));
	&sub	("eax",1);			# compensates the increment at "entry"
	&mov	(&DWP(64+4*12,"esp"),"eax");
	&mov	(&DWP(64+4*13,"esp"),"ebx");
	&mov	(&DWP(64+4*14,"esp"),"ecx");
	&mov	(&DWP(64+4*15,"esp"),"edx");
	&jmp	(&label("entry"));

&set_label("outer_loop",16);
	&mov	(&wparam(1),$b);		# save input
	&mov	(&wparam(0),$a);		# save output
	&mov	(&wparam(2),$c);		# save len
&set_label("entry");
	# Set up the working state for one 64-byte block: word 0 stays
	# in $a, the rest goes to the stack or to the registers the
	# first quarter-round expects.
	&mov	($a,0x61707865);
	&mov	(&DWP(4*1,"esp"),0x3320646e);
	&mov	(&DWP(4*2,"esp"),0x79622d32);
	&mov	(&DWP(4*3,"esp"),0x6b206574);

	&mov	($b, &DWP(64+4*5,"esp"));	# copy key material
	&mov	($b_,&DWP(64+4*6,"esp"));
	&mov	($c, &DWP(64+4*10,"esp"));
	&mov	($c_,&DWP(64+4*11,"esp"));
	&mov	($d, &DWP(64+4*13,"esp"));
	&mov	($d_,&DWP(64+4*14,"esp"));
	&mov	(&DWP(4*5,"esp"),$b);
	&mov	(&DWP(4*6,"esp"),$b_);
	&mov	(&DWP(4*10,"esp"),$c);
	&mov	(&DWP(4*11,"esp"),$c_);
	&mov	(&DWP(4*13,"esp"),$d);
	&mov	(&DWP(4*14,"esp"),$d_);

	&mov	($b, &DWP(64+4*7,"esp"));
	&mov	($d_,&DWP(64+4*15,"esp"));
	&mov	($d, &DWP(64+4*12,"esp"));
	&mov	($b_,&DWP(64+4*4,"esp"));
	&mov	($c, &DWP(64+4*8,"esp"));
	&mov	($c_,&DWP(64+4*9,"esp"));
	&add	($d,1);				# counter value
	&mov	(&DWP(4*7,"esp"),$b);
	&mov	(&DWP(4*15,"esp"),$d_);
	&mov	(&DWP(64+4*12,"esp"),$d);	# save counter value

	&mov	($b,10);			# loop counter
	&jmp	(&label("loop"));

	# Ten iterations of eight quarter-rounds each.
&set_label("loop",16);
	&add	($a,$b_);			# elsewhere
	&mov	(&DWP(128,"esp"),$b);		# save loop counter
	&mov	($b,$b_);
	&QUARTERROUND(0, 4, 8, 12, 0);
	&QUARTERROUND(1, 5, 9, 13, 1);
	&QUARTERROUND(2, 6,10, 14, 2);
	&QUARTERROUND(3, 7,11, 15, 3);
	&QUARTERROUND(0, 5,10, 15, 4);
	&QUARTERROUND(1, 6,11, 12, 5);
	&QUARTERROUND(2, 7, 8, 13, 6);
	&QUARTERROUND(3, 4, 9, 14, 7);
	&dec	($b);
	&jnz	(&label("loop"));

	&mov	($b,&wparam(2));		# load len

	&add	($a,0x61707865);		# accumulate key material
	&add	($b_,&DWP(64+4*4,"esp"));
	&add	($c, &DWP(64+4*8,"esp"));
	&add	($c_,&DWP(64+4*9,"esp"));

	&cmp	($b,64);			# less than a full block left?
	&jb	(&label("tail"));

	# Full block: add the input state, xor with the input and write
	# the result, five words at a time.
	&mov	($b,&wparam(1));		# load input pointer
	&add	($d, &DWP(64+4*12,"esp"));
	&add	($d_,&DWP(64+4*14,"esp"));

	&xor	($a, &DWP(4*0,$b));		# xor with input
	&xor	($b_,&DWP(4*4,$b));
	&mov	(&DWP(4*0,"esp"),$a);
	&mov	($a,&wparam(0));		# load output pointer
	&xor	($c, &DWP(4*8,$b));
	&xor	($c_,&DWP(4*9,$b));
	&xor	($d, &DWP(4*12,$b));
	&xor	($d_,&DWP(4*14,$b));
	&mov	(&DWP(4*4,$a),$b_);		# write output
	&mov	(&DWP(4*8,$a),$c);
	&mov	(&DWP(4*9,$a),$c_);
	&mov	(&DWP(4*12,$a),$d);
	&mov	(&DWP(4*14,$a),$d_);

	&mov	($b_,&DWP(4*1,"esp"));
	&mov	($c, &DWP(4*2,"esp"));
	&mov	($c_,&DWP(4*3,"esp"));
	&mov	($d, &DWP(4*5,"esp"));
	&mov	($d_,&DWP(4*6,"esp"));
	&add	($b_,0x3320646e);		# accumulate key material
	&add	($c, 0x79622d32);
	&add	($c_,0x6b206574);
	&add	($d, &DWP(64+4*5,"esp"));
	&add	($d_,&DWP(64+4*6,"esp"));
	&xor	($b_,&DWP(4*1,$b));
	&xor	($c, &DWP(4*2,$b));
	&xor	($c_,&DWP(4*3,$b));
	&xor	($d, &DWP(4*5,$b));
	&xor	($d_,&DWP(4*6,$b));
	&mov	(&DWP(4*1,$a),$b_);
	&mov	(&DWP(4*2,$a),$c);
	&mov	(&DWP(4*3,$a),$c_);
	&mov	(&DWP(4*5,$a),$d);
	&mov	(&DWP(4*6,$a),$d_);

	&mov	($b_,&DWP(4*7,"esp"));
	&mov	($c, &DWP(4*10,"esp"));
	&mov	($c_,&DWP(4*11,"esp"));
	&mov	($d, &DWP(4*13,"esp"));
	&mov	($d_,&DWP(4*15,"esp"));
	&add	($b_,&DWP(64+4*7,"esp"));
	&add	($c, &DWP(64+4*10,"esp"));
	&add	($c_,&DWP(64+4*11,"esp"));
	&add	($d, &DWP(64+4*13,"esp"));
	&add	($d_,&DWP(64+4*15,"esp"));
	&xor	($b_,&DWP(4*7,$b));
	&xor	($c, &DWP(4*10,$b));
	&xor	($c_,&DWP(4*11,$b));
	&xor	($d, &DWP(4*13,$b));
	&xor	($d_,&DWP(4*15,$b));
	&lea	($b,&DWP(4*16,$b));		# advance input pointer
	&mov	(&DWP(4*7,$a),$b_);
	&mov	($b_,&DWP(4*0,"esp"));
	&mov	(&DWP(4*10,$a),$c);
	&mov	($c,&wparam(2));		# len
	&mov	(&DWP(4*11,$a),$c_);
	&mov	(&DWP(4*13,$a),$d);
	&mov	(&DWP(4*15,$a),$d_);
	&mov	(&DWP(4*0,$a),$b_);
	&lea	($a,&DWP(4*16,$a));		# advance output pointer
	&sub	($c,64);
	&jnz	(&label("outer_loop"));

	&jmp	(&label("done"));

	# Partial block: finish the key stream block on the stack, then
	# xor it into the output one byte at a time.
&set_label("tail");
	&add	($d, &DWP(64+4*12,"esp"));
	&add	($d_,&DWP(64+4*14,"esp"));
	&mov	(&DWP(4*0,"esp"),$a);
	&mov	(&DWP(4*4,"esp"),$b_);
	&mov	(&DWP(4*8,"esp"),$c);
	&mov	(&DWP(4*9,"esp"),$c_);
	&mov	(&DWP(4*12,"esp"),$d);
	&mov	(&DWP(4*14,"esp"),$d_);

	&mov	($b_,&DWP(4*1,"esp"));
	&mov	($c, &DWP(4*2,"esp"));
	&mov	($c_,&DWP(4*3,"esp"));
	&mov	($d, &DWP(4*5,"esp"));
	&mov	($d_,&DWP(4*6,"esp"));
	&add	($b_,0x3320646e);		# accumulate key material
	&add	($c, 0x79622d32);
	&add	($c_,0x6b206574);
	&add	($d, &DWP(64+4*5,"esp"));
	&add	($d_,&DWP(64+4*6,"esp"));
	&mov	(&DWP(4*1,"esp"),$b_);
	&mov	(&DWP(4*2,"esp"),$c);
	&mov	(&DWP(4*3,"esp"),$c_);
	&mov	(&DWP(4*5,"esp"),$d);
	&mov	(&DWP(4*6,"esp"),$d_);

	&mov	($b_,&DWP(4*7,"esp"));
	&mov	($c, &DWP(4*10,"esp"));
	&mov	($c_,&DWP(4*11,"esp"));
	&mov	($d, &DWP(4*13,"esp"));
	&mov	($d_,&DWP(4*15,"esp"));
	&add	($b_,&DWP(64+4*7,"esp"));
	&add	($c, &DWP(64+4*10,"esp"));
	&add	($c_,&DWP(64+4*11,"esp"));
	&add	($d, &DWP(64+4*13,"esp"));
	&add	($d_,&DWP(64+4*15,"esp"));
	&mov	(&DWP(4*7,"esp"),$b_);
	&mov	($b_,&wparam(1));		# load input
	&mov	(&DWP(4*10,"esp"),$c);
	&mov	($c,&wparam(0));		# load output
	&mov	(&DWP(4*11,"esp"),$c_);
	&xor	($c_,$c_);			# byte index
	&mov	(&DWP(4*13,"esp"),$d);
	&mov	(&DWP(4*15,"esp"),$d_);

	&xor	("eax","eax");
	&xor	("edx","edx");
&set_label("tail_loop");
	&movb	("al",&BP(0,$c_,$b_));		# input byte
	&movb	("dl",&BP(0,"esp",$c_));	# key stream byte
	&lea	($c_,&DWP(1,$c_));
	&xor	("al","dl");
	&mov	(&BP(-1,$c,$c_),"al");
	&dec	($b);				# remaining length
	&jnz	(&label("tail_loop"));

&set_label("done");
	&stack_pop(33);
&set_label("no_data");
&function_end("ChaCha20_ctr32");

376e1051a39Sopenharmony_ciif ($xmm) {
377e1051a39Sopenharmony_cimy ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
378e1051a39Sopenharmony_cimy ($out,$inp,$len)=("edi","esi","ecx");
379e1051a39Sopenharmony_ci
# QUARTERROUND_SSSE3($ai,$bi,$ci,$di,$i)
#
# Emit the SSSE3 instruction sequence for one ChaCha quarter-round on
# state rows $ai,$bi,$ci,$di.
#
#   $ai,$bi,$ci,$di  indices (0..15) of the state rows, which live in
#                    16-byte slots at 16*index-128(%ebx);
#   $i               position (0..7) of this quarter-round inside the
#                    double round; it selects which neighbouring rows
#                    are stored and preloaded.
#
# The 16- and 8-bit rotations are done with pshufb using the masks at
# 0(%eax) and 16(%eax); the 12- and 7-bit rotations use a shift pair
# plus por, borrowing the idle "a" register as a temporary.  As in the
# integer version, results of the previous quarter-round are written
# back and operands of the next one are preloaded into the alternate
# registers, and all four register name pairs are swapped at the end.
# The leading paddd/pxor are emitted "elsewhere": by the caller before
# the first invocation and by each invocation with $i<7 for its
# successor.
#
# The instruction order is deliberate interleaving; do not reorder.
sub QUARTERROUND_SSSE3 {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	# At the boundaries between the even and the odd round the
	# previous/next indices follow the diagonal pattern in the table
	# above rather than a plain +/-1 step within the row.
	if ($i==0) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&paddd	($xa,$xb);			# see elsewhere
	#&pxor	($xd,$xa);			# see elsewhere
	 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
	&pshufb	($xd,&QWP(0,"eax"));		# rot16
	 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
	&paddd	($xc,$xd);
	 &movdqa($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
	&pxor	($xb,$xc);
	 &movdqa($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
	&movdqa	($xa_,$xb);			# borrow as temporary
	&pslld	($xb,12);
	&psrld	($xa_,20);
	&por	($xb,$xa_);
	 &movdqa($xa_,&QWP(16*$an-128,"ebx"));
	&paddd	($xa,$xb);
	 &movdqa($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
	&pxor	($xd,$xa);
	&movdqa	(&QWP(16*$ai-128,"ebx"),$xa);
	&pshufb	($xd,&QWP(16,"eax"));		# rot8
	&paddd	($xc,$xd);
	&movdqa	(&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
	&movdqa	($xd_,$xd)			if ($di==$dn);
	&pxor	($xb,$xc);
	 &paddd	($xa_,$xb_)			if ($i<7);	# elsewhere
	&movdqa	($xa,$xb);			# borrow as temporary
	&pslld	($xb,7);
	&psrld	($xa,25);
	 &pxor	($xd_,$xa_)			if ($i<7);	# elsewhere
	&por	($xb,$xa);

	# Rotate the register aliases for the next invocation.
	($xa,$xa_)=($xa_,$xa);
	($xb,$xb_)=($xb_,$xb);
	($xc,$xc_)=($xc_,$xc);
	($xd,$xd_)=($xd_,$xd);
}

446e1051a39Sopenharmony_ci&function_begin("ChaCha20_ssse3");
447e1051a39Sopenharmony_ci&set_label("ssse3_shortcut");
448e1051a39Sopenharmony_ciif ($ymm) {
449e1051a39Sopenharmony_ci	&test		(&DWP(4,"ebp"),1<<11);		# test XOP bit
450e1051a39Sopenharmony_ci	&jnz		(&label("xop_shortcut"));
451e1051a39Sopenharmony_ci}
452e1051a39Sopenharmony_ci
453e1051a39Sopenharmony_ci	&mov		($out,&wparam(0));
454e1051a39Sopenharmony_ci	&mov		($inp,&wparam(1));
455e1051a39Sopenharmony_ci	&mov		($len,&wparam(2));
456e1051a39Sopenharmony_ci	&mov		("edx",&wparam(3));		# key
457e1051a39Sopenharmony_ci	&mov		("ebx",&wparam(4));		# counter and nonce
458e1051a39Sopenharmony_ci
459e1051a39Sopenharmony_ci	&mov		("ebp","esp");
460e1051a39Sopenharmony_ci	&stack_push	(131);
461e1051a39Sopenharmony_ci	&and		("esp",-64);
462e1051a39Sopenharmony_ci	&mov		(&DWP(512,"esp"),"ebp");
463e1051a39Sopenharmony_ci
464e1051a39Sopenharmony_ci	&lea		("eax",&DWP(&label("ssse3_data")."-".
465e1051a39Sopenharmony_ci				    &label("pic_point"),"eax"));
466e1051a39Sopenharmony_ci	&movdqu		("xmm3",&QWP(0,"ebx"));		# counter and nonce
467e1051a39Sopenharmony_ci
468e1051a39Sopenharmony_ciif (defined($gasver) && $gasver>=2.17) {		# even though we encode
469e1051a39Sopenharmony_ci							# pshufb manually, we
470e1051a39Sopenharmony_ci							# handle only register
471e1051a39Sopenharmony_ci							# operands, while this
472e1051a39Sopenharmony_ci							# segment uses memory
473e1051a39Sopenharmony_ci							# operand...
474e1051a39Sopenharmony_ci	&cmp		($len,64*4);
475e1051a39Sopenharmony_ci	&jb		(&label("1x"));
476e1051a39Sopenharmony_ci
477e1051a39Sopenharmony_ci	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
478e1051a39Sopenharmony_ci	&mov		(&DWP(512+8,"esp"),"ebx");
479e1051a39Sopenharmony_ci	&sub		($len,64*4);			# bias len
480e1051a39Sopenharmony_ci	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization
481e1051a39Sopenharmony_ci
482e1051a39Sopenharmony_ci	&movdqu		("xmm7",&QWP(0,"edx"));		# key
483e1051a39Sopenharmony_ci	&pshufd		("xmm0","xmm3",0x00);
484e1051a39Sopenharmony_ci	&pshufd		("xmm1","xmm3",0x55);
485e1051a39Sopenharmony_ci	&pshufd		("xmm2","xmm3",0xaa);
486e1051a39Sopenharmony_ci	&pshufd		("xmm3","xmm3",0xff);
487e1051a39Sopenharmony_ci	 &paddd		("xmm0",&QWP(16*3,"eax"));	# fix counters
488e1051a39Sopenharmony_ci	&pshufd		("xmm4","xmm7",0x00);
489e1051a39Sopenharmony_ci	&pshufd		("xmm5","xmm7",0x55);
490e1051a39Sopenharmony_ci	 &psubd		("xmm0",&QWP(16*4,"eax"));
491e1051a39Sopenharmony_ci	&pshufd		("xmm6","xmm7",0xaa);
492e1051a39Sopenharmony_ci	&pshufd		("xmm7","xmm7",0xff);
493e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*12-128,"ebp"),"xmm0");
494e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*13-128,"ebp"),"xmm1");
495e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*14-128,"ebp"),"xmm2");
496e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*15-128,"ebp"),"xmm3");
497e1051a39Sopenharmony_ci	 &movdqu	("xmm3",&QWP(16,"edx"));	# key
498e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*4-128,"ebp"),"xmm4");
499e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*5-128,"ebp"),"xmm5");
500e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*6-128,"ebp"),"xmm6");
501e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*7-128,"ebp"),"xmm7");
502e1051a39Sopenharmony_ci	 &movdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
503e1051a39Sopenharmony_ci	 &lea		("ebx",&DWP(128,"esp"));	# size optimization
504e1051a39Sopenharmony_ci
505e1051a39Sopenharmony_ci	&pshufd		("xmm0","xmm3",0x00);
506e1051a39Sopenharmony_ci	&pshufd		("xmm1","xmm3",0x55);
507e1051a39Sopenharmony_ci	&pshufd		("xmm2","xmm3",0xaa);
508e1051a39Sopenharmony_ci	&pshufd		("xmm3","xmm3",0xff);
509e1051a39Sopenharmony_ci	&pshufd		("xmm4","xmm7",0x00);
510e1051a39Sopenharmony_ci	&pshufd		("xmm5","xmm7",0x55);
511e1051a39Sopenharmony_ci	&pshufd		("xmm6","xmm7",0xaa);
512e1051a39Sopenharmony_ci	&pshufd		("xmm7","xmm7",0xff);
513e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*8-128,"ebp"),"xmm0");
514e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*9-128,"ebp"),"xmm1");
515e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*10-128,"ebp"),"xmm2");
516e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*11-128,"ebp"),"xmm3");
517e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*0-128,"ebp"),"xmm4");
518e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*1-128,"ebp"),"xmm5");
519e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*2-128,"ebp"),"xmm6");
520e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*3-128,"ebp"),"xmm7");
521e1051a39Sopenharmony_ci
522e1051a39Sopenharmony_ci	&lea		($inp,&DWP(128,$inp));		# size optimization
523e1051a39Sopenharmony_ci	&lea		($out,&DWP(128,$out));		# size optimization
524e1051a39Sopenharmony_ci	&jmp		(&label("outer_loop"));
525e1051a39Sopenharmony_ci
526e1051a39Sopenharmony_ci&set_label("outer_loop",16);
527e1051a39Sopenharmony_ci	#&movdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
528e1051a39Sopenharmony_ci	&movdqa		("xmm1",&QWP(16*1-128,"ebp"));
529e1051a39Sopenharmony_ci	&movdqa		("xmm2",&QWP(16*2-128,"ebp"));
530e1051a39Sopenharmony_ci	&movdqa		("xmm3",&QWP(16*3-128,"ebp"));
531e1051a39Sopenharmony_ci	#&movdqa	("xmm4",&QWP(16*4-128,"ebp"));
532e1051a39Sopenharmony_ci	&movdqa		("xmm5",&QWP(16*5-128,"ebp"));
533e1051a39Sopenharmony_ci	&movdqa		("xmm6",&QWP(16*6-128,"ebp"));
534e1051a39Sopenharmony_ci	&movdqa		("xmm7",&QWP(16*7-128,"ebp"));
535e1051a39Sopenharmony_ci	#&movdqa	(&QWP(16*0-128,"ebx"),"xmm0");
536e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*1-128,"ebx"),"xmm1");
537e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*2-128,"ebx"),"xmm2");
538e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*3-128,"ebx"),"xmm3");
539e1051a39Sopenharmony_ci	#&movdqa	(&QWP(16*4-128,"ebx"),"xmm4");
540e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*5-128,"ebx"),"xmm5");
541e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*6-128,"ebx"),"xmm6");
542e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*7-128,"ebx"),"xmm7");
543e1051a39Sopenharmony_ci	#&movdqa	("xmm0",&QWP(16*8-128,"ebp"));
544e1051a39Sopenharmony_ci	#&movdqa	("xmm1",&QWP(16*9-128,"ebp"));
545e1051a39Sopenharmony_ci	&movdqa		("xmm2",&QWP(16*10-128,"ebp"));
546e1051a39Sopenharmony_ci	&movdqa		("xmm3",&QWP(16*11-128,"ebp"));
547e1051a39Sopenharmony_ci	&movdqa		("xmm4",&QWP(16*12-128,"ebp"));
548e1051a39Sopenharmony_ci	&movdqa		("xmm5",&QWP(16*13-128,"ebp"));
549e1051a39Sopenharmony_ci	&movdqa		("xmm6",&QWP(16*14-128,"ebp"));
550e1051a39Sopenharmony_ci	&movdqa		("xmm7",&QWP(16*15-128,"ebp"));
551e1051a39Sopenharmony_ci	&paddd		("xmm4",&QWP(16*4,"eax"));	# counter value
552e1051a39Sopenharmony_ci	#&movdqa	(&QWP(16*8-128,"ebx"),"xmm0");
553e1051a39Sopenharmony_ci	#&movdqa	(&QWP(16*9-128,"ebx"),"xmm1");
554e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*10-128,"ebx"),"xmm2");
555e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*11-128,"ebx"),"xmm3");
556e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*12-128,"ebx"),"xmm4");
557e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*13-128,"ebx"),"xmm5");
558e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*14-128,"ebx"),"xmm6");
559e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*15-128,"ebx"),"xmm7");
560e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value
561e1051a39Sopenharmony_ci
562e1051a39Sopenharmony_ci	&movdqa		($xa, &QWP(16*0-128,"ebp"));
563e1051a39Sopenharmony_ci	&movdqa		($xd, "xmm4");
564e1051a39Sopenharmony_ci	&movdqa		($xb_,&QWP(16*4-128,"ebp"));
565e1051a39Sopenharmony_ci	&movdqa		($xc, &QWP(16*8-128,"ebp"));
566e1051a39Sopenharmony_ci	&movdqa		($xc_,&QWP(16*9-128,"ebp"));
567e1051a39Sopenharmony_ci
568e1051a39Sopenharmony_ci	&mov		("edx",10);			# loop counter
569e1051a39Sopenharmony_ci	&nop		();
570e1051a39Sopenharmony_ci
571e1051a39Sopenharmony_ci&set_label("loop",16);
572e1051a39Sopenharmony_ci	&paddd		($xa,$xb_);			# elsewhere
573e1051a39Sopenharmony_ci	&movdqa		($xb,$xb_);
574e1051a39Sopenharmony_ci	&pxor		($xd,$xa);			# elsewhere
575e1051a39Sopenharmony_ci	&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
576e1051a39Sopenharmony_ci	&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
577e1051a39Sopenharmony_ci	&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
578e1051a39Sopenharmony_ci	&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
579e1051a39Sopenharmony_ci	&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
580e1051a39Sopenharmony_ci	&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
581e1051a39Sopenharmony_ci	&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
582e1051a39Sopenharmony_ci	&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
583e1051a39Sopenharmony_ci	&dec		("edx");
584e1051a39Sopenharmony_ci	&jnz		(&label("loop"));
585e1051a39Sopenharmony_ci
586e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*4-128,"ebx"),$xb_);
587e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*8-128,"ebx"),$xc);
588e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*9-128,"ebx"),$xc_);
589e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*12-128,"ebx"),$xd);
590e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*14-128,"ebx"),$xd_);
591e1051a39Sopenharmony_ci
592e1051a39Sopenharmony_ci    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
593e1051a39Sopenharmony_ci
594e1051a39Sopenharmony_ci	#&movdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
595e1051a39Sopenharmony_ci	&movdqa		($xa1,&QWP(16*1-128,"ebx"));
596e1051a39Sopenharmony_ci	&movdqa		($xa2,&QWP(16*2-128,"ebx"));
597e1051a39Sopenharmony_ci	&movdqa		($xa3,&QWP(16*3-128,"ebx"));
598e1051a39Sopenharmony_ci
599e1051a39Sopenharmony_ci    for($i=0;$i<256;$i+=64) {
600e1051a39Sopenharmony_ci	&paddd		($xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
601e1051a39Sopenharmony_ci	&paddd		($xa1,&QWP($i+16*1-128,"ebp"));
602e1051a39Sopenharmony_ci	&paddd		($xa2,&QWP($i+16*2-128,"ebp"));
603e1051a39Sopenharmony_ci	&paddd		($xa3,&QWP($i+16*3-128,"ebp"));
604e1051a39Sopenharmony_ci
605e1051a39Sopenharmony_ci	&movdqa		($xt2,$xa0);		# "de-interlace" data
606e1051a39Sopenharmony_ci	&punpckldq	($xa0,$xa1);
607e1051a39Sopenharmony_ci	&movdqa		($xt3,$xa2);
608e1051a39Sopenharmony_ci	&punpckldq	($xa2,$xa3);
609e1051a39Sopenharmony_ci	&punpckhdq	($xt2,$xa1);
610e1051a39Sopenharmony_ci	&punpckhdq	($xt3,$xa3);
611e1051a39Sopenharmony_ci	&movdqa		($xa1,$xa0);
612e1051a39Sopenharmony_ci	&punpcklqdq	($xa0,$xa2);		# "a0"
613e1051a39Sopenharmony_ci	&movdqa		($xa3,$xt2);
614e1051a39Sopenharmony_ci	&punpcklqdq	($xt2,$xt3);		# "a2"
615e1051a39Sopenharmony_ci	&punpckhqdq	($xa1,$xa2);		# "a1"
616e1051a39Sopenharmony_ci	&punpckhqdq	($xa3,$xt3);		# "a3"
617e1051a39Sopenharmony_ci
618e1051a39Sopenharmony_ci	#($xa2,$xt2)=($xt2,$xa2);
619e1051a39Sopenharmony_ci
620e1051a39Sopenharmony_ci	&movdqu		($xt0,&QWP(64*0-128,$inp));	# load input
621e1051a39Sopenharmony_ci	&movdqu		($xt1,&QWP(64*1-128,$inp));
622e1051a39Sopenharmony_ci	&movdqu		($xa2,&QWP(64*2-128,$inp));
623e1051a39Sopenharmony_ci	&movdqu		($xt3,&QWP(64*3-128,$inp));
624e1051a39Sopenharmony_ci	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
625e1051a39Sopenharmony_ci	&pxor		($xt0,$xa0);
626e1051a39Sopenharmony_ci	&movdqa		($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
627e1051a39Sopenharmony_ci	&pxor		($xt1,$xa1);
628e1051a39Sopenharmony_ci	&movdqa		($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
629e1051a39Sopenharmony_ci	&pxor		($xt2,$xa2);
630e1051a39Sopenharmony_ci	&movdqa		($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
631e1051a39Sopenharmony_ci	&pxor		($xt3,$xa3);
632e1051a39Sopenharmony_ci	&movdqa		($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
633e1051a39Sopenharmony_ci	&movdqu		(&QWP(64*0-128,$out),$xt0);	# store output
634e1051a39Sopenharmony_ci	&movdqu		(&QWP(64*1-128,$out),$xt1);
635e1051a39Sopenharmony_ci	&movdqu		(&QWP(64*2-128,$out),$xt2);
636e1051a39Sopenharmony_ci	&movdqu		(&QWP(64*3-128,$out),$xt3);
637e1051a39Sopenharmony_ci	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
638e1051a39Sopenharmony_ci    }
639e1051a39Sopenharmony_ci	&sub		($len,64*4);
640e1051a39Sopenharmony_ci	&jnc		(&label("outer_loop"));
641e1051a39Sopenharmony_ci
642e1051a39Sopenharmony_ci	&add		($len,64*4);
643e1051a39Sopenharmony_ci	&jz		(&label("done"));
644e1051a39Sopenharmony_ci
645e1051a39Sopenharmony_ci	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
646e1051a39Sopenharmony_ci	&lea		($inp,&DWP(-128,$inp));
647e1051a39Sopenharmony_ci	&mov		("edx",&DWP(512+4,"esp"));
648e1051a39Sopenharmony_ci	&lea		($out,&DWP(-128,$out));
649e1051a39Sopenharmony_ci
650e1051a39Sopenharmony_ci	&movd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
651e1051a39Sopenharmony_ci	&movdqu		("xmm3",&QWP(0,"ebx"));
652e1051a39Sopenharmony_ci	&paddd		("xmm2",&QWP(16*6,"eax"));	# +four
653e1051a39Sopenharmony_ci	&pand		("xmm3",&QWP(16*7,"eax"));
654e1051a39Sopenharmony_ci	&por		("xmm3","xmm2");		# counter value
655e1051a39Sopenharmony_ci}
656e1051a39Sopenharmony_ci{
657e1051a39Sopenharmony_cimy ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
658e1051a39Sopenharmony_ci
# Emit one ChaCha round for the single-block SSSE3 path.  $a..$d each hold
# one four-dword row of the state, so a single call performs four
# quarter-rounds in parallel.  $t is scratch; $rot16/$rot24 hold the pshufb
# masks used for the byte-granular rotations of $d.
sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	# The quarter-round consists of two structurally identical halves that
	# differ only in the pshufb mask applied to $d and in the left-rotate
	# amount applied to $b (12, then 7).
	foreach my $half ([$rot16,12],[$rot24,7]) {
		my ($shuf_mask,$lrot)=@$half;

		&paddd		($a,$b);
		&pxor		($d,$a);
		&pshufb		($d,$shuf_mask);

		&paddd		($c,$d);
		&pxor		($b,$c);
		# rotate $b left by $lrot: (b >> (32-n)) | (b << n)
		&movdqa		($t,$b);
		&psrld		($b,32-$lrot);
		&pslld		($t,$lrot);
		&por		($b,$t);
	}
}
682e1051a39Sopenharmony_ci
683e1051a39Sopenharmony_ci&set_label("1x");
	# Single-block path: every pass through "loop1x" produces 64 bytes of
	# keystream.  Execution also falls through to this label after the 4x
	# loop, with the rebuilt counter/nonce row already sitting in xmm3 ($d).
684e1051a39Sopenharmony_ci	&movdqa		($a,&QWP(16*2,"eax"));		# sigma
685e1051a39Sopenharmony_ci	&movdqu		($b,&QWP(0,"edx"));		# 16 bytes at edx (key pointer in ChaCha20_xop; presumably the same here - confirm)
686e1051a39Sopenharmony_ci	&movdqu		($c,&QWP(16,"edx"));		# next 16 bytes at edx
687e1051a39Sopenharmony_ci	#&movdqu	($d,&QWP(0,"ebx"));		# already loaded
688e1051a39Sopenharmony_ci	&movdqa		($rot16,&QWP(0,"eax"));		# pshufb masks used by SSSE3ROUND
689e1051a39Sopenharmony_ci	&movdqa		($rot24,&QWP(16,"eax"));
	# NOTE(review): the dword written here is overwritten by the 16-byte
	# store of $d to the same slot just below - looks like a dead store.
690e1051a39Sopenharmony_ci	&mov		(&DWP(16*3,"esp"),"ebp");
691e1051a39Sopenharmony_ci
	# Keep a copy of the input state at 0..63(esp): it is added back after
	# the rounds and reloaded for each subsequent block.
692e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$a);
693e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$b);
694e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*2,"esp"),$c);
695e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$d);
696e1051a39Sopenharmony_ci	&mov		("edx",10);			# 10 iterations of two rounds each
697e1051a39Sopenharmony_ci	&jmp		(&label("loop1x"));
698e1051a39Sopenharmony_ci
	# Entry for the second and later blocks: reload the saved state and bump
	# the counter row by (1,0,0,0), storing the new row back to the stack.
699e1051a39Sopenharmony_ci&set_label("outer1x",16);
700e1051a39Sopenharmony_ci	&movdqa		($d,&QWP(16*5,"eax"));		# one
701e1051a39Sopenharmony_ci	&movdqa		($a,&QWP(16*0,"esp"));
702e1051a39Sopenharmony_ci	&movdqa		($b,&QWP(16*1,"esp"));
703e1051a39Sopenharmony_ci	&movdqa		($c,&QWP(16*2,"esp"));
704e1051a39Sopenharmony_ci	&paddd		($d,&QWP(16*3,"esp"));
705e1051a39Sopenharmony_ci	&mov		("edx",10);
706e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$d);
707e1051a39Sopenharmony_ci	&jmp		(&label("loop1x"));
708e1051a39Sopenharmony_ci
709e1051a39Sopenharmony_ci&set_label("loop1x",16);
710e1051a39Sopenharmony_ci	&SSSE3ROUND();
	# Rotate the lanes of rows $b/$c/$d by 1/2/3 positions so that the next
	# SSSE3ROUND operates on the diagonals of the 4x4 state.
711e1051a39Sopenharmony_ci	&pshufd	($c,$c,0b01001110);
712e1051a39Sopenharmony_ci	&pshufd	($b,$b,0b00111001);
713e1051a39Sopenharmony_ci	&pshufd	($d,$d,0b10010011);
714e1051a39Sopenharmony_ci	&nop	();
715e1051a39Sopenharmony_ci
716e1051a39Sopenharmony_ci	&SSSE3ROUND();
	# Inverse lane rotation: put the rows back into column order.
717e1051a39Sopenharmony_ci	&pshufd	($c,$c,0b01001110);
718e1051a39Sopenharmony_ci	&pshufd	($b,$b,0b10010011);
719e1051a39Sopenharmony_ci	&pshufd	($d,$d,0b00111001);
720e1051a39Sopenharmony_ci
721e1051a39Sopenharmony_ci	&dec		("edx");
722e1051a39Sopenharmony_ci	&jnz		(&label("loop1x"));
723e1051a39Sopenharmony_ci
	# Add the saved input state to the permuted state to form the keystream.
724e1051a39Sopenharmony_ci	&paddd		($a,&QWP(16*0,"esp"));
725e1051a39Sopenharmony_ci	&paddd		($b,&QWP(16*1,"esp"));
726e1051a39Sopenharmony_ci	&paddd		($c,&QWP(16*2,"esp"));
727e1051a39Sopenharmony_ci	&paddd		($d,&QWP(16*3,"esp"));
728e1051a39Sopenharmony_ci
729e1051a39Sopenharmony_ci	&cmp		($len,64);
730e1051a39Sopenharmony_ci	&jb		(&label("tail"));		# fewer than 64 bytes left
731e1051a39Sopenharmony_ci
732e1051a39Sopenharmony_ci	&movdqu		($t,&QWP(16*0,$inp));
733e1051a39Sopenharmony_ci	&movdqu		($t1,&QWP(16*1,$inp));
734e1051a39Sopenharmony_ci	&pxor		($a,$t);		# xor with input
735e1051a39Sopenharmony_ci	&movdqu		($t,&QWP(16*2,$inp));
736e1051a39Sopenharmony_ci	&pxor		($b,$t1);
737e1051a39Sopenharmony_ci	&movdqu		($t1,&QWP(16*3,$inp));
738e1051a39Sopenharmony_ci	&pxor		($c,$t);
739e1051a39Sopenharmony_ci	&pxor		($d,$t1);
740e1051a39Sopenharmony_ci	&lea		($inp,&DWP(16*4,$inp));	# inp+=64
741e1051a39Sopenharmony_ci
742e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*0,$out),$a);	# write output
743e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*1,$out),$b);
744e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*2,$out),$c);
745e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*3,$out),$d);
746e1051a39Sopenharmony_ci	&lea		($out,&DWP(16*4,$out));	# out+=64
747e1051a39Sopenharmony_ci
748e1051a39Sopenharmony_ci	&sub		($len,64);
749e1051a39Sopenharmony_ci	&jnz		(&label("outer1x"));	# more data: next block
750e1051a39Sopenharmony_ci
751e1051a39Sopenharmony_ci	&jmp		(&label("done"));
752e1051a39Sopenharmony_ci
	# Partial final block: spill the 64 keystream bytes to the stack and xor
	# them with the input one byte at a time.
753e1051a39Sopenharmony_ci&set_label("tail");
754e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$a);
755e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$b);
756e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*2,"esp"),$c);
757e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$d);
758e1051a39Sopenharmony_ci
759e1051a39Sopenharmony_ci	&xor		("eax","eax");
760e1051a39Sopenharmony_ci	&xor		("edx","edx");
761e1051a39Sopenharmony_ci	&xor		("ebp","ebp");			# ebp = byte index
762e1051a39Sopenharmony_ci
763e1051a39Sopenharmony_ci&set_label("tail_loop");
764e1051a39Sopenharmony_ci	&movb		("al",&BP(0,"esp","ebp"));	# keystream byte
765e1051a39Sopenharmony_ci	&movb		("dl",&BP(0,$inp,"ebp"));	# input byte
766e1051a39Sopenharmony_ci	&lea		("ebp",&DWP(1,"ebp"));		# index++
767e1051a39Sopenharmony_ci	&xor		("al","dl");
768e1051a39Sopenharmony_ci	&movb		(&BP(-1,$out,"ebp"),"al");	# -1 compensates for the index++ above
769e1051a39Sopenharmony_ci	&dec		($len);
770e1051a39Sopenharmony_ci	&jnz		(&label("tail_loop"));
771e1051a39Sopenharmony_ci}
772e1051a39Sopenharmony_ci&set_label("done");
	# Reload esp from the slot at 512(esp).  NOTE(review): the matching store
	# is in the prologue outside this excerpt; ChaCha20_xop saves the entry
	# value of esp in the same slot - presumably this function does too.
773e1051a39Sopenharmony_ci	&mov		("esp",&DWP(512,"esp"));
774e1051a39Sopenharmony_ci&function_end("ChaCha20_ssse3");
775e1051a39Sopenharmony_ci
# Constant table shared by the SSSE3 and XOP code paths; both address it
# through "eax" in 16-byte rows (row N is at 16*N).
776e1051a39Sopenharmony_ci&align	(64);
777e1051a39Sopenharmony_ci&set_label("ssse3_data");
# row 0: pshufb mask loaded into $rot16 (swaps the 16-bit halves of each dword)
778e1051a39Sopenharmony_ci&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
# row 1: pshufb mask loaded into $rot24 (rotates the bytes of each dword by one position)
779e1051a39Sopenharmony_ci&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
# row 2: sigma, i.e. the ASCII string "expand 32-byte k" as four little-endian dwords
780e1051a39Sopenharmony_ci&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
# row 3: per-lane offsets added to the broadcast counter in the 4x setup
781e1051a39Sopenharmony_ci&data_word(0,1,2,3);
# row 4: subtracted once in the 4x setup, then added to the counter vector at the top of every outer_loop pass
782e1051a39Sopenharmony_ci&data_word(4,4,4,4);
# row 5: "one" - counter row increment used at outer1x
783e1051a39Sopenharmony_ci&data_word(1,0,0,0);
# row 6: "+four" - added to the lane-0 counter when leaving the 4x loop for the 1x path
784e1051a39Sopenharmony_ci&data_word(4,0,0,0);
# row 7: mask applied with pand to the reloaded counter/nonce row: clears dword 0, keeps dwords 1..3
785e1051a39Sopenharmony_ci&data_word(0,-1,-1,-1);
786e1051a39Sopenharmony_ci&align	(64);
787e1051a39Sopenharmony_ci}
788e1051a39Sopenharmony_ci&asciz	("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");
789e1051a39Sopenharmony_ci
790e1051a39Sopenharmony_ciif ($ymm) {
791e1051a39Sopenharmony_cimy ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
792e1051a39Sopenharmony_cimy ($out,$inp,$len)=("edi","esi","ecx");
793e1051a39Sopenharmony_ci
# Emit one ChaCha quarter-round for the 4-block XOP path.
#
# Arguments: ($ai,$bi,$ci,$di) are the indices (0..15) of the state words
# taking the a/b/c/d roles, $i is the position (0..7) of this quarter-round
# within the double round (see the table below).  Each state word lives in a
# 16-byte slot at 16*index-128(ebx).
#
# The quarter-round arithmetic (unindented calls) is interleaved with the
# stores of the previous quarter-round's results and the loads for the next
# one (calls indented by one extra space).  On return the register-name pairs
# ($xa,$xa_) etc. are swapped, so the next call computes on the registers
# that were just loaded.
794e1051a39Sopenharmony_cisub QUARTERROUND_XOP {
795e1051a39Sopenharmony_cimy ($ai,$bi,$ci,$di,$i)=@_;
796e1051a39Sopenharmony_cimy ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
797e1051a39Sopenharmony_cimy ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous
798e1051a39Sopenharmony_ci
799e1051a39Sopenharmony_ci	#       a   b   c   d
800e1051a39Sopenharmony_ci	#
801e1051a39Sopenharmony_ci	#       0   4   8  12 < even round
802e1051a39Sopenharmony_ci	#       1   5   9  13
803e1051a39Sopenharmony_ci	#       2   6  10  14
804e1051a39Sopenharmony_ci	#       3   7  11  15
805e1051a39Sopenharmony_ci	#       0   5  10  15 < odd round
806e1051a39Sopenharmony_ci	#       1   6  11  12
807e1051a39Sopenharmony_ci	#       2   7   8  13
808e1051a39Sopenharmony_ci	#       3   4   9  14
809e1051a39Sopenharmony_ci
	# The plain +1/-1 neighbours computed above are only right inside a
	# round.  At the four round boundaries remap them so that "previous" and
	# "next" name the adjacent row of the table (wrapping from the last row
	# back to the first).
810e1051a39Sopenharmony_ci	if ($i==0) {
	    # previous becomes (3,4,9,14), the last row of the odd round
811e1051a39Sopenharmony_ci            my $j=4;
812e1051a39Sopenharmony_ci	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
813e1051a39Sopenharmony_ci	} elsif ($i==3) {
	    # next becomes (0,5,10,15), the first row of the odd round
814e1051a39Sopenharmony_ci            my $j=0;
815e1051a39Sopenharmony_ci	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
816e1051a39Sopenharmony_ci	} elsif ($i==4) {
	    # previous becomes (3,7,11,15), the last row of the even round
817e1051a39Sopenharmony_ci            my $j=4;
818e1051a39Sopenharmony_ci	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
819e1051a39Sopenharmony_ci	} elsif ($i==7) {
	    # next becomes (0,4,8,12), the first row of the even round
820e1051a39Sopenharmony_ci            my $j=0;
821e1051a39Sopenharmony_ci	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
822e1051a39Sopenharmony_ci	}
823e1051a39Sopenharmony_ci
	# The first add/xor of this quarter-round was already emitted, either
	# by the previous call (the "elsewhere" lines below) or by the caller
	# ahead of the first call.
824e1051a39Sopenharmony_ci	#&vpaddd	($xa,$xa,$xb);			# see elsewhere
825e1051a39Sopenharmony_ci	#&vpxor		($xd,$xd,$xa);			# see elsewhere
826e1051a39Sopenharmony_ci	 &vmovdqa	(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);	# spill previous c
827e1051a39Sopenharmony_ci	&vprotd		($xd,$xd,16);
828e1051a39Sopenharmony_ci	 &vmovdqa	(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);		# spill previous b
829e1051a39Sopenharmony_ci	&vpaddd		($xc,$xc,$xd);
830e1051a39Sopenharmony_ci	 &vmovdqa	($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);	# load next c
	# for $i==0 the b word is still in $xb_, where the caller loaded it
831e1051a39Sopenharmony_ci	&vpxor		($xb,$i!=0?$xb:$xb_,$xc);
832e1051a39Sopenharmony_ci	 &vmovdqa	($xa_,&QWP(16*$an-128,"ebx"));	# load next a
833e1051a39Sopenharmony_ci	&vprotd		($xb,$xb,12);
834e1051a39Sopenharmony_ci	 &vmovdqa	($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);	# load next b
835e1051a39Sopenharmony_ci	&vpaddd		($xa,$xa,$xb);
836e1051a39Sopenharmony_ci	 &vmovdqa	($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);	# load next d
837e1051a39Sopenharmony_ci	&vpxor		($xd,$xd,$xa);
838e1051a39Sopenharmony_ci	 &vpaddd	($xa_,$xa_,$xb_)		if ($i<7);	# elsewhere
839e1051a39Sopenharmony_ci	&vprotd		($xd,$xd,8);
840e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*$ai-128,"ebx"),$xa);	# store a
841e1051a39Sopenharmony_ci	&vpaddd		($xc,$xc,$xd);
842e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);	# store d
843e1051a39Sopenharmony_ci	&vpxor		($xb,$xb,$xc);
	# when $di==$dn the next quarter-round reuses the d word just computed
844e1051a39Sopenharmony_ci	 &vpxor		($xd_,$di==$dn?$xd:$xd_,$xa_)	if ($i<7);	# elsewhere
845e1051a39Sopenharmony_ci	&vprotd		($xb,$xb,7);
846e1051a39Sopenharmony_ci
	# Swap the roles of the working and prefetched registers (Perl-level
	# renaming only; no instructions are emitted for this).
847e1051a39Sopenharmony_ci	($xa,$xa_)=($xa_,$xa);
848e1051a39Sopenharmony_ci	($xb,$xb_)=($xb_,$xb);
849e1051a39Sopenharmony_ci	($xc,$xc_)=($xc_,$xc);
850e1051a39Sopenharmony_ci	($xd,$xd_)=($xd_,$xd);
851e1051a39Sopenharmony_ci}
852e1051a39Sopenharmony_ci
# ChaCha20_xop: XOP code path.  Arguments (wparam 0..4): out, inp, len, key,
# counter-and-nonce.  Inputs of at least 256 bytes go through the 4-block
# loop below; the remainder (or a short input) is handled by the "1x" path
# that follows this section.
853e1051a39Sopenharmony_ci&function_begin("ChaCha20_xop");
854e1051a39Sopenharmony_ci&set_label("xop_shortcut");
855e1051a39Sopenharmony_ci	&mov		($out,&wparam(0));
856e1051a39Sopenharmony_ci	&mov		($inp,&wparam(1));
857e1051a39Sopenharmony_ci	&mov		($len,&wparam(2));
858e1051a39Sopenharmony_ci	&mov		("edx",&wparam(3));		# key
859e1051a39Sopenharmony_ci	&mov		("ebx",&wparam(4));		# counter and nonce
860e1051a39Sopenharmony_ci	&vzeroupper	();
861e1051a39Sopenharmony_ci
	# Carve out a 64-byte aligned scratch frame and remember the entry value
	# of esp at 512(esp); the "done" label restores esp from that slot.
	# NOTE(review): stack_push(131) presumably reserves 131 dwords = 524
	# bytes (512 of scratch plus the dwords at 512, 516 and 520) - confirm
	# against x86asm.pl.
862e1051a39Sopenharmony_ci	&mov		("ebp","esp");
863e1051a39Sopenharmony_ci	&stack_push	(131);
864e1051a39Sopenharmony_ci	&and		("esp",-64);
865e1051a39Sopenharmony_ci	&mov		(&DWP(512,"esp"),"ebp");
866e1051a39Sopenharmony_ci
	# NOTE(review): this lea uses eax as its base, so eax must hold the
	# address of "pic_point" on entry - presumably set up by whoever jumps
	# to xop_shortcut; confirm.
867e1051a39Sopenharmony_ci	&lea		("eax",&DWP(&label("ssse3_data")."-".
868e1051a39Sopenharmony_ci				    &label("pic_point"),"eax"));
869e1051a39Sopenharmony_ci	&vmovdqu	("xmm3",&QWP(0,"ebx"));		# counter and nonce
870e1051a39Sopenharmony_ci
871e1051a39Sopenharmony_ci	&cmp		($len,64*4);
872e1051a39Sopenharmony_ci	&jb		(&label("1x"));			# less than four blocks
873e1051a39Sopenharmony_ci
874e1051a39Sopenharmony_ci	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
875e1051a39Sopenharmony_ci	&mov		(&DWP(512+8,"esp"),"ebx");
876e1051a39Sopenharmony_ci	&sub		($len,64*4);			# bias len
877e1051a39Sopenharmony_ci	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization
878e1051a39Sopenharmony_ci
	# Build the 4-way input state at -128..127(ebp): each of the 16 slots
	# holds one state word broadcast to all four lanes (vpshufd with
	# 0x00/0x55/0xaa/0xff replicates dword 0/1/2/3).  Slots 12..15 come from
	# the counter/nonce row, 4..7 from the first 16 key bytes.
879e1051a39Sopenharmony_ci	&vmovdqu	("xmm7",&QWP(0,"edx"));		# key
880e1051a39Sopenharmony_ci	&vpshufd	("xmm0","xmm3",0x00);
881e1051a39Sopenharmony_ci	&vpshufd	("xmm1","xmm3",0x55);
882e1051a39Sopenharmony_ci	&vpshufd	("xmm2","xmm3",0xaa);
883e1051a39Sopenharmony_ci	&vpshufd	("xmm3","xmm3",0xff);
	# lanes get counter+0..3; 4 is subtracted up front because outer_loop
	# adds 4 to the counter vector before using it
884e1051a39Sopenharmony_ci	 &vpaddd	("xmm0","xmm0",&QWP(16*3,"eax"));	# fix counters
885e1051a39Sopenharmony_ci	&vpshufd	("xmm4","xmm7",0x00);
886e1051a39Sopenharmony_ci	&vpshufd	("xmm5","xmm7",0x55);
887e1051a39Sopenharmony_ci	 &vpsubd	("xmm0","xmm0",&QWP(16*4,"eax"));
888e1051a39Sopenharmony_ci	&vpshufd	("xmm6","xmm7",0xaa);
889e1051a39Sopenharmony_ci	&vpshufd	("xmm7","xmm7",0xff);
890e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*12-128,"ebp"),"xmm0");
891e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*13-128,"ebp"),"xmm1");
892e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*14-128,"ebp"),"xmm2");
893e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*15-128,"ebp"),"xmm3");
894e1051a39Sopenharmony_ci	 &vmovdqu	("xmm3",&QWP(16,"edx"));	# key
895e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*4-128,"ebp"),"xmm4");
896e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*5-128,"ebp"),"xmm5");
897e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*6-128,"ebp"),"xmm6");
898e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*7-128,"ebp"),"xmm7");
899e1051a39Sopenharmony_ci	 &vmovdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
900e1051a39Sopenharmony_ci	 &lea		("ebx",&DWP(128,"esp"));	# size optimization
901e1051a39Sopenharmony_ci
	# slots 8..11: second 16 key bytes; slots 0..3: sigma
902e1051a39Sopenharmony_ci	&vpshufd	("xmm0","xmm3",0x00);
903e1051a39Sopenharmony_ci	&vpshufd	("xmm1","xmm3",0x55);
904e1051a39Sopenharmony_ci	&vpshufd	("xmm2","xmm3",0xaa);
905e1051a39Sopenharmony_ci	&vpshufd	("xmm3","xmm3",0xff);
906e1051a39Sopenharmony_ci	&vpshufd	("xmm4","xmm7",0x00);
907e1051a39Sopenharmony_ci	&vpshufd	("xmm5","xmm7",0x55);
908e1051a39Sopenharmony_ci	&vpshufd	("xmm6","xmm7",0xaa);
909e1051a39Sopenharmony_ci	&vpshufd	("xmm7","xmm7",0xff);
910e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*8-128,"ebp"),"xmm0");
911e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*9-128,"ebp"),"xmm1");
912e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*10-128,"ebp"),"xmm2");
913e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*11-128,"ebp"),"xmm3");
914e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*0-128,"ebp"),"xmm4");
915e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*1-128,"ebp"),"xmm5");
916e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*2-128,"ebp"),"xmm6");
917e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*3-128,"ebp"),"xmm7");
918e1051a39Sopenharmony_ci
919e1051a39Sopenharmony_ci	&lea		($inp,&DWP(128,$inp));		# size optimization
920e1051a39Sopenharmony_ci	&lea		($out,&DWP(128,$out));		# size optimization
921e1051a39Sopenharmony_ci	&jmp		(&label("outer_loop"));
922e1051a39Sopenharmony_ci
	# Per 256-byte chunk: copy the input state from the ebp area into the
	# working area at ebx, advancing the counter vector by 4 on the way.
	# The commented-out moves are the slots (0, 4, 8, 9) that are loaded
	# straight into $xa/$xb_/$xc/$xc_ just before the round loop instead.
923e1051a39Sopenharmony_ci&set_label("outer_loop",32);
924e1051a39Sopenharmony_ci	#&vmovdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
925e1051a39Sopenharmony_ci	&vmovdqa	("xmm1",&QWP(16*1-128,"ebp"));
926e1051a39Sopenharmony_ci	&vmovdqa	("xmm2",&QWP(16*2-128,"ebp"));
927e1051a39Sopenharmony_ci	&vmovdqa	("xmm3",&QWP(16*3-128,"ebp"));
928e1051a39Sopenharmony_ci	#&vmovdqa	("xmm4",&QWP(16*4-128,"ebp"));
929e1051a39Sopenharmony_ci	&vmovdqa	("xmm5",&QWP(16*5-128,"ebp"));
930e1051a39Sopenharmony_ci	&vmovdqa	("xmm6",&QWP(16*6-128,"ebp"));
931e1051a39Sopenharmony_ci	&vmovdqa	("xmm7",&QWP(16*7-128,"ebp"));
932e1051a39Sopenharmony_ci	#&vmovdqa	(&QWP(16*0-128,"ebx"),"xmm0");
933e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*1-128,"ebx"),"xmm1");
934e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*2-128,"ebx"),"xmm2");
935e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*3-128,"ebx"),"xmm3");
936e1051a39Sopenharmony_ci	#&vmovdqa	(&QWP(16*4-128,"ebx"),"xmm4");
937e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*5-128,"ebx"),"xmm5");
938e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*6-128,"ebx"),"xmm6");
939e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*7-128,"ebx"),"xmm7");
940e1051a39Sopenharmony_ci	#&vmovdqa	("xmm0",&QWP(16*8-128,"ebp"));
941e1051a39Sopenharmony_ci	#&vmovdqa	("xmm1",&QWP(16*9-128,"ebp"));
942e1051a39Sopenharmony_ci	&vmovdqa	("xmm2",&QWP(16*10-128,"ebp"));
943e1051a39Sopenharmony_ci	&vmovdqa	("xmm3",&QWP(16*11-128,"ebp"));
944e1051a39Sopenharmony_ci	&vmovdqa	("xmm4",&QWP(16*12-128,"ebp"));
945e1051a39Sopenharmony_ci	&vmovdqa	("xmm5",&QWP(16*13-128,"ebp"));
946e1051a39Sopenharmony_ci	&vmovdqa	("xmm6",&QWP(16*14-128,"ebp"));
947e1051a39Sopenharmony_ci	&vmovdqa	("xmm7",&QWP(16*15-128,"ebp"));
948e1051a39Sopenharmony_ci	&vpaddd		("xmm4","xmm4",&QWP(16*4,"eax"));	# counter value
949e1051a39Sopenharmony_ci	#&vmovdqa	(&QWP(16*8-128,"ebx"),"xmm0");
950e1051a39Sopenharmony_ci	#&vmovdqa	(&QWP(16*9-128,"ebx"),"xmm1");
951e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*10-128,"ebx"),"xmm2");
952e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*11-128,"ebx"),"xmm3");
953e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*12-128,"ebx"),"xmm4");
954e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*13-128,"ebx"),"xmm5");
955e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*14-128,"ebx"),"xmm6");
956e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*15-128,"ebx"),"xmm7");
957e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value
958e1051a39Sopenharmony_ci
	# Preload the registers the first quarter-round expects.
959e1051a39Sopenharmony_ci	&vmovdqa	($xa, &QWP(16*0-128,"ebp"));
960e1051a39Sopenharmony_ci	&vmovdqa	($xd, "xmm4");
961e1051a39Sopenharmony_ci	&vmovdqa	($xb_,&QWP(16*4-128,"ebp"));
962e1051a39Sopenharmony_ci	&vmovdqa	($xc, &QWP(16*8-128,"ebp"));
963e1051a39Sopenharmony_ci	&vmovdqa	($xc_,&QWP(16*9-128,"ebp"));
964e1051a39Sopenharmony_ci
965e1051a39Sopenharmony_ci	&mov		("edx",10);			# loop counter
966e1051a39Sopenharmony_ci	&nop		();
967e1051a39Sopenharmony_ci
	# Ten passes of eight quarter-rounds each.  The two instructions ahead
	# of the first call are the add/xor that QUARTERROUND_XOP otherwise
	# emits on behalf of the following quarter-round.
968e1051a39Sopenharmony_ci&set_label("loop",32);
969e1051a39Sopenharmony_ci	&vpaddd		($xa,$xa,$xb_);			# elsewhere
970e1051a39Sopenharmony_ci	&vpxor		($xd,$xd,$xa);			# elsewhere
971e1051a39Sopenharmony_ci	&QUARTERROUND_XOP(0, 4, 8, 12, 0);
972e1051a39Sopenharmony_ci	&QUARTERROUND_XOP(1, 5, 9, 13, 1);
973e1051a39Sopenharmony_ci	&QUARTERROUND_XOP(2, 6,10, 14, 2);
974e1051a39Sopenharmony_ci	&QUARTERROUND_XOP(3, 7,11, 15, 3);
975e1051a39Sopenharmony_ci	&QUARTERROUND_XOP(0, 5,10, 15, 4);
976e1051a39Sopenharmony_ci	&QUARTERROUND_XOP(1, 6,11, 12, 5);
977e1051a39Sopenharmony_ci	&QUARTERROUND_XOP(2, 7, 8, 13, 6);
978e1051a39Sopenharmony_ci	&QUARTERROUND_XOP(3, 4, 9, 14, 7);
979e1051a39Sopenharmony_ci	&dec		("edx");
980e1051a39Sopenharmony_ci	&jnz		(&label("loop"));
981e1051a39Sopenharmony_ci
	# Write the words still held in registers back to the working area.
982e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*4-128,"ebx"),$xb_);
983e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*8-128,"ebx"),$xc);
984e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*9-128,"ebx"),$xc_);
985e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*12-128,"ebx"),$xd);
986e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*14-128,"ebx"),$xd_);
987e1051a39Sopenharmony_ci
988e1051a39Sopenharmony_ci    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
989e1051a39Sopenharmony_ci
990e1051a39Sopenharmony_ci	#&vmovdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
991e1051a39Sopenharmony_ci	&vmovdqa	($xa1,&QWP(16*1-128,"ebx"));
992e1051a39Sopenharmony_ci	&vmovdqa	($xa2,&QWP(16*2-128,"ebx"));
993e1051a39Sopenharmony_ci	&vmovdqa	($xa3,&QWP(16*3-128,"ebx"));
994e1051a39Sopenharmony_ci
    # This Perl loop runs at generation time, so the body is emitted four
    # times, once per group of four state words ($i is the byte offset of
    # the group).  Each copy adds the input state, transposes the 4x4 dwords
    # so that every register holds 16 consecutive bytes of one block, and
    # xors them with the input at a 64-byte stride (one block apart).
995e1051a39Sopenharmony_ci    for($i=0;$i<256;$i+=64) {
996e1051a39Sopenharmony_ci	&vpaddd		($xa0,$xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
997e1051a39Sopenharmony_ci	&vpaddd		($xa1,$xa1,&QWP($i+16*1-128,"ebp"));
998e1051a39Sopenharmony_ci	&vpaddd		($xa2,$xa2,&QWP($i+16*2-128,"ebp"));
999e1051a39Sopenharmony_ci	&vpaddd		($xa3,$xa3,&QWP($i+16*3-128,"ebp"));
1000e1051a39Sopenharmony_ci
1001e1051a39Sopenharmony_ci	&vpunpckldq	($xt2,$xa0,$xa1);	# "de-interlace" data
1002e1051a39Sopenharmony_ci	&vpunpckldq	($xt3,$xa2,$xa3);
1003e1051a39Sopenharmony_ci	&vpunpckhdq	($xa0,$xa0,$xa1);
1004e1051a39Sopenharmony_ci	&vpunpckhdq	($xa2,$xa2,$xa3);
1005e1051a39Sopenharmony_ci	&vpunpcklqdq	($xa1,$xt2,$xt3);	# "a0"
1006e1051a39Sopenharmony_ci	&vpunpckhqdq	($xt2,$xt2,$xt3);	# "a1"
1007e1051a39Sopenharmony_ci	&vpunpcklqdq	($xt3,$xa0,$xa2);	# "a2"
1008e1051a39Sopenharmony_ci	&vpunpckhqdq	($xa3,$xa0,$xa2);	# "a3"
1009e1051a39Sopenharmony_ci
1010e1051a39Sopenharmony_ci	&vpxor		($xt0,$xa1,&QWP(64*0-128,$inp));
1011e1051a39Sopenharmony_ci	&vpxor		($xt1,$xt2,&QWP(64*1-128,$inp));
1012e1051a39Sopenharmony_ci	&vpxor		($xt2,$xt3,&QWP(64*2-128,$inp));
1013e1051a39Sopenharmony_ci	&vpxor		($xt3,$xa3,&QWP(64*3-128,$inp));
	# advance by 16 after each of the first three groups and by 256-48 after
	# the last, for a net step of 256 bytes per chunk
1014e1051a39Sopenharmony_ci	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
	# fetch the next group of four working-state words (skipped after the last group)
1015e1051a39Sopenharmony_ci	&vmovdqa	($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
1016e1051a39Sopenharmony_ci	&vmovdqa	($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
1017e1051a39Sopenharmony_ci	&vmovdqa	($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
1018e1051a39Sopenharmony_ci	&vmovdqa	($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
1019e1051a39Sopenharmony_ci	&vmovdqu	(&QWP(64*0-128,$out),$xt0);	# store output
1020e1051a39Sopenharmony_ci	&vmovdqu	(&QWP(64*1-128,$out),$xt1);
1021e1051a39Sopenharmony_ci	&vmovdqu	(&QWP(64*2-128,$out),$xt2);
1022e1051a39Sopenharmony_ci	&vmovdqu	(&QWP(64*3-128,$out),$xt3);
1023e1051a39Sopenharmony_ci	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
1024e1051a39Sopenharmony_ci    }
	# len was biased by -256 on entry, so "no borrow" means at least one
	# more full 256-byte chunk remains.
1025e1051a39Sopenharmony_ci	&sub		($len,64*4);
1026e1051a39Sopenharmony_ci	&jnc		(&label("outer_loop"));
1027e1051a39Sopenharmony_ci
1028e1051a39Sopenharmony_ci	&add		($len,64*4);			# undo the bias: len = bytes left
1029e1051a39Sopenharmony_ci	&jz		(&label("done"));
1030e1051a39Sopenharmony_ci
	# Hand the remaining (fewer than 256) bytes to the 1x path: restore the
	# key and counter pointers, undo the +128 pointer bias and rebuild the
	# counter/nonce row in xmm3.
1031e1051a39Sopenharmony_ci	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
1032e1051a39Sopenharmony_ci	&lea		($inp,&DWP(-128,$inp));
1033e1051a39Sopenharmony_ci	&mov		("edx",&DWP(512+4,"esp"));
1034e1051a39Sopenharmony_ci	&lea		($out,&DWP(-128,$out));
1035e1051a39Sopenharmony_ci
	# lane 0 of the saved counter vector plus 4 is the next block's counter;
	# it replaces dword 0 of the caller's counter/nonce row
1036e1051a39Sopenharmony_ci	&vmovd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
1037e1051a39Sopenharmony_ci	&vmovdqu	("xmm3",&QWP(0,"ebx"));
1038e1051a39Sopenharmony_ci	&vpaddd		("xmm2","xmm2",&QWP(16*6,"eax"));# +four
1039e1051a39Sopenharmony_ci	&vpand		("xmm3","xmm3",&QWP(16*7,"eax"));
1040e1051a39Sopenharmony_ci	&vpor		("xmm3","xmm3","xmm2");		# counter value
1041e1051a39Sopenharmony_ci{
1042e1051a39Sopenharmony_cimy ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
1043e1051a39Sopenharmony_ci
# Emit one ChaCha round for the single-block XOP path.  $a..$d each hold one
# four-dword row of the state, so a single call performs four quarter-rounds
# in parallel.  Every step is "x += y; z ^= x; z <<<= n", with vprotd
# providing the rotate.
sub XOPROUND {
	my @steps=(
		# x   y   z   n
		[$a, $b, $d, 16],
		[$c, $d, $b, 12],
		[$a, $b, $d,  8],
		[$c, $d, $b,  7],
	);

	foreach my $step (@steps) {
		my ($sum,$addend,$mixed,$bits)=@$step;

		&vpaddd		($sum,$sum,$addend);
		&vpxor		($mixed,$mixed,$sum);
		&vprotd		($mixed,$mixed,$bits);
	}
}
1061e1051a39Sopenharmony_ci
1062e1051a39Sopenharmony_ci&set_label("1x");
	# Single-block path: every pass through "loop1x" produces 64 bytes of
	# keystream.  Reached either directly from the prologue (len < 256) or
	# by falling through from the 4x loop; in both cases xmm3 ($d) already
	# holds the counter/nonce row and edx/ebx the key/counter pointers.
1063e1051a39Sopenharmony_ci	&vmovdqa	($a,&QWP(16*2,"eax"));		# sigma
1064e1051a39Sopenharmony_ci	&vmovdqu	($b,&QWP(0,"edx"));		# key bytes 0..15
1065e1051a39Sopenharmony_ci	&vmovdqu	($c,&QWP(16,"edx"));		# key bytes 16..31
1066e1051a39Sopenharmony_ci	#&vmovdqu	($d,&QWP(0,"ebx"));		# already loaded
	# NOTE(review): XOPROUND rotates with vprotd and never references
	# $rot16/$rot24, so these two loads appear to be unused on this path.
1067e1051a39Sopenharmony_ci	&vmovdqa	($rot16,&QWP(0,"eax"));
1068e1051a39Sopenharmony_ci	&vmovdqa	($rot24,&QWP(16,"eax"));
	# NOTE(review): the dword written here is overwritten by the 16-byte
	# store of $d to the same slot just below - looks like a dead store.
1069e1051a39Sopenharmony_ci	&mov		(&DWP(16*3,"esp"),"ebp");
1070e1051a39Sopenharmony_ci
	# Keep a copy of the input state at 0..63(esp): it is added back after
	# the rounds and reloaded for each subsequent block.
1071e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*0,"esp"),$a);
1072e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*1,"esp"),$b);
1073e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*2,"esp"),$c);
1074e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*3,"esp"),$d);
1075e1051a39Sopenharmony_ci	&mov		("edx",10);			# 10 iterations of two rounds each
1076e1051a39Sopenharmony_ci	&jmp		(&label("loop1x"));
1077e1051a39Sopenharmony_ci
	# Entry for the second and later blocks: reload the saved state and bump
	# the counter row by (1,0,0,0), storing the new row back to the stack.
1078e1051a39Sopenharmony_ci&set_label("outer1x",16);
1079e1051a39Sopenharmony_ci	&vmovdqa	($d,&QWP(16*5,"eax"));		# one
1080e1051a39Sopenharmony_ci	&vmovdqa	($a,&QWP(16*0,"esp"));
1081e1051a39Sopenharmony_ci	&vmovdqa	($b,&QWP(16*1,"esp"));
1082e1051a39Sopenharmony_ci	&vmovdqa	($c,&QWP(16*2,"esp"));
1083e1051a39Sopenharmony_ci	&vpaddd		($d,$d,&QWP(16*3,"esp"));
1084e1051a39Sopenharmony_ci	&mov		("edx",10);
1085e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*3,"esp"),$d);
1086e1051a39Sopenharmony_ci	&jmp		(&label("loop1x"));
1087e1051a39Sopenharmony_ci
1088e1051a39Sopenharmony_ci&set_label("loop1x",16);
1089e1051a39Sopenharmony_ci	&XOPROUND();
	# Rotate the lanes of rows $b/$c/$d by 1/2/3 positions so that the next
	# XOPROUND operates on the diagonals of the 4x4 state.
1090e1051a39Sopenharmony_ci	&vpshufd	($c,$c,0b01001110);
1091e1051a39Sopenharmony_ci	&vpshufd	($b,$b,0b00111001);
1092e1051a39Sopenharmony_ci	&vpshufd	($d,$d,0b10010011);
1093e1051a39Sopenharmony_ci
1094e1051a39Sopenharmony_ci	&XOPROUND();
	# Inverse lane rotation: put the rows back into column order.
1095e1051a39Sopenharmony_ci	&vpshufd	($c,$c,0b01001110);
1096e1051a39Sopenharmony_ci	&vpshufd	($b,$b,0b10010011);
1097e1051a39Sopenharmony_ci	&vpshufd	($d,$d,0b00111001);
1098e1051a39Sopenharmony_ci
1099e1051a39Sopenharmony_ci	&dec		("edx");
1100e1051a39Sopenharmony_ci	&jnz		(&label("loop1x"));
1101e1051a39Sopenharmony_ci
	# Add the saved input state to the permuted state to form the keystream.
1102e1051a39Sopenharmony_ci	&vpaddd		($a,$a,&QWP(16*0,"esp"));
1103e1051a39Sopenharmony_ci	&vpaddd		($b,$b,&QWP(16*1,"esp"));
1104e1051a39Sopenharmony_ci	&vpaddd		($c,$c,&QWP(16*2,"esp"));
1105e1051a39Sopenharmony_ci	&vpaddd		($d,$d,&QWP(16*3,"esp"));
1106e1051a39Sopenharmony_ci
1107e1051a39Sopenharmony_ci	&cmp		($len,64);
1108e1051a39Sopenharmony_ci	&jb		(&label("tail"));		# fewer than 64 bytes left
1109e1051a39Sopenharmony_ci
1110e1051a39Sopenharmony_ci	&vpxor		($a,$a,&QWP(16*0,$inp));	# xor with input
1111e1051a39Sopenharmony_ci	&vpxor		($b,$b,&QWP(16*1,$inp));
1112e1051a39Sopenharmony_ci	&vpxor		($c,$c,&QWP(16*2,$inp));
1113e1051a39Sopenharmony_ci	&vpxor		($d,$d,&QWP(16*3,$inp));
1114e1051a39Sopenharmony_ci	&lea		($inp,&DWP(16*4,$inp));		# inp+=64
1115e1051a39Sopenharmony_ci
1116e1051a39Sopenharmony_ci	&vmovdqu	(&QWP(16*0,$out),$a);		# write output
1117e1051a39Sopenharmony_ci	&vmovdqu	(&QWP(16*1,$out),$b);
1118e1051a39Sopenharmony_ci	&vmovdqu	(&QWP(16*2,$out),$c);
1119e1051a39Sopenharmony_ci	&vmovdqu	(&QWP(16*3,$out),$d);
1120e1051a39Sopenharmony_ci	&lea		($out,&DWP(16*4,$out));		# out+=64
1121e1051a39Sopenharmony_ci
1122e1051a39Sopenharmony_ci	&sub		($len,64);
1123e1051a39Sopenharmony_ci	&jnz		(&label("outer1x"));		# more data: next block
1124e1051a39Sopenharmony_ci
1125e1051a39Sopenharmony_ci	&jmp		(&label("done"));
1126e1051a39Sopenharmony_ci
	# Partial final block: spill the 64 keystream bytes to the stack and xor
	# them with the input one byte at a time.
1127e1051a39Sopenharmony_ci&set_label("tail");
1128e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*0,"esp"),$a);
1129e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*1,"esp"),$b);
1130e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*2,"esp"),$c);
1131e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(16*3,"esp"),$d);
1132e1051a39Sopenharmony_ci
1133e1051a39Sopenharmony_ci	&xor		("eax","eax");
1134e1051a39Sopenharmony_ci	&xor		("edx","edx");
1135e1051a39Sopenharmony_ci	&xor		("ebp","ebp");			# ebp = byte index
1136e1051a39Sopenharmony_ci
1137e1051a39Sopenharmony_ci&set_label("tail_loop");
1138e1051a39Sopenharmony_ci	&movb		("al",&BP(0,"esp","ebp"));	# keystream byte
1139e1051a39Sopenharmony_ci	&movb		("dl",&BP(0,$inp,"ebp"));	# input byte
1140e1051a39Sopenharmony_ci	&lea		("ebp",&DWP(1,"ebp"));		# index++
1141e1051a39Sopenharmony_ci	&xor		("al","dl");
1142e1051a39Sopenharmony_ci	&movb		(&BP(-1,$out,"ebp"),"al");	# -1 compensates for the index++ above
1143e1051a39Sopenharmony_ci	&dec		($len);
1144e1051a39Sopenharmony_ci	&jnz		(&label("tail_loop"));
1145e1051a39Sopenharmony_ci}
# Common exit of ChaCha20_xop: reached from the 4x loop, the 1x loop and by
# falling out of the byte-wise tail loop.
1146e1051a39Sopenharmony_ci&set_label("done");
1147e1051a39Sopenharmony_ci	&vzeroupper	();
	# reload the entry value of esp that the prologue stored at 512(esp)
1148e1051a39Sopenharmony_ci	&mov		("esp",&DWP(512,"esp"));
1149e1051a39Sopenharmony_ci&function_end("ChaCha20_xop");
1150e1051a39Sopenharmony_ci}
1151e1051a39Sopenharmony_ci
1152e1051a39Sopenharmony_ci&asm_finish();
1153e1051a39Sopenharmony_ci
1154e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";
1155