1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2009-2022 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci#
17e1051a39Sopenharmony_ci# This module implements support for Intel AES-NI extension. In
18e1051a39Sopenharmony_ci# OpenSSL context it's used with Intel engine, but can also be used as
19e1051a39Sopenharmony_ci# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
20e1051a39Sopenharmony_ci# details].
21e1051a39Sopenharmony_ci#
22e1051a39Sopenharmony_ci# Performance.
23e1051a39Sopenharmony_ci#
24e1051a39Sopenharmony_ci# To start with see corresponding paragraph in aesni-x86_64.pl...
25e1051a39Sopenharmony_ci# Instead of filling table similar to one found there I've chosen to
26e1051a39Sopenharmony_ci# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27e1051a39Sopenharmony_ci# The simplified table below represents 32-bit performance relative
28e1051a39Sopenharmony_ci# to 64-bit one in every given point. Ratios vary for different
29e1051a39Sopenharmony_ci# encryption modes, therefore interval values.
30e1051a39Sopenharmony_ci#
31e1051a39Sopenharmony_ci#	16-byte     64-byte     256-byte    1-KB        8-KB
32e1051a39Sopenharmony_ci#	53-67%      67-84%      91-94%      95-98%      97-99.5%
33e1051a39Sopenharmony_ci#
34e1051a39Sopenharmony_ci# Lower ratios for smaller block sizes are perfectly understandable,
35e1051a39Sopenharmony_ci# because function call overhead is higher in 32-bit mode. Largest
36e1051a39Sopenharmony_ci# 8-KB block performance is virtually same: 32-bit code is less than
37e1051a39Sopenharmony_ci# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
38e1051a39Sopenharmony_ci
39e1051a39Sopenharmony_ci# January 2011
40e1051a39Sopenharmony_ci#
41e1051a39Sopenharmony_ci# See aesni-x86_64.pl for details. Unlike x86_64 version this module
42e1051a39Sopenharmony_ci# interleaves at most 6 aes[enc|dec] instructions, because there are
43e1051a39Sopenharmony_ci# not enough registers for 8x interleave [which should be optimal for
44e1051a39Sopenharmony_ci# Sandy Bridge]. Actually, performance results for 6x interleave
45e1051a39Sopenharmony_ci# factor presented in aesni-x86_64.pl (except for CTR) are for this
46e1051a39Sopenharmony_ci# module.
47e1051a39Sopenharmony_ci
48e1051a39Sopenharmony_ci# April 2011
49e1051a39Sopenharmony_ci#
50e1051a39Sopenharmony_ci# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51e1051a39Sopenharmony_ci# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
52e1051a39Sopenharmony_ci
53e1051a39Sopenharmony_ci# November 2015
54e1051a39Sopenharmony_ci#
55e1051a39Sopenharmony_ci# Add aesni_ocb_[en|de]crypt.
56e1051a39Sopenharmony_ci
57e1051a39Sopenharmony_ci######################################################################
58e1051a39Sopenharmony_ci# Current large-block performance in cycles per byte processed with
59e1051a39Sopenharmony_ci# 128-bit key (less is better).
60e1051a39Sopenharmony_ci#
61e1051a39Sopenharmony_ci#		CBC en-/decrypt	CTR	XTS	ECB	OCB
62e1051a39Sopenharmony_ci# Westmere	3.77/1.37	1.37	1.52	1.27
63e1051a39Sopenharmony_ci# * Bridge	5.07/0.98	0.99	1.09	0.91	1.10
64e1051a39Sopenharmony_ci# Haswell	4.44/0.80	0.97	1.03	0.72	0.76
65e1051a39Sopenharmony_ci# Skylake	2.68/0.65	0.65	0.66	0.64	0.66
66e1051a39Sopenharmony_ci# Silvermont	5.77/3.56	3.67	4.03	3.46	4.03
67e1051a39Sopenharmony_ci# Goldmont	3.84/1.39	1.39	1.63	1.31	1.70
68e1051a39Sopenharmony_ci# Bulldozer	5.80/0.98	1.05	1.24	0.93	1.23
69e1051a39Sopenharmony_ci
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

# Make the directory holding this script, and ../../perlasm relative to
# it, searchable so that the x86 perlasm framework can be loaded.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, when present, names the output file.
# The three-argument form of open takes that name literally, and a
# failure to open it is fatal: otherwise the generated assembly would
# silently go to whatever STDOUT was inherited from the caller.
if ($output = pop)
{   open(STDOUT,">",$output) or die "can't open $output: $!";	}

# First remaining argument is handed to the framework (presumably the
# target assembler flavour -- see x86asm.pl).
&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Round keys are always fetched with movups, whichever $PREFIX flavour
# is being generated.
$movekey=\&movups;

# General-purpose register roles.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

# XMM register roles.  $in1, $in0 and $ivec are alternative names for
# the registers that also serve as $inout3, $inout4 and $inout5.
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";	$ivec="xmm7";
106e1051a39Sopenharmony_ci# AESNI extension
# aeskeygenassist(dst,src,imm): emit the instruction as raw bytes
# 66 0F 3A DF <modrm> <imm>.  Both operands must be xmm0..xmm7; for any
# other operand pair nothing is emitted.
sub aeskeygenassist
{ my ($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	my ($reg,$rm)=($1,$2);
	my $modrm=0xc0|($reg<<3)|$rm;	# register-direct form: dst in bits 5:3, src in bits 2:0
	&data_byte(0x66,0x0f,0x3a,0xdf,$modrm,$imm);
    }
}
# aescommon(opcodelet,dst,src): shared encoder for the two-operand AES
# instructions.  Emits the raw bytes 66 0F 38 <opcodelet> <modrm>.  Both
# operands must be xmm0..xmm7; for any other operand pair nothing is
# emitted.
sub aescommon
{ my ($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	my ($reg,$rm)=($1,$2);
	my $modrm=0xc0|($reg<<3)|$rm;	# register-direct form: dst in bits 5:3, src in bits 2:0
	&data_byte(0x66,0x0f,0x38,$opcodelet,$modrm);
    }
}
# One thin wrapper per instruction: each forwards its operands to
# aescommon() together with the opcode byte that selects the instruction.
sub aesimc
{ my @operands=@_; return &aescommon(0xdb,@operands); }
sub aesenc
{ my @operands=@_; return &aescommon(0xdc,@operands); }
sub aesenclast
{ my @operands=@_; return &aescommon(0xdd,@operands); }
sub aesdec
{ my @operands=@_; return &aescommon(0xde,@operands); }
sub aesdeclast
{ my @operands=@_; return &aescommon(0xdf,@operands); }
122e1051a39Sopenharmony_ci
123e1051a39Sopenharmony_ci# Inline version of internal aesni_[en|de]crypt1
{ my $sn;	# bumped on every expansion so that each inlined loop gets a unique label

# aesni_inline_generate1(p[,inout[,ivec]])
#
# Emit, in place, a one-block en-/decryption loop.
#   p     - "enc" or "dec"; selects aes${p} and aes${p}last
#   inout - register holding the block, defaults to $inout0
#   ivec  - optional register; when given, round key 0 is xor-ed into it
#           and the result is xor-ed into inout, instead of xor-ing
#           round key 0 into inout directly
# The emitted code advances $key and counts $rounds down, so both
# registers are modified by it.
#
# aes${p}/aes${p}last are resolved to code references once, rather than
# through a string eval per instruction, which re-parsed source on every
# call and discarded any error left in $@.
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  my $loop="${p}1_loop_".(++$sn);

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    if (defined($ivec))
    {	&xorps		($ivec,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&xorps		($inout,$ivec);
    }
    else
    {	&lea		($key,&DWP(32,$key));
	&xorps		($inout,$rndkey0);
    }
    &set_label($loop);
	&$aes		($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label($loop));
    &$aeslast		($inout,$rndkey1);
}}
143e1051a39Sopenharmony_ci
# aesni_generate1(p[,inout])	# fully unrolled loop
#
# Emit the private function _aesni_${p}rypt1, which processes the single
# block held in inout (default $inout0).
#   p - "enc" or "dec"; selects aes${p} and aes${p}last
# The emitted code compares $rounds with 11: below 11 it jumps straight
# to the ${p}128 entry, equal to 11 it jumps to ${p}192, otherwise it
# falls through two extra rounds first.  $key is advanced on the way, so
# it is modified by the emitted function.
#
# aes${p}/aes${p}last are resolved to code references once, rather than
# through a string eval per instruction, which re-parsed source on every
# call and discarded any error left in $@.
sub aesni_generate1
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

  # One round with round-key register $rk, after which $rk is reloaded
  # from offset $off relative to $key.
  my $step=sub
  { my ($rk,$off)=@_;
	&$aes		($inout,$rk);
	&$movekey	($rk,&QWP($off,$key));
  };

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	&$step		($rndkey1,-0x40);
	&$step		($rndkey0,-0x30);
    &set_label("${p}192");
	&$step		($rndkey1,-0x20);
	&$step		($rndkey0,-0x10);
    &set_label("${p}128");
	# eight rounds alternating between the two round-key registers,
	# reloading each one from offsets 0x00..0x70 as soon as it is used
	foreach my $i (0..7)
	{   &$step	(($rndkey1,$rndkey0)[$i&1],0x10*$i);	}
	&$aes		($inout,$rndkey1);
    &$aeslast		($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
189e1051a39Sopenharmony_ci
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry points.  The two functions differ only in direction,
# so both are produced by one pass over ("enc","dec").  When $inline is
# off, the corresponding _aesni_[en|de]crypt1 helper is emitted just
# ahead of each function and reached with a call instead.
foreach my $way ("enc","dec")
{ my $entry="${PREFIX}_${way}rypt";

    &aesni_generate1($way) if (!$inline);
    &function_begin_B($entry);
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));	# round count is read from offset 240 of the key structure
	&mov	("eax",&wparam(1));		# out
	if ($inline)
	{   &aesni_inline_generate1($way);	}
	else
	{   &call	("_aesni_${way}rypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
    &function_end_B($entry);
}
227e1051a39Sopenharmony_ci
# _aesni_[en|de]cryptN are private interfaces; N denotes the interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, the instruction could be
# scheduled only every *2nd* cycle. Thus 3x interleave was the one
# providing optimal utilization, i.e. the subroutine's throughput was
# virtually the same as that of the non-interleaved subroutine [for up
# to 3 input blocks]. This is why it originally made no sense to
# implement a 2x subroutine. But times change, and it became appropriate
# to spend an extra 192 bytes on a 2x subroutine on account of Atom
# Silvermont. For processors that can schedule aes[enc|dec] every cycle
# the optimal interleave factor equals the corresponding instruction's
# latency. 8x is optimal for * Bridge, but it is infeasible to
# accommodate such an implementation in the XMM registers addressable in
# 32-bit mode, and therefore a maximum of 6x is used instead...
# aesni_generate2(p)
#
# Emit the private function _aesni_${p}rypt2, which runs $inout0 and
# $inout1 through the cipher with 2x interleave.
#   p - "enc" or "dec"; selects aes${p} and aes${p}last
# The emitted code scales $rounds by 16, points $key past the rounds it
# will consume and then walks the schedule with $rounds as a negative
# index that is stepped by 32 until it reaches zero.  $key and $rounds
# are therefore modified by the emitted function.
#
# aes${p}/aes${p}last are resolved to code references once, rather than
# through a string eval per instruction, which re-parsed source on every
# call and discarded any error left in $@.
sub aesni_generate2
{ my $p=shift;
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blocks=($inout0,$inout1);
  my $loop="${p}2_loop";

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label($loop);
	foreach my $blk (@blocks) { &$aes($blk,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);		# last flag-setting step before the jnz below
	foreach my $blk (@blocks) { &$aes($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label($loop));
    foreach my $blk (@blocks) { &$aes($blk,$rndkey1); }
    foreach my $blk (@blocks) { &$aeslast($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}
273e1051a39Sopenharmony_ci
# aesni_generate3(p)
#
# Emit the private function _aesni_${p}rypt3, which runs $inout0,
# $inout1 and $inout2 through the cipher with 3x interleave.
#   p - "enc" or "dec"; selects aes${p} and aes${p}last
# The emitted code scales $rounds by 16, points $key past the rounds it
# will consume and then walks the schedule with $rounds as a negative
# index that is stepped by 32 until it reaches zero.  $key and $rounds
# are therefore modified by the emitted function.
#
# aes${p}/aes${p}last are resolved to code references once, rather than
# through a string eval per instruction, which re-parsed source on every
# call and discarded any error left in $@.
sub aesni_generate3
{ my $p=shift;
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blocks=($inout0,$inout1,$inout2);
  my $loop="${p}3_loop";

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label($loop);
	foreach my $blk (@blocks) { &$aes($blk,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);		# last flag-setting step before the jnz below
	foreach my $blk (@blocks) { &$aes($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label($loop));
    foreach my $blk (@blocks) { &$aes($blk,$rndkey1); }
    foreach my $blk (@blocks) { &$aeslast($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
309e1051a39Sopenharmony_ci
310e1051a39Sopenharmony_ci# 4x interleave is implemented to improve small block performance,
311e1051a39Sopenharmony_ci# most notably [and naturally] 4 block by ~30%. One can argue that one
312e1051a39Sopenharmony_ci# should have implemented 5x as well, but improvement  would be <20%,
313e1051a39Sopenharmony_ci# so it's not worth it...
# aesni_generate4(p)
#
# Emit the private function _aesni_${p}rypt4, which runs $inout0 ..
# $inout3 through the cipher with 4x interleave.
#   p - "enc" or "dec"; selects aes${p} and aes${p}last
# The emitted code scales $rounds by 16, points $key past the rounds it
# will consume and then walks the schedule with $rounds as a negative
# index that is stepped by 32 until it reaches zero.  $key and $rounds
# are therefore modified by the emitted function.
#
# aes${p}/aes${p}last are resolved to code references once, rather than
# through a string eval per instruction, which re-parsed source on every
# call and discarded any error left in $@.
sub aesni_generate4
{ my $p=shift;
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blocks=($inout0,$inout1,$inout2,$inout3);
  my $loop="${p}4_loop";

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&data_byte	(0x0f,0x1f,0x40,0x00);	# four raw bytes: the 4-byte NOP encoding, presumably padding ahead of the loop
	&add		($rounds,16);

    &set_label($loop);
	foreach my $blk (@blocks) { &$aes($blk,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);		# last flag-setting step before the jnz below
	foreach my $blk (@blocks) { &$aes($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label($loop));

    foreach my $blk (@blocks) { &$aes($blk,$rndkey1); }
    foreach my $blk (@blocks) { &$aeslast($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
356e1051a39Sopenharmony_ci
# aesni_generate6(p)
#
# Emit the private function _aesni_${p}rypt6, which runs $inout0 ..
# $inout5 through the cipher with 6x interleave.
#   p - "enc" or "dec"; selects aes${p} and aes${p}last
# The prologue already issues the first round for $inout0..$inout2
# while the remaining blocks are still being xor-ed with round key 0,
# then jumps to the _inner label in the middle of the loop body so that
# only $inout3..$inout5 receive that round there.  The _enter label is
# declared with static_label, presumably so that code outside this
# function can refer to it -- no such reference is visible in this
# section.  $key and $rounds are modified by the emitted function.
#
# aes${p}/aes${p}last are resolved to code references once, rather than
# through a string eval per instruction, which re-parsed source on every
# call and discarded any error left in $@.
sub aesni_generate6
{ my $p=shift;
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blocks=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);
  my $loop="${p}6_loop";
  my $inner="_aesni_${p}rypt6_inner";
  my $enter="_aesni_${p}rypt6_enter";

    &function_begin_B("_aesni_${p}rypt6");
    &static_label($enter);
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	&$aes		($inout0,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&$aes		($inout1,$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&$aes		($inout2,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label($inner));

    &set_label($loop,16);
	foreach my $blk (@blocks[0..2]) { &$aes($blk,$rndkey1); }
    &set_label($inner);
	foreach my $blk (@blocks[3..5]) { &$aes($blk,$rndkey1); }
    &set_label($enter);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);		# last flag-setting step before the jnz below
	foreach my $blk (@blocks) { &$aes($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label($loop));

    foreach my $blk (@blocks) { &$aes($blk,$rndkey1); }
    foreach my $blk (@blocks) { &$aeslast($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Emit the interleaved helpers in order of interleave factor.  The
# decrypt flavour is always produced; the encrypt flavour only when
# $PREFIX is "aesni".
foreach my $generate (\&aesni_generate2,\&aesni_generate3,
		      \&aesni_generate4,\&aesni_generate6)
{   &$generate("enc") if ($PREFIX eq "aesni");
    &$generate("dec");
}
423e1051a39Sopenharmony_ci
424e1051a39Sopenharmony_ciif ($PREFIX eq "aesni") {
425e1051a39Sopenharmony_ci######################################################################
426e1051a39Sopenharmony_ci# void aesni_ecb_encrypt (const void *in, void *out,
427e1051a39Sopenharmony_ci#                         size_t length, const AES_KEY *key,
428e1051a39Sopenharmony_ci#                         int enc);
429e1051a39Sopenharmony_ci&function_begin("aesni_ecb_encrypt");
430e1051a39Sopenharmony_ci	&mov	($inp,&wparam(0));
431e1051a39Sopenharmony_ci	&mov	($out,&wparam(1));
432e1051a39Sopenharmony_ci	&mov	($len,&wparam(2));
433e1051a39Sopenharmony_ci	&mov	($key,&wparam(3));
434e1051a39Sopenharmony_ci	&mov	($rounds_,&wparam(4));
435e1051a39Sopenharmony_ci	&and	($len,-16);
436e1051a39Sopenharmony_ci	&jz	(&label("ecb_ret"));
437e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
438e1051a39Sopenharmony_ci	&test	($rounds_,$rounds_);
439e1051a39Sopenharmony_ci	&jz	(&label("ecb_decrypt"));
440e1051a39Sopenharmony_ci
441e1051a39Sopenharmony_ci	&mov	($key_,$key);		# backup $key
442e1051a39Sopenharmony_ci	&mov	($rounds_,$rounds);	# backup $rounds
443e1051a39Sopenharmony_ci	&cmp	($len,0x60);
444e1051a39Sopenharmony_ci	&jb	(&label("ecb_enc_tail"));
445e1051a39Sopenharmony_ci
446e1051a39Sopenharmony_ci	&movdqu	($inout0,&QWP(0,$inp));
447e1051a39Sopenharmony_ci	&movdqu	($inout1,&QWP(0x10,$inp));
448e1051a39Sopenharmony_ci	&movdqu	($inout2,&QWP(0x20,$inp));
449e1051a39Sopenharmony_ci	&movdqu	($inout3,&QWP(0x30,$inp));
450e1051a39Sopenharmony_ci	&movdqu	($inout4,&QWP(0x40,$inp));
451e1051a39Sopenharmony_ci	&movdqu	($inout5,&QWP(0x50,$inp));
452e1051a39Sopenharmony_ci	&lea	($inp,&DWP(0x60,$inp));
453e1051a39Sopenharmony_ci	&sub	($len,0x60);
454e1051a39Sopenharmony_ci	&jmp	(&label("ecb_enc_loop6_enter"));
455e1051a39Sopenharmony_ci
456e1051a39Sopenharmony_ci&set_label("ecb_enc_loop6",16);
457e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
458e1051a39Sopenharmony_ci	&movdqu	($inout0,&QWP(0,$inp));
459e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
460e1051a39Sopenharmony_ci	&movdqu	($inout1,&QWP(0x10,$inp));
461e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
462e1051a39Sopenharmony_ci	&movdqu	($inout2,&QWP(0x20,$inp));
463e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
464e1051a39Sopenharmony_ci	&movdqu	($inout3,&QWP(0x30,$inp));
465e1051a39Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
466e1051a39Sopenharmony_ci	&movdqu	($inout4,&QWP(0x40,$inp));
467e1051a39Sopenharmony_ci	&movups	(&QWP(0x50,$out),$inout5);
468e1051a39Sopenharmony_ci	&lea	($out,&DWP(0x60,$out));
469e1051a39Sopenharmony_ci	&movdqu	($inout5,&QWP(0x50,$inp));
470e1051a39Sopenharmony_ci	&lea	($inp,&DWP(0x60,$inp));
471e1051a39Sopenharmony_ci&set_label("ecb_enc_loop6_enter");
472e1051a39Sopenharmony_ci
473e1051a39Sopenharmony_ci	&call	("_aesni_encrypt6");
474e1051a39Sopenharmony_ci
475e1051a39Sopenharmony_ci	&mov	($key,$key_);		# restore $key
476e1051a39Sopenharmony_ci	&mov	($rounds,$rounds_);	# restore $rounds
477e1051a39Sopenharmony_ci	&sub	($len,0x60);
478e1051a39Sopenharmony_ci	&jnc	(&label("ecb_enc_loop6"));
479e1051a39Sopenharmony_ci
480e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
481e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
482e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
483e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
484e1051a39Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
485e1051a39Sopenharmony_ci	&movups	(&QWP(0x50,$out),$inout5);
486e1051a39Sopenharmony_ci	&lea	($out,&DWP(0x60,$out));
487e1051a39Sopenharmony_ci	&add	($len,0x60);
488e1051a39Sopenharmony_ci	&jz	(&label("ecb_ret"));
489e1051a39Sopenharmony_ci
490e1051a39Sopenharmony_ci&set_label("ecb_enc_tail");
491e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));
492e1051a39Sopenharmony_ci	&cmp	($len,0x20);
493e1051a39Sopenharmony_ci	&jb	(&label("ecb_enc_one"));
494e1051a39Sopenharmony_ci	&movups	($inout1,&QWP(0x10,$inp));
495e1051a39Sopenharmony_ci	&je	(&label("ecb_enc_two"));
496e1051a39Sopenharmony_ci	&movups	($inout2,&QWP(0x20,$inp));
497e1051a39Sopenharmony_ci	&cmp	($len,0x40);
498e1051a39Sopenharmony_ci	&jb	(&label("ecb_enc_three"));
499e1051a39Sopenharmony_ci	&movups	($inout3,&QWP(0x30,$inp));
500e1051a39Sopenharmony_ci	&je	(&label("ecb_enc_four"));
501e1051a39Sopenharmony_ci	&movups	($inout4,&QWP(0x40,$inp));
502e1051a39Sopenharmony_ci	&xorps	($inout5,$inout5);
503e1051a39Sopenharmony_ci	&call	("_aesni_encrypt6");
504e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
505e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
506e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
507e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
508e1051a39Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
509e1051a39Sopenharmony_ci	jmp	(&label("ecb_ret"));
510e1051a39Sopenharmony_ci
511e1051a39Sopenharmony_ci&set_label("ecb_enc_one",16);
512e1051a39Sopenharmony_ci	if ($inline)
513e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
514e1051a39Sopenharmony_ci	else
515e1051a39Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
516e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
517e1051a39Sopenharmony_ci	&jmp	(&label("ecb_ret"));
518e1051a39Sopenharmony_ci
519e1051a39Sopenharmony_ci&set_label("ecb_enc_two",16);
520e1051a39Sopenharmony_ci	&call	("_aesni_encrypt2");
521e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
522e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
523e1051a39Sopenharmony_ci	&jmp	(&label("ecb_ret"));
524e1051a39Sopenharmony_ci
525e1051a39Sopenharmony_ci&set_label("ecb_enc_three",16);
526e1051a39Sopenharmony_ci	&call	("_aesni_encrypt3");
527e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
528e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
529e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
530e1051a39Sopenharmony_ci	&jmp	(&label("ecb_ret"));
531e1051a39Sopenharmony_ci
532e1051a39Sopenharmony_ci&set_label("ecb_enc_four",16);
533e1051a39Sopenharmony_ci	&call	("_aesni_encrypt4");
534e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
535e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
536e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
537e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
538e1051a39Sopenharmony_ci	&jmp	(&label("ecb_ret"));
539e1051a39Sopenharmony_ci######################################################################
540e1051a39Sopenharmony_ci&set_label("ecb_decrypt",16);
541e1051a39Sopenharmony_ci	&mov	($key_,$key);		# backup $key
542e1051a39Sopenharmony_ci	&mov	($rounds_,$rounds);	# backup $rounds
543e1051a39Sopenharmony_ci	&cmp	($len,0x60);
544e1051a39Sopenharmony_ci	&jb	(&label("ecb_dec_tail"));
545e1051a39Sopenharmony_ci
546e1051a39Sopenharmony_ci	&movdqu	($inout0,&QWP(0,$inp));
547e1051a39Sopenharmony_ci	&movdqu	($inout1,&QWP(0x10,$inp));
548e1051a39Sopenharmony_ci	&movdqu	($inout2,&QWP(0x20,$inp));
549e1051a39Sopenharmony_ci	&movdqu	($inout3,&QWP(0x30,$inp));
550e1051a39Sopenharmony_ci	&movdqu	($inout4,&QWP(0x40,$inp));
551e1051a39Sopenharmony_ci	&movdqu	($inout5,&QWP(0x50,$inp));
552e1051a39Sopenharmony_ci	&lea	($inp,&DWP(0x60,$inp));
553e1051a39Sopenharmony_ci	&sub	($len,0x60);
554e1051a39Sopenharmony_ci	&jmp	(&label("ecb_dec_loop6_enter"));
555e1051a39Sopenharmony_ci
556e1051a39Sopenharmony_ci&set_label("ecb_dec_loop6",16);
557e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
558e1051a39Sopenharmony_ci	&movdqu	($inout0,&QWP(0,$inp));
559e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
560e1051a39Sopenharmony_ci	&movdqu	($inout1,&QWP(0x10,$inp));
561e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
562e1051a39Sopenharmony_ci	&movdqu	($inout2,&QWP(0x20,$inp));
563e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
564e1051a39Sopenharmony_ci	&movdqu	($inout3,&QWP(0x30,$inp));
565e1051a39Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
566e1051a39Sopenharmony_ci	&movdqu	($inout4,&QWP(0x40,$inp));
567e1051a39Sopenharmony_ci	&movups	(&QWP(0x50,$out),$inout5);
568e1051a39Sopenharmony_ci	&lea	($out,&DWP(0x60,$out));
569e1051a39Sopenharmony_ci	&movdqu	($inout5,&QWP(0x50,$inp));
570e1051a39Sopenharmony_ci	&lea	($inp,&DWP(0x60,$inp));
571e1051a39Sopenharmony_ci&set_label("ecb_dec_loop6_enter");
572e1051a39Sopenharmony_ci
573e1051a39Sopenharmony_ci	&call	("_aesni_decrypt6");
574e1051a39Sopenharmony_ci
575e1051a39Sopenharmony_ci	&mov	($key,$key_);		# restore $key
576e1051a39Sopenharmony_ci	&mov	($rounds,$rounds_);	# restore $rounds
577e1051a39Sopenharmony_ci	&sub	($len,0x60);
578e1051a39Sopenharmony_ci	&jnc	(&label("ecb_dec_loop6"));
579e1051a39Sopenharmony_ci
580e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
581e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
582e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
583e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
584e1051a39Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
585e1051a39Sopenharmony_ci	&movups	(&QWP(0x50,$out),$inout5);
586e1051a39Sopenharmony_ci	&lea	($out,&DWP(0x60,$out));
587e1051a39Sopenharmony_ci	&add	($len,0x60);
588e1051a39Sopenharmony_ci	&jz	(&label("ecb_ret"));
589e1051a39Sopenharmony_ci
590e1051a39Sopenharmony_ci&set_label("ecb_dec_tail");
591e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));
592e1051a39Sopenharmony_ci	&cmp	($len,0x20);
593e1051a39Sopenharmony_ci	&jb	(&label("ecb_dec_one"));
594e1051a39Sopenharmony_ci	&movups	($inout1,&QWP(0x10,$inp));
595e1051a39Sopenharmony_ci	&je	(&label("ecb_dec_two"));
596e1051a39Sopenharmony_ci	&movups	($inout2,&QWP(0x20,$inp));
597e1051a39Sopenharmony_ci	&cmp	($len,0x40);
598e1051a39Sopenharmony_ci	&jb	(&label("ecb_dec_three"));
599e1051a39Sopenharmony_ci	&movups	($inout3,&QWP(0x30,$inp));
600e1051a39Sopenharmony_ci	&je	(&label("ecb_dec_four"));
601e1051a39Sopenharmony_ci	&movups	($inout4,&QWP(0x40,$inp));
602e1051a39Sopenharmony_ci	&xorps	($inout5,$inout5);
603e1051a39Sopenharmony_ci	&call	("_aesni_decrypt6");
604e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
605e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
606e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
607e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
608e1051a39Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
609e1051a39Sopenharmony_ci	&jmp	(&label("ecb_ret"));
610e1051a39Sopenharmony_ci
611e1051a39Sopenharmony_ci&set_label("ecb_dec_one",16);
612e1051a39Sopenharmony_ci	if ($inline)
613e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
614e1051a39Sopenharmony_ci	else
615e1051a39Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
616e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
617e1051a39Sopenharmony_ci	&jmp	(&label("ecb_ret"));
618e1051a39Sopenharmony_ci
619e1051a39Sopenharmony_ci&set_label("ecb_dec_two",16);
620e1051a39Sopenharmony_ci	&call	("_aesni_decrypt2");
621e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
622e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
623e1051a39Sopenharmony_ci	&jmp	(&label("ecb_ret"));
624e1051a39Sopenharmony_ci
625e1051a39Sopenharmony_ci&set_label("ecb_dec_three",16);
626e1051a39Sopenharmony_ci	&call	("_aesni_decrypt3");
627e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
628e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
629e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
630e1051a39Sopenharmony_ci	&jmp	(&label("ecb_ret"));
631e1051a39Sopenharmony_ci
632e1051a39Sopenharmony_ci&set_label("ecb_dec_four",16);
633e1051a39Sopenharmony_ci	&call	("_aesni_decrypt4");
634e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
635e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
636e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
637e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
638e1051a39Sopenharmony_ci
639e1051a39Sopenharmony_ci&set_label("ecb_ret");
640e1051a39Sopenharmony_ci	&pxor	("xmm0","xmm0");		# clear register bank
641e1051a39Sopenharmony_ci	&pxor	("xmm1","xmm1");
642e1051a39Sopenharmony_ci	&pxor	("xmm2","xmm2");
643e1051a39Sopenharmony_ci	&pxor	("xmm3","xmm3");
644e1051a39Sopenharmony_ci	&pxor	("xmm4","xmm4");
645e1051a39Sopenharmony_ci	&pxor	("xmm5","xmm5");
646e1051a39Sopenharmony_ci	&pxor	("xmm6","xmm6");
647e1051a39Sopenharmony_ci	&pxor	("xmm7","xmm7");
648e1051a39Sopenharmony_ci&function_end("aesni_ecb_encrypt");
649e1051a39Sopenharmony_ci
650e1051a39Sopenharmony_ci######################################################################
651e1051a39Sopenharmony_ci# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
652e1051a39Sopenharmony_ci#                         size_t blocks, const AES_KEY *key,
653e1051a39Sopenharmony_ci#                         const char *ivec,char *cmac);
654e1051a39Sopenharmony_ci#
655e1051a39Sopenharmony_ci# Handles only complete blocks, operates on 64-bit counter and
656e1051a39Sopenharmony_ci# does not update *ivec! Nor does it finalize CMAC value
657e1051a39Sopenharmony_ci# (see engine/eng_aesni.c for details)
658e1051a39Sopenharmony_ci#
659e1051a39Sopenharmony_ci{ my $cmac=$inout1;
# aesni_ccm64_encrypt_blocks: emits the CCM encryption loop.  For every
# 16-byte block two AES encryptions are run interleaved and share each
# round-key load: one on the counter block ($inout0, the keystream) and one
# on the running CMAC ($cmac, an alias of $inout1).
#
# Arguments are fetched with &wparam(0..5): input pointer, output pointer,
# block count, key schedule, ivec pointer, cmac pointer.
#
# Aligned scratch frame built below:
#	0	pshufb byte-swap mask
#	16	counter increment vector (dwords 1,0,0,0)
#	48	saved %esp
660e1051a39Sopenharmony_ci&function_begin("aesni_ccm64_encrypt_blocks");
661e1051a39Sopenharmony_ci	&mov	($inp,&wparam(0));
662e1051a39Sopenharmony_ci	&mov	($out,&wparam(1));
663e1051a39Sopenharmony_ci	&mov	($len,&wparam(2));
664e1051a39Sopenharmony_ci	&mov	($key,&wparam(3));
	# $rounds_ and $rounds temporarily carry the ivec and cmac pointers
	# until both vectors have been loaded.
665e1051a39Sopenharmony_ci	&mov	($rounds_,&wparam(4));
666e1051a39Sopenharmony_ci	&mov	($rounds,&wparam(5));
	# $key_ temporarily carries the caller's %esp so it can be parked in
	# the aligned frame.
667e1051a39Sopenharmony_ci	&mov	($key_,"esp");
668e1051a39Sopenharmony_ci	&sub	("esp",60);
669e1051a39Sopenharmony_ci	&and	("esp",-16);			# align stack
670e1051a39Sopenharmony_ci	&mov	(&DWP(48,"esp"),$key_);
671e1051a39Sopenharmony_ci
672e1051a39Sopenharmony_ci	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
673e1051a39Sopenharmony_ci	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	# from here on $rounds holds the dword stored at offset 240 of the key
	# schedule (the round count)
674e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
675e1051a39Sopenharmony_ci
676e1051a39Sopenharmony_ci	# compose byte-swap control mask for pshufb on stack
677e1051a39Sopenharmony_ci	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
678e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esp"),0x08090a0b);
679e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esp"),0x04050607);
680e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esp"),0x00010203);
681e1051a39Sopenharmony_ci
682e1051a39Sopenharmony_ci	# compose counter increment vector on stack
683e1051a39Sopenharmony_ci	&mov	($rounds_,1);
684e1051a39Sopenharmony_ci	&xor	($key_,$key_);
685e1051a39Sopenharmony_ci	&mov	(&DWP(16,"esp"),$rounds_);
686e1051a39Sopenharmony_ci	&mov	(&DWP(20,"esp"),$key_);
687e1051a39Sopenharmony_ci	&mov	(&DWP(24,"esp"),$key_);
688e1051a39Sopenharmony_ci	&mov	(&DWP(28,"esp"),$key_);
689e1051a39Sopenharmony_ci
	# "Twisted" round-key addressing: $key is advanced to
	# key+32+16*rounds and $rounds_ becomes 16-16*rounds, a negative byte
	# offset that the inner loop steps by 32 until it reaches zero.
	# $key_ keeps the start of the schedule for the first three round keys.
690e1051a39Sopenharmony_ci	&shl	($rounds,4);
691e1051a39Sopenharmony_ci	&mov	($rounds_,16);
692e1051a39Sopenharmony_ci	&lea	($key_,&DWP(0,$key));
693e1051a39Sopenharmony_ci	&movdqa	($inout3,&QWP(0,"esp"));
	# $inout0 receives the ivec in its original byte order (first counter
	# block); $ivec itself is byte-swapped next so paddq can increment it.
694e1051a39Sopenharmony_ci	&movdqa	($inout0,$ivec);
695e1051a39Sopenharmony_ci	&lea	($key,&DWP(32,$key,$rounds));
696e1051a39Sopenharmony_ci	&sub	($rounds_,$rounds);
697e1051a39Sopenharmony_ci	&pshufb	($ivec,$inout3);
698e1051a39Sopenharmony_ci
	# One iteration per block.
699e1051a39Sopenharmony_ci&set_label("ccm64_enc_outer");
700e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(0,$key_));
701e1051a39Sopenharmony_ci	&mov		($rounds,$rounds_);
702e1051a39Sopenharmony_ci	&movups		($in0,&QWP(0,$inp));
703e1051a39Sopenharmony_ci
	# Round-0 whitening.  For the CMAC lane the first round key is folded
	# into the input block first, so a single xor applies both.
704e1051a39Sopenharmony_ci	&xorps		($inout0,$rndkey0);
705e1051a39Sopenharmony_ci	&$movekey	($rndkey1,&QWP(16,$key_));
706e1051a39Sopenharmony_ci	&xorps		($rndkey0,$in0);
707e1051a39Sopenharmony_ci	&xorps		($cmac,$rndkey0);		# cmac^=inp
708e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(32,$key_));
709e1051a39Sopenharmony_ci
	# Two AES rounds per pass on both lanes.  The jnz relies on the flags
	# produced by the add surviving the round-key load in between.
710e1051a39Sopenharmony_ci&set_label("ccm64_enc2_loop");
711e1051a39Sopenharmony_ci	&aesenc		($inout0,$rndkey1);
712e1051a39Sopenharmony_ci	&aesenc		($cmac,$rndkey1);
713e1051a39Sopenharmony_ci	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
714e1051a39Sopenharmony_ci	&add		($rounds,32);
715e1051a39Sopenharmony_ci	&aesenc		($inout0,$rndkey0);
716e1051a39Sopenharmony_ci	&aesenc		($cmac,$rndkey0);
717e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
718e1051a39Sopenharmony_ci	&jnz		(&label("ccm64_enc2_loop"));
719e1051a39Sopenharmony_ci	&aesenc		($inout0,$rndkey1);
720e1051a39Sopenharmony_ci	&aesenc		($cmac,$rndkey1);
	# bump the (byte-swapped) counter by the increment vector at 16(%esp)
721e1051a39Sopenharmony_ci	&paddq		($ivec,&QWP(16,"esp"));
	# The zero flag from this dec is what the jnz at the bottom of the
	# outer loop tests; only AES/SSE instructions and lea sit in between.
722e1051a39Sopenharmony_ci	&dec		($len);
723e1051a39Sopenharmony_ci	&aesenclast	($inout0,$rndkey0);
724e1051a39Sopenharmony_ci	&aesenclast	($cmac,$rndkey0);
725e1051a39Sopenharmony_ci
726e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16,$inp));
727e1051a39Sopenharmony_ci	&xorps	($in0,$inout0);			# inp^=E(ivec)
	# prepare the next counter block: copy the incremented counter and
	# swap it back to its original byte order
728e1051a39Sopenharmony_ci	&movdqa	($inout0,$ivec);
729e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$in0);		# save output
730e1051a39Sopenharmony_ci	&pshufb	($inout0,$inout3);
731e1051a39Sopenharmony_ci	&lea	($out,&DWP(16,$out));
732e1051a39Sopenharmony_ci	&jnz	(&label("ccm64_enc_outer"));
733e1051a39Sopenharmony_ci
	# Restore the caller's %esp first so &wparam(5) addresses the original
	# argument frame again, then write the updated CMAC back.
734e1051a39Sopenharmony_ci	&mov	("esp",&DWP(48,"esp"));
735e1051a39Sopenharmony_ci	&mov	($out,&wparam(5));
736e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$cmac);
737e1051a39Sopenharmony_ci
738e1051a39Sopenharmony_ci	&pxor	("xmm0","xmm0");		# clear register bank
739e1051a39Sopenharmony_ci	&pxor	("xmm1","xmm1");
740e1051a39Sopenharmony_ci	&pxor	("xmm2","xmm2");
741e1051a39Sopenharmony_ci	&pxor	("xmm3","xmm3");
742e1051a39Sopenharmony_ci	&pxor	("xmm4","xmm4");
743e1051a39Sopenharmony_ci	&pxor	("xmm5","xmm5");
744e1051a39Sopenharmony_ci	&pxor	("xmm6","xmm6");
745e1051a39Sopenharmony_ci	&pxor	("xmm7","xmm7");
746e1051a39Sopenharmony_ci&function_end("aesni_ccm64_encrypt_blocks");
747e1051a39Sopenharmony_ci
# aesni_ccm64_decrypt_blocks: emits the CCM decryption loop.  The keystream
# for the first block is produced up front with the single-block encrypt
# routine; after that each outer iteration decrypts one block and, unless
# it was the last, runs two interleaved AES encryptions -- the next counter
# block ($inout0) and the CMAC update over the block just decrypted
# ($cmac, an alias of $inout1).  The last decrypted block is folded into
# the CMAC at ccm64_dec_break.
#
# Arguments are fetched with &wparam(0..5): input pointer, output pointer,
# block count, key schedule, ivec pointer, cmac pointer.
#
# Aligned scratch frame built below:
#	0	pshufb byte-swap mask
#	16	counter increment vector (dwords 1,0,0,0)
#	48	saved %esp
748e1051a39Sopenharmony_ci&function_begin("aesni_ccm64_decrypt_blocks");
749e1051a39Sopenharmony_ci	&mov	($inp,&wparam(0));
750e1051a39Sopenharmony_ci	&mov	($out,&wparam(1));
751e1051a39Sopenharmony_ci	&mov	($len,&wparam(2));
752e1051a39Sopenharmony_ci	&mov	($key,&wparam(3));
	# $rounds_ and $rounds temporarily carry the ivec and cmac pointers
	# until both vectors have been loaded.
753e1051a39Sopenharmony_ci	&mov	($rounds_,&wparam(4));
754e1051a39Sopenharmony_ci	&mov	($rounds,&wparam(5));
	# $key_ temporarily carries the caller's %esp so it can be parked in
	# the aligned frame.
755e1051a39Sopenharmony_ci	&mov	($key_,"esp");
756e1051a39Sopenharmony_ci	&sub	("esp",60);
757e1051a39Sopenharmony_ci	&and	("esp",-16);			# align stack
758e1051a39Sopenharmony_ci	&mov	(&DWP(48,"esp"),$key_);
759e1051a39Sopenharmony_ci
760e1051a39Sopenharmony_ci	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
761e1051a39Sopenharmony_ci	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	# from here on $rounds holds the dword stored at offset 240 of the key
	# schedule (the round count)
762e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
763e1051a39Sopenharmony_ci
764e1051a39Sopenharmony_ci	# compose byte-swap control mask for pshufb on stack
765e1051a39Sopenharmony_ci	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
766e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esp"),0x08090a0b);
767e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esp"),0x04050607);
768e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esp"),0x00010203);
769e1051a39Sopenharmony_ci
770e1051a39Sopenharmony_ci	# compose counter increment vector on stack
771e1051a39Sopenharmony_ci	&mov	($rounds_,1);
772e1051a39Sopenharmony_ci	&xor	($key_,$key_);
773e1051a39Sopenharmony_ci	&mov	(&DWP(16,"esp"),$rounds_);
774e1051a39Sopenharmony_ci	&mov	(&DWP(20,"esp"),$key_);
775e1051a39Sopenharmony_ci	&mov	(&DWP(24,"esp"),$key_);
776e1051a39Sopenharmony_ci	&mov	(&DWP(28,"esp"),$key_);
777e1051a39Sopenharmony_ci
778e1051a39Sopenharmony_ci	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	# first counter block, still in the ivec's original byte order
779e1051a39Sopenharmony_ci	&movdqa	($inout0,$ivec);
780e1051a39Sopenharmony_ci
	# Back up the key pointer and round count; both are recomputed from
	# these copies after the single-block encryption below.
	# NOTE(review): presumably the single-block routine advances $key and
	# counts $rounds down -- its body is not visible here, confirm.
781e1051a39Sopenharmony_ci	&mov	($key_,$key);
782e1051a39Sopenharmony_ci	&mov	($rounds_,$rounds);
783e1051a39Sopenharmony_ci
	# byte-swap $ivec so paddq can increment it
784e1051a39Sopenharmony_ci	&pshufb	($ivec,$inout3);
	# keystream for the first block: encrypt the counter block in $inout0
	# (assumed operand of the single-block routine -- TODO confirm)
785e1051a39Sopenharmony_ci	if ($inline)
786e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
787e1051a39Sopenharmony_ci	else
788e1051a39Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
	# "Twisted" round-key addressing for the interleaved loop: $key becomes
	# key_+32+16*rounds and $rounds_ becomes 16-16*rounds, a negative byte
	# offset stepped by 32 up to zero.  The first input block is loaded and
	# the counter bumped while this is being set up.
789e1051a39Sopenharmony_ci	&shl	($rounds_,4);
790e1051a39Sopenharmony_ci	&mov	($rounds,16);
791e1051a39Sopenharmony_ci	&movups	($in0,&QWP(0,$inp));		# load inp
792e1051a39Sopenharmony_ci	&paddq	($ivec,&QWP(16,"esp"));
793e1051a39Sopenharmony_ci	&lea	($inp,&QWP(16,$inp));
794e1051a39Sopenharmony_ci	&sub	($rounds,$rounds_);
795e1051a39Sopenharmony_ci	&lea	($key,&DWP(32,$key_,$rounds_));
796e1051a39Sopenharmony_ci	&mov	($rounds_,$rounds);
	# jump rather than fall through: the target label is emitted with a
	# 16-byte alignment request
797e1051a39Sopenharmony_ci	&jmp	(&label("ccm64_dec_outer"));
798e1051a39Sopenharmony_ci
	# Entered with $in0 = current input block and $inout0 = encrypted
	# counter block for it.
799e1051a39Sopenharmony_ci&set_label("ccm64_dec_outer",16);
800e1051a39Sopenharmony_ci	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	# next counter block: copy the incremented counter, swapped back to
	# its original byte order a few lines down
801e1051a39Sopenharmony_ci	&movdqa	($inout0,$ivec);
802e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$in0);		# save output
803e1051a39Sopenharmony_ci	&lea	($out,&DWP(16,$out));
804e1051a39Sopenharmony_ci	&pshufb	($inout0,$inout3);
805e1051a39Sopenharmony_ci
	# last block?  then only the CMAC finalisation at ccm64_dec_break is
	# left; $in0 still holds the block just written out
806e1051a39Sopenharmony_ci	&sub	($len,1);
807e1051a39Sopenharmony_ci	&jz	(&label("ccm64_dec_break"));
808e1051a39Sopenharmony_ci
809e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(0,$key_));
810e1051a39Sopenharmony_ci	&mov		($rounds,$rounds_);
811e1051a39Sopenharmony_ci	&$movekey	($rndkey1,&QWP(16,$key_));
	# Round-0 whitening.  The first round key is folded into the decrypted
	# block before it is xored into the CMAC, so one xor applies both.
812e1051a39Sopenharmony_ci	&xorps		($in0,$rndkey0);
813e1051a39Sopenharmony_ci	&xorps		($inout0,$rndkey0);
814e1051a39Sopenharmony_ci	&xorps		($cmac,$in0);		# cmac^=out
815e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(32,$key_));
816e1051a39Sopenharmony_ci
	# Two AES rounds per pass on both lanes.  The jnz relies on the flags
	# produced by the add surviving the round-key load in between.
817e1051a39Sopenharmony_ci&set_label("ccm64_dec2_loop");
818e1051a39Sopenharmony_ci	&aesenc		($inout0,$rndkey1);
819e1051a39Sopenharmony_ci	&aesenc		($cmac,$rndkey1);
820e1051a39Sopenharmony_ci	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
821e1051a39Sopenharmony_ci	&add		($rounds,32);
822e1051a39Sopenharmony_ci	&aesenc		($inout0,$rndkey0);
823e1051a39Sopenharmony_ci	&aesenc		($cmac,$rndkey0);
824e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
825e1051a39Sopenharmony_ci	&jnz		(&label("ccm64_dec2_loop"));
	# fetch the next input block and bump the counter while the final two
	# rounds are issued
826e1051a39Sopenharmony_ci	&movups		($in0,&QWP(0,$inp));	# load inp
827e1051a39Sopenharmony_ci	&paddq		($ivec,&QWP(16,"esp"));
828e1051a39Sopenharmony_ci	&aesenc		($inout0,$rndkey1);
829e1051a39Sopenharmony_ci	&aesenc		($cmac,$rndkey1);
830e1051a39Sopenharmony_ci	&aesenclast	($inout0,$rndkey0);
831e1051a39Sopenharmony_ci	&aesenclast	($cmac,$rndkey0);
832e1051a39Sopenharmony_ci	&lea		($inp,&QWP(16,$inp));
833e1051a39Sopenharmony_ci	&jmp	(&label("ccm64_dec_outer"));
834e1051a39Sopenharmony_ci
	# Final CMAC update over the last decrypted block (held in $in0).
	# $rounds and $key are reloaded from the untouched schedule pointer
	# $key_ for the single-block routine.
	# NOTE(review): the two branches below pass different arguments -- the
	# inline generator gets ($cmac,$in0) while the call path only tacks
	# $cmac onto &call.  Whether &call/_aesni_encrypt1 honour that extra
	# argument (and fold $in0 into $cmac) cannot be told from here; verify
	# before relying on the non-inline path.
835e1051a39Sopenharmony_ci&set_label("ccm64_dec_break",16);
836e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key_));
837e1051a39Sopenharmony_ci	&mov	($key,$key_);
838e1051a39Sopenharmony_ci	if ($inline)
839e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
840e1051a39Sopenharmony_ci	else
841e1051a39Sopenharmony_ci	{   &call	("_aesni_encrypt1",$cmac);	}
842e1051a39Sopenharmony_ci
	# Restore the caller's %esp first so &wparam(5) addresses the original
	# argument frame again, then write the updated CMAC back.
843e1051a39Sopenharmony_ci	&mov	("esp",&DWP(48,"esp"));
844e1051a39Sopenharmony_ci	&mov	($out,&wparam(5));
845e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$cmac);
846e1051a39Sopenharmony_ci
847e1051a39Sopenharmony_ci	&pxor	("xmm0","xmm0");		# clear register bank
848e1051a39Sopenharmony_ci	&pxor	("xmm1","xmm1");
849e1051a39Sopenharmony_ci	&pxor	("xmm2","xmm2");
850e1051a39Sopenharmony_ci	&pxor	("xmm3","xmm3");
851e1051a39Sopenharmony_ci	&pxor	("xmm4","xmm4");
852e1051a39Sopenharmony_ci	&pxor	("xmm5","xmm5");
853e1051a39Sopenharmony_ci	&pxor	("xmm6","xmm6");
854e1051a39Sopenharmony_ci	&pxor	("xmm7","xmm7");
855e1051a39Sopenharmony_ci&function_end("aesni_ccm64_decrypt_blocks");
856e1051a39Sopenharmony_ci}
857e1051a39Sopenharmony_ci
858e1051a39Sopenharmony_ci######################################################################
859e1051a39Sopenharmony_ci# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
860e1051a39Sopenharmony_ci#                         size_t blocks, const AES_KEY *key,
861e1051a39Sopenharmony_ci#                         const char *ivec);
862e1051a39Sopenharmony_ci#
863e1051a39Sopenharmony_ci# Handles only complete blocks, operates on 32-bit counter and
864e1051a39Sopenharmony_ci# does not update *ivec! (see crypto/modes/ctr128.c for details)
865e1051a39Sopenharmony_ci#
866e1051a39Sopenharmony_ci# stack layout:
867e1051a39Sopenharmony_ci#	0	pshufb mask
868e1051a39Sopenharmony_ci#	16	vector addend: 0,6,6,6
869e1051a39Sopenharmony_ci# 	32	counter-less ivec
870e1051a39Sopenharmony_ci#	48	1st triplet of counter vector
871e1051a39Sopenharmony_ci#	64	2nd triplet of counter vector
872e1051a39Sopenharmony_ci#	80	saved %esp
873e1051a39Sopenharmony_ci
# aesni_ctr32_encrypt_blocks: emits CTR-mode encryption with a 32-bit
# counter held in the last dword of the ivec.  A single block takes a
# shortcut that encrypts the ivec as-is; otherwise six counter values are
# kept as two host-order dword triplets on the stack, processed six blocks
# per iteration, and a 1..5 block remainder is handled in ctr32_tail.
#
# Arguments are fetched with &wparam(0..4): input pointer, output pointer,
# block count, key schedule, ivec pointer.
874e1051a39Sopenharmony_ci&function_begin("aesni_ctr32_encrypt_blocks");
875e1051a39Sopenharmony_ci	&mov	($inp,&wparam(0));
876e1051a39Sopenharmony_ci	&mov	($out,&wparam(1));
877e1051a39Sopenharmony_ci	&mov	($len,&wparam(2));
878e1051a39Sopenharmony_ci	&mov	($key,&wparam(3));
	# $rounds_ temporarily carries the ivec pointer
879e1051a39Sopenharmony_ci	&mov	($rounds_,&wparam(4));
	# The frame is set up before the single-block check because every
	# path, including the shortcut, leaves through ctr32_ret, which wipes
	# 32..79(%esp) and reloads %esp from 80(%esp).
880e1051a39Sopenharmony_ci	&mov	($key_,"esp");
881e1051a39Sopenharmony_ci	&sub	("esp",88);
882e1051a39Sopenharmony_ci	&and	("esp",-16);			# align stack
883e1051a39Sopenharmony_ci	&mov	(&DWP(80,"esp"),$key_);
884e1051a39Sopenharmony_ci
885e1051a39Sopenharmony_ci	&cmp	($len,1);
886e1051a39Sopenharmony_ci	&je	(&label("ctr32_one_shortcut"));
887e1051a39Sopenharmony_ci
888e1051a39Sopenharmony_ci	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
889e1051a39Sopenharmony_ci
890e1051a39Sopenharmony_ci	# compose byte-swap control mask for pshufb on stack
891e1051a39Sopenharmony_ci	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
892e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esp"),0x08090a0b);
893e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esp"),0x04050607);
894e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esp"),0x00010203);
895e1051a39Sopenharmony_ci
896e1051a39Sopenharmony_ci	# compose counter increment vector on stack
	# (dwords 6,6,6,0 in ascending address order)
897e1051a39Sopenharmony_ci	&mov	($rounds,6);
898e1051a39Sopenharmony_ci	&xor	($key_,$key_);
899e1051a39Sopenharmony_ci	&mov	(&DWP(16,"esp"),$rounds);
900e1051a39Sopenharmony_ci	&mov	(&DWP(20,"esp"),$rounds);
901e1051a39Sopenharmony_ci	&mov	(&DWP(24,"esp"),$rounds);
902e1051a39Sopenharmony_ci	&mov	(&DWP(28,"esp"),$key_);
903e1051a39Sopenharmony_ci
	# $key_ is still zero here, so the pinsrd zeroes the counter dword
904e1051a39Sopenharmony_ci	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
905e1051a39Sopenharmony_ci	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
906e1051a39Sopenharmony_ci
907e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));	# key->rounds
908e1051a39Sopenharmony_ci
909e1051a39Sopenharmony_ci	# compose 2 vectors of 3x32-bit counters
	# After the bswap $rounds_ is the counter in host order.  $rndkey0
	# collects ctr, ctr+1, ctr+2 and $rndkey1 collects ctr+3, ctr+4, ctr+5
	# in dwords 0..2; dword 3 of each stays zero from the pxor.
910e1051a39Sopenharmony_ci	&bswap	($rounds_);
911e1051a39Sopenharmony_ci	&pxor	($rndkey0,$rndkey0);
912e1051a39Sopenharmony_ci	&pxor	($rndkey1,$rndkey1);
913e1051a39Sopenharmony_ci	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
914e1051a39Sopenharmony_ci	&pinsrd	($rndkey0,$rounds_,0);
915e1051a39Sopenharmony_ci	&lea	($key_,&DWP(3,$rounds_));
916e1051a39Sopenharmony_ci	&pinsrd	($rndkey1,$key_,0);
917e1051a39Sopenharmony_ci	&inc	($rounds_);
918e1051a39Sopenharmony_ci	&pinsrd	($rndkey0,$rounds_,1);
919e1051a39Sopenharmony_ci	&inc	($key_);
920e1051a39Sopenharmony_ci	&pinsrd	($rndkey1,$key_,1);
921e1051a39Sopenharmony_ci	&inc	($rounds_);
922e1051a39Sopenharmony_ci	&pinsrd	($rndkey0,$rounds_,2);
923e1051a39Sopenharmony_ci	&inc	($key_);
924e1051a39Sopenharmony_ci	&pinsrd	($rndkey1,$key_,2);
	# The stack copies stay in host order (they are what paddd increments
	# later); the register copies are byte-reversed across all 16 bytes,
	# which moves counter k to dword 3-k and leaves dword 0 zero.
925e1051a39Sopenharmony_ci	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
926e1051a39Sopenharmony_ci	&pshufb	($rndkey0,$inout0);		# byte swap
927e1051a39Sopenharmony_ci	&movdqu	($inout4,&QWP(0,$key));		# key[0]
928e1051a39Sopenharmony_ci	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
929e1051a39Sopenharmony_ci	&pshufb	($rndkey1,$inout0);		# byte swap
930e1051a39Sopenharmony_ci
	# pshufd with N<<6 copies source dword N into the top dword and source
	# dword 0 (zero) into the lower three, giving a block that is all zero
	# except for one byte-swapped counter in the top dword.
931e1051a39Sopenharmony_ci	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
932e1051a39Sopenharmony_ci	&pshufd	($inout1,$rndkey0,2<<6);
933e1051a39Sopenharmony_ci	&cmp	($len,6);
934e1051a39Sopenharmony_ci	&jb	(&label("ctr32_tail"));
935e1051a39Sopenharmony_ci	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
	# "Twisted" round-key addressing for the 6x loop: $key becomes
	# key+32+16*rounds, $rounds_ becomes 16-16*rounds.
936e1051a39Sopenharmony_ci	&shl	($rounds,4);
937e1051a39Sopenharmony_ci	&mov	($rounds_,16);
938e1051a39Sopenharmony_ci	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
939e1051a39Sopenharmony_ci	&mov	($key_,$key);			# backup $key
940e1051a39Sopenharmony_ci	&sub	($rounds_,$rounds);		# backup twisted $rounds
941e1051a39Sopenharmony_ci	&lea	($key,&DWP(32,$key,$rounds));
	# $len is kept biased by -6 inside the loop so that the jnc at the
	# bottom terminates on borrow
942e1051a39Sopenharmony_ci	&sub	($len,6);
943e1051a39Sopenharmony_ci	&jmp	(&label("ctr32_loop6"));
944e1051a39Sopenharmony_ci
	# Entered with $inout0/$inout1 already expanded from the first
	# triplet and $rndkey0/$rndkey1 holding the byte-swapped triplets.
945e1051a39Sopenharmony_ci&set_label("ctr32_loop6",16);
946e1051a39Sopenharmony_ci	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
947e1051a39Sopenharmony_ci	&pshufd	($inout2,$rndkey0,1<<6);
948e1051a39Sopenharmony_ci	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
949e1051a39Sopenharmony_ci	&pshufd	($inout3,$rndkey1,3<<6);
950e1051a39Sopenharmony_ci	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
951e1051a39Sopenharmony_ci	&pshufd	($inout4,$rndkey1,2<<6);
952e1051a39Sopenharmony_ci	&pxor		($inout1,$rndkey0);
953e1051a39Sopenharmony_ci	&pshufd	($inout5,$rndkey1,1<<6);
954e1051a39Sopenharmony_ci	&$movekey	($rndkey1,&QWP(16,$key_));
955e1051a39Sopenharmony_ci	&pxor		($inout2,$rndkey0);
956e1051a39Sopenharmony_ci	&pxor		($inout3,$rndkey0);
	# first AES round on all six lanes, interleaved with the remaining
	# merges; the value merged in already contains key[0]
957e1051a39Sopenharmony_ci	&aesenc		($inout0,$rndkey1);
958e1051a39Sopenharmony_ci	&pxor		($inout4,$rndkey0);
959e1051a39Sopenharmony_ci	&pxor		($inout5,$rndkey0);
960e1051a39Sopenharmony_ci	&aesenc		($inout1,$rndkey1);
961e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(32,$key_));
962e1051a39Sopenharmony_ci	&mov		($rounds,$rounds_);
963e1051a39Sopenharmony_ci	&aesenc		($inout2,$rndkey1);
964e1051a39Sopenharmony_ci	&aesenc		($inout3,$rndkey1);
965e1051a39Sopenharmony_ci	&aesenc		($inout4,$rndkey1);
966e1051a39Sopenharmony_ci	&aesenc		($inout5,$rndkey1);
967e1051a39Sopenharmony_ci
	# remaining rounds via the shared 6x routine's mid-entry label
	# (defined outside this function)
968e1051a39Sopenharmony_ci	&call		(&label("_aesni_encrypt6_enter"));
969e1051a39Sopenharmony_ci
	# xor the keystream with the input and store it, interleaved with
	# preparing the counters for the next iteration
970e1051a39Sopenharmony_ci	&movups	($rndkey1,&QWP(0,$inp));
971e1051a39Sopenharmony_ci	&movups	($rndkey0,&QWP(0x10,$inp));
972e1051a39Sopenharmony_ci	&xorps	($inout0,$rndkey1);
973e1051a39Sopenharmony_ci	&movups	($rndkey1,&QWP(0x20,$inp));
974e1051a39Sopenharmony_ci	&xorps	($inout1,$rndkey0);
975e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
976e1051a39Sopenharmony_ci	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
977e1051a39Sopenharmony_ci	&xorps	($inout2,$rndkey1);
978e1051a39Sopenharmony_ci	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
979e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
980e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
981e1051a39Sopenharmony_ci
982e1051a39Sopenharmony_ci	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
983e1051a39Sopenharmony_ci	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
984e1051a39Sopenharmony_ci	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
985e1051a39Sopenharmony_ci
	# $inout1/$inout2 are free now (their results were stored above) and
	# serve as input temporaries for blocks 3..5
986e1051a39Sopenharmony_ci	&movups	($inout1,&QWP(0x30,$inp));
987e1051a39Sopenharmony_ci	&movups	($inout2,&QWP(0x40,$inp));
988e1051a39Sopenharmony_ci	&xorps	($inout3,$inout1);
989e1051a39Sopenharmony_ci	&movups	($inout1,&QWP(0x50,$inp));
990e1051a39Sopenharmony_ci	&lea	($inp,&DWP(0x60,$inp));
991e1051a39Sopenharmony_ci	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
992e1051a39Sopenharmony_ci	&pshufb	($rndkey0,$inout0);		# byte swap
993e1051a39Sopenharmony_ci	&xorps	($inout4,$inout2);
994e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
995e1051a39Sopenharmony_ci	&xorps	($inout5,$inout1);
996e1051a39Sopenharmony_ci	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
997e1051a39Sopenharmony_ci	&pshufb	($rndkey1,$inout0);		# byte swap
998e1051a39Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
	# start expanding the next iteration's first two counter blocks
999e1051a39Sopenharmony_ci	&pshufd	($inout0,$rndkey0,3<<6);
1000e1051a39Sopenharmony_ci	&movups	(&QWP(0x50,$out),$inout5);
1001e1051a39Sopenharmony_ci	&lea	($out,&DWP(0x60,$out));
1002e1051a39Sopenharmony_ci
1003e1051a39Sopenharmony_ci	&pshufd	($inout1,$rndkey0,2<<6);
1004e1051a39Sopenharmony_ci	&sub	($len,6);
1005e1051a39Sopenharmony_ci	&jnc	(&label("ctr32_loop6"));
1006e1051a39Sopenharmony_ci
	# undo the -6 bias; zero means no remainder
1007e1051a39Sopenharmony_ci	&add	($len,6);
1008e1051a39Sopenharmony_ci	&jz	(&label("ctr32_ret"));
	# The slot at 32(%esp) holds ivec^key[0]; xoring key[0] back in
	# recovers the plain counter-less ivec the tail expects in $inout5.
1009e1051a39Sopenharmony_ci	&movdqu	($inout5,&QWP(0,$key_));
1010e1051a39Sopenharmony_ci	&mov	($key,$key_);
1011e1051a39Sopenharmony_ci	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
1012e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1013e1051a39Sopenharmony_ci
	# Remainder of 1..5 blocks.  Expects $inout5 = counter-less ivec,
	# $inout0/$inout1 = first two expanded counters, $rndkey0/$rndkey1 =
	# byte-swapped triplets, and $key/$rounds untwisted.  Each je below
	# reuses the flags of the preceding cmp; only pshufd/por sit between.
1014e1051a39Sopenharmony_ci&set_label("ctr32_tail");
1015e1051a39Sopenharmony_ci	&por	($inout0,$inout5);
1016e1051a39Sopenharmony_ci	&cmp	($len,2);
1017e1051a39Sopenharmony_ci	&jb	(&label("ctr32_one"));
1018e1051a39Sopenharmony_ci
1019e1051a39Sopenharmony_ci	&pshufd	($inout2,$rndkey0,1<<6);
1020e1051a39Sopenharmony_ci	&por	($inout1,$inout5);
1021e1051a39Sopenharmony_ci	&je	(&label("ctr32_two"));
1022e1051a39Sopenharmony_ci
1023e1051a39Sopenharmony_ci	&pshufd	($inout3,$rndkey1,3<<6);
1024e1051a39Sopenharmony_ci	&por	($inout2,$inout5);
1025e1051a39Sopenharmony_ci	&cmp	($len,4);
1026e1051a39Sopenharmony_ci	&jb	(&label("ctr32_three"));
1027e1051a39Sopenharmony_ci
1028e1051a39Sopenharmony_ci	&pshufd	($inout4,$rndkey1,2<<6);
1029e1051a39Sopenharmony_ci	&por	($inout3,$inout5);
1030e1051a39Sopenharmony_ci	&je	(&label("ctr32_four"));
1031e1051a39Sopenharmony_ci
	# five blocks: run the 6x routine; the sixth lane ($inout5) still
	# holds the counter-less ivec and its result is never stored
1032e1051a39Sopenharmony_ci	&por	($inout4,$inout5);
1033e1051a39Sopenharmony_ci	&call	("_aesni_encrypt6");
1034e1051a39Sopenharmony_ci	&movups	($rndkey1,&QWP(0,$inp));
1035e1051a39Sopenharmony_ci	&movups	($rndkey0,&QWP(0x10,$inp));
1036e1051a39Sopenharmony_ci	&xorps	($inout0,$rndkey1);
1037e1051a39Sopenharmony_ci	&movups	($rndkey1,&QWP(0x20,$inp));
1038e1051a39Sopenharmony_ci	&xorps	($inout1,$rndkey0);
1039e1051a39Sopenharmony_ci	&movups	($rndkey0,&QWP(0x30,$inp));
1040e1051a39Sopenharmony_ci	&xorps	($inout2,$rndkey1);
1041e1051a39Sopenharmony_ci	&movups	($rndkey1,&QWP(0x40,$inp));
1042e1051a39Sopenharmony_ci	&xorps	($inout3,$rndkey0);
1043e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
1044e1051a39Sopenharmony_ci	&xorps	($inout4,$rndkey1);
1045e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
1046e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
1047e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
1048e1051a39Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
1049e1051a39Sopenharmony_ci	&jmp	(&label("ctr32_ret"));
1050e1051a39Sopenharmony_ci
	# single-block call: the ivec is used verbatim as the counter block
	# ($rounds_ still holds the ivec pointer on this path)
1051e1051a39Sopenharmony_ci&set_label("ctr32_one_shortcut",16);
1052e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
1053e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
1054e1051a39Sopenharmony_ci
1055e1051a39Sopenharmony_ci&set_label("ctr32_one");
1056e1051a39Sopenharmony_ci	if ($inline)
1057e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
1058e1051a39Sopenharmony_ci	else
1059e1051a39Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
1060e1051a39Sopenharmony_ci	&movups	($in0,&QWP(0,$inp));
1061e1051a39Sopenharmony_ci	&xorps	($in0,$inout0);
1062e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$in0);
1063e1051a39Sopenharmony_ci	&jmp	(&label("ctr32_ret"));
1064e1051a39Sopenharmony_ci
	# In the 2/3/4-block cases the unused $inout registers (and, for four
	# blocks, $rndkey0/$rndkey1) double as input temporaries.
1065e1051a39Sopenharmony_ci&set_label("ctr32_two",16);
1066e1051a39Sopenharmony_ci	&call	("_aesni_encrypt2");
1067e1051a39Sopenharmony_ci	&movups	($inout3,&QWP(0,$inp));
1068e1051a39Sopenharmony_ci	&movups	($inout4,&QWP(0x10,$inp));
1069e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);
1070e1051a39Sopenharmony_ci	&xorps	($inout1,$inout4);
1071e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
1072e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
1073e1051a39Sopenharmony_ci	&jmp	(&label("ctr32_ret"));
1074e1051a39Sopenharmony_ci
1075e1051a39Sopenharmony_ci&set_label("ctr32_three",16);
1076e1051a39Sopenharmony_ci	&call	("_aesni_encrypt3");
1077e1051a39Sopenharmony_ci	&movups	($inout3,&QWP(0,$inp));
1078e1051a39Sopenharmony_ci	&movups	($inout4,&QWP(0x10,$inp));
1079e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);
1080e1051a39Sopenharmony_ci	&movups	($inout5,&QWP(0x20,$inp));
1081e1051a39Sopenharmony_ci	&xorps	($inout1,$inout4);
1082e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
1083e1051a39Sopenharmony_ci	&xorps	($inout2,$inout5);
1084e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
1085e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
1086e1051a39Sopenharmony_ci	&jmp	(&label("ctr32_ret"));
1087e1051a39Sopenharmony_ci
1088e1051a39Sopenharmony_ci&set_label("ctr32_four",16);
1089e1051a39Sopenharmony_ci	&call	("_aesni_encrypt4");
1090e1051a39Sopenharmony_ci	&movups	($inout4,&QWP(0,$inp));
1091e1051a39Sopenharmony_ci	&movups	($inout5,&QWP(0x10,$inp));
1092e1051a39Sopenharmony_ci	&movups	($rndkey1,&QWP(0x20,$inp));
1093e1051a39Sopenharmony_ci	&xorps	($inout0,$inout4);
1094e1051a39Sopenharmony_ci	&movups	($rndkey0,&QWP(0x30,$inp));
1095e1051a39Sopenharmony_ci	&xorps	($inout1,$inout5);
1096e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
1097e1051a39Sopenharmony_ci	&xorps	($inout2,$rndkey1);
1098e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
1099e1051a39Sopenharmony_ci	&xorps	($inout3,$rndkey0);
1100e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
1101e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
1102e1051a39Sopenharmony_ci
	# Common exit: wipe the xmm registers and the stack slots that held
	# the ivec and counter triplets (32..79); the slots at 0 and 16 only
	# ever hold the constant mask and increment.  %esp is restored last
	# because the wipes address the aligned frame.
1103e1051a39Sopenharmony_ci&set_label("ctr32_ret");
1104e1051a39Sopenharmony_ci	&pxor	("xmm0","xmm0");		# clear register bank
1105e1051a39Sopenharmony_ci	&pxor	("xmm1","xmm1");
1106e1051a39Sopenharmony_ci	&pxor	("xmm2","xmm2");
1107e1051a39Sopenharmony_ci	&pxor	("xmm3","xmm3");
1108e1051a39Sopenharmony_ci	&pxor	("xmm4","xmm4");
1109e1051a39Sopenharmony_ci	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
1110e1051a39Sopenharmony_ci	&pxor	("xmm5","xmm5");
1111e1051a39Sopenharmony_ci	&movdqa	(&QWP(48,"esp"),"xmm0");
1112e1051a39Sopenharmony_ci	&pxor	("xmm6","xmm6");
1113e1051a39Sopenharmony_ci	&movdqa	(&QWP(64,"esp"),"xmm0");
1114e1051a39Sopenharmony_ci	&pxor	("xmm7","xmm7");
1115e1051a39Sopenharmony_ci	&mov	("esp",&DWP(80,"esp"));
1116e1051a39Sopenharmony_ci&function_end("aesni_ctr32_encrypt_blocks");
1117e1051a39Sopenharmony_ci
1118e1051a39Sopenharmony_ci######################################################################
1119e1051a39Sopenharmony_ci# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1120e1051a39Sopenharmony_ci#	const AES_KEY *key1, const AES_KEY *key2,
1121e1051a39Sopenharmony_ci#	const unsigned char iv[16]);
1122e1051a39Sopenharmony_ci#
1123e1051a39Sopenharmony_ci{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1124e1051a39Sopenharmony_ci
1125e1051a39Sopenharmony_ci&function_begin("aesni_xts_encrypt");	# XTS encrypt; arguments wparam(0..5) = inp, out, len, key1, key2, iv
1126e1051a39Sopenharmony_ci	&mov	($key,&wparam(4));		# key2
1127e1051a39Sopenharmony_ci	&mov	($inp,&wparam(5));		# clear-text tweak
1128e1051a39Sopenharmony_ci
1129e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1130e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));	# 16-byte tweak -> $inout0
1131e1051a39Sopenharmony_ci	if ($inline)				# encrypt the tweak with key2; the result is read from $inout0 below
1132e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
1133e1051a39Sopenharmony_ci	else
1134e1051a39Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
1135e1051a39Sopenharmony_ci
1136e1051a39Sopenharmony_ci	&mov	($inp,&wparam(0));
1137e1051a39Sopenharmony_ci	&mov	($out,&wparam(1));
1138e1051a39Sopenharmony_ci	&mov	($len,&wparam(2));
1139e1051a39Sopenharmony_ci	&mov	($key,&wparam(3));		# key1
1140e1051a39Sopenharmony_ci
1141e1051a39Sopenharmony_ci	&mov	($key_,"esp");			# hold caller's %esp until it is stored in the frame
1142e1051a39Sopenharmony_ci	&sub	("esp",16*7+8);			# six 16-byte tweak slots, mask constant, two dwords
1143e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1144e1051a39Sopenharmony_ci	&and	("esp",-16);			# align stack
1145e1051a39Sopenharmony_ci
1146e1051a39Sopenharmony_ci	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1147e1051a39Sopenharmony_ci	&mov	(&DWP(16*6+4,"esp"),0);
1148e1051a39Sopenharmony_ci	&mov	(&DWP(16*6+8,"esp"),1);
1149e1051a39Sopenharmony_ci	&mov	(&DWP(16*6+12,"esp"),0);
1150e1051a39Sopenharmony_ci	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
1151e1051a39Sopenharmony_ci	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1152e1051a39Sopenharmony_ci
1153e1051a39Sopenharmony_ci	&movdqa	($tweak,$inout0);		# encrypted tweak is the tweak for the first block
1154e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);
1155e1051a39Sopenharmony_ci	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1156e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1157e1051a39Sopenharmony_ci
1158e1051a39Sopenharmony_ci	&and	($len,-16);			# whole 16-byte blocks only
1159e1051a39Sopenharmony_ci	&mov	($key_,$key);			# backup $key
1160e1051a39Sopenharmony_ci	&mov	($rounds_,$rounds);		# backup $rounds
1161e1051a39Sopenharmony_ci	&sub	($len,16*6);			# bias $len by six blocks
1162e1051a39Sopenharmony_ci	&jc	(&label("xts_enc_short"));	# borrow: fewer than six whole blocks
1163e1051a39Sopenharmony_ci
1164e1051a39Sopenharmony_ci	&shl	($rounds,4);			# $rounds*16
1165e1051a39Sopenharmony_ci	&mov	($rounds_,16);
1166e1051a39Sopenharmony_ci	&sub	($rounds_,$rounds);		# $rounds_ = 16-16*rounds
1167e1051a39Sopenharmony_ci	&lea	($key,&DWP(32,$key,$rounds));	# $key = key1+32+16*rounds; presumably what _aesni_encrypt6_enter expects - confirm there
1168e1051a39Sopenharmony_ci	&jmp	(&label("xts_enc_loop6"));
1169e1051a39Sopenharmony_ci
1170e1051a39Sopenharmony_ci&set_label("xts_enc_loop6",16);		# main loop: six blocks per pass
1171e1051a39Sopenharmony_ci	for ($i=0;$i<4;$i++) {		# emitted four times: tweaks 0..3 -> stack slots 0..3
1172e1051a39Sopenharmony_ci	    &pshufd	($twres,$twtmp,0x13);	# dword0<-$twtmp[3], dword2<-$twtmp[1]: the lanes the mask keeps
1173e1051a39Sopenharmony_ci	    &pxor	($twtmp,$twtmp);
1174e1051a39Sopenharmony_ci	    &movdqa	(&QWP(16*$i,"esp"),$tweak);	# store current tweak in slot $i
1175e1051a39Sopenharmony_ci	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1176e1051a39Sopenharmony_ci	    &pand	($twres,$twmask);	# isolate carry and residue
1177e1051a39Sopenharmony_ci	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1178e1051a39Sopenharmony_ci	    &pxor	($tweak,$twres);	# fold carry and residue in: next tweak
1179e1051a39Sopenharmony_ci	}
1180e1051a39Sopenharmony_ci	&pshufd	($inout5,$twtmp,0x13);	# the sixth tweak is assembled in $inout5
1181e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*$i++,"esp"),$tweak);	# tweak 4 -> slot 4; $i is 5 afterwards
1182e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1183e1051a39Sopenharmony_ci	 &$movekey	($rndkey0,&QWP(0,$key_));	# rndkey[0] (overwrites $twtmp, same register)
1184e1051a39Sopenharmony_ci	&pand	($inout5,$twmask);		# isolate carry and residue
1185e1051a39Sopenharmony_ci	 &movups	($inout0,&QWP(0,$inp));	# load input
1186e1051a39Sopenharmony_ci	&pxor	($inout5,$tweak);		# $inout5 = tweak 5
1187e1051a39Sopenharmony_ci
1188e1051a39Sopenharmony_ci	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1189e1051a39Sopenharmony_ci	&mov	($rounds,$rounds_);		# restore $rounds
1190e1051a39Sopenharmony_ci	&movdqu	($inout1,&QWP(16*1,$inp));	# overwrites $twmask (same register); reloaded after the call
1191e1051a39Sopenharmony_ci	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1192e1051a39Sopenharmony_ci	&movdqu	($inout2,&QWP(16*2,$inp));
1193e1051a39Sopenharmony_ci	 &pxor		($inout1,$rndkey0);
1194e1051a39Sopenharmony_ci	&movdqu	($inout3,&QWP(16*3,$inp));
1195e1051a39Sopenharmony_ci	 &pxor		($inout2,$rndkey0);
1196e1051a39Sopenharmony_ci	&movdqu	($inout4,&QWP(16*4,$inp));
1197e1051a39Sopenharmony_ci	 &pxor		($inout3,$rndkey0);
1198e1051a39Sopenharmony_ci	&movdqu	($rndkey1,&QWP(16*5,$inp));	# sixth input block parked in $rndkey1 (same register as $tweak)
1199e1051a39Sopenharmony_ci	 &pxor		($inout4,$rndkey0);
1200e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16*6,$inp));
1201e1051a39Sopenharmony_ci	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1202e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1203e1051a39Sopenharmony_ci	&pxor	($inout5,$rndkey1);		# tweak 5 ^ sixth input block
1204e1051a39Sopenharmony_ci
1205e1051a39Sopenharmony_ci	 &$movekey	($rndkey1,&QWP(16,$key_));	# next round key, at offset 16 of the schedule
1206e1051a39Sopenharmony_ci	&pxor	($inout1,&QWP(16*1,"esp"));
1207e1051a39Sopenharmony_ci	&pxor	($inout2,&QWP(16*2,"esp"));
1208e1051a39Sopenharmony_ci	 &aesenc	($inout0,$rndkey1);
1209e1051a39Sopenharmony_ci	&pxor	($inout3,&QWP(16*3,"esp"));
1210e1051a39Sopenharmony_ci	&pxor	($inout4,&QWP(16*4,"esp"));
1211e1051a39Sopenharmony_ci	 &aesenc	($inout1,$rndkey1);
1212e1051a39Sopenharmony_ci	&pxor		($inout5,$rndkey0);	# input^=rndkey[0] for the sixth lane
1213e1051a39Sopenharmony_ci	 &$movekey	($rndkey0,&QWP(32,$key_));
1214e1051a39Sopenharmony_ci	 &aesenc	($inout2,$rndkey1);
1215e1051a39Sopenharmony_ci	 &aesenc	($inout3,$rndkey1);
1216e1051a39Sopenharmony_ci	 &aesenc	($inout4,$rndkey1);
1217e1051a39Sopenharmony_ci	 &aesenc	($inout5,$rndkey1);
1218e1051a39Sopenharmony_ci	&call		(&label("_aesni_encrypt6_enter"));	# label outside this section; presumably runs the remaining rounds - confirm there
1219e1051a39Sopenharmony_ci
1220e1051a39Sopenharmony_ci	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1221e1051a39Sopenharmony_ci       &pxor	($twtmp,$twtmp);
1222e1051a39Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1223e1051a39Sopenharmony_ci       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1224e1051a39Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
1225e1051a39Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
1226e1051a39Sopenharmony_ci	&xorps	($inout2,&QWP(16*2,"esp"));
1227e1051a39Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
1228e1051a39Sopenharmony_ci	&xorps	($inout3,&QWP(16*3,"esp"));
1229e1051a39Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
1230e1051a39Sopenharmony_ci	&xorps	($inout4,&QWP(16*4,"esp"));
1231e1051a39Sopenharmony_ci	&movups	(&QWP(16*3,$out),$inout3);
1232e1051a39Sopenharmony_ci	&xorps	($inout5,$tweak);
1233e1051a39Sopenharmony_ci	&movups	(&QWP(16*4,$out),$inout4);
1234e1051a39Sopenharmony_ci       &pshufd	($twres,$twtmp,0x13);
1235e1051a39Sopenharmony_ci	&movups	(&QWP(16*5,$out),$inout5);
1236e1051a39Sopenharmony_ci	&lea	($out,&DWP(16*6,$out));
1237e1051a39Sopenharmony_ci       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87 (reloaded: $twmask shares a register with $inout1)
1238e1051a39Sopenharmony_ci
1239e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);
1240e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1241e1051a39Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
1242e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1243e1051a39Sopenharmony_ci	&pxor	($tweak,$twres);		# tweak for the block after these six
1244e1051a39Sopenharmony_ci
1245e1051a39Sopenharmony_ci	&sub	($len,16*6);
1246e1051a39Sopenharmony_ci	&jnc	(&label("xts_enc_loop6"));	# no borrow: six more whole blocks to do
1247e1051a39Sopenharmony_ci
1248e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1249e1051a39Sopenharmony_ci	&mov	($key,$key_);			# restore $key
1250e1051a39Sopenharmony_ci	&mov	($rounds_,$rounds);
1251e1051a39Sopenharmony_ci
1252e1051a39Sopenharmony_ci&set_label("xts_enc_short");
1253e1051a39Sopenharmony_ci	&add	($len,16*6);			# undo the bias: 0..0x50 bytes of whole blocks remain
1254e1051a39Sopenharmony_ci	&jz	(&label("xts_enc_done6x"));	# no whole block left
1255e1051a39Sopenharmony_ci
1256e1051a39Sopenharmony_ci	&movdqa	($inout3,$tweak);		# put aside previous tweak
1257e1051a39Sopenharmony_ci	&cmp	($len,0x20);
1258e1051a39Sopenharmony_ci	&jb	(&label("xts_enc_one"));	# $len==0x10
1259e1051a39Sopenharmony_ci
1260e1051a39Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
1261e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);
1262e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1263e1051a39Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
1264e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1265e1051a39Sopenharmony_ci	&pxor	($tweak,$twres);
1266e1051a39Sopenharmony_ci	&je	(&label("xts_enc_two"));	# flags are still those of cmp($len,0x20) above
1267e1051a39Sopenharmony_ci
1268e1051a39Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
1269e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);
1270e1051a39Sopenharmony_ci	&movdqa	($inout4,$tweak);		# put aside previous tweak
1271e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1272e1051a39Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
1273e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1274e1051a39Sopenharmony_ci	&pxor	($tweak,$twres);
1275e1051a39Sopenharmony_ci	&cmp	($len,0x40);
1276e1051a39Sopenharmony_ci	&jb	(&label("xts_enc_three"));	# $len==0x30
1277e1051a39Sopenharmony_ci
1278e1051a39Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
1279e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);
1280e1051a39Sopenharmony_ci	&movdqa	($inout5,$tweak);		# put aside previous tweak
1281e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1282e1051a39Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
1283e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1284e1051a39Sopenharmony_ci	&pxor	($tweak,$twres);
1285e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*0,"esp"),$inout3);	# tweak 0 -> slot 0
1286e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*1,"esp"),$inout4);	# tweak 1 -> slot 1
1287e1051a39Sopenharmony_ci	&je	(&label("xts_enc_four"));	# flags are still those of cmp($len,0x40) above
1288e1051a39Sopenharmony_ci
1289e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*2,"esp"),$inout5);	# five blocks left: tweaks go to slots 0..4
1290e1051a39Sopenharmony_ci	&pshufd	($inout5,$twtmp,0x13);
1291e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*3,"esp"),$tweak);
1292e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1293e1051a39Sopenharmony_ci	&pand	($inout5,$twmask);		# isolate carry and residue
1294e1051a39Sopenharmony_ci	&pxor	($inout5,$tweak);		# $inout5 = tweak 4
1295e1051a39Sopenharmony_ci
1296e1051a39Sopenharmony_ci	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1297e1051a39Sopenharmony_ci	&movdqu	($inout1,&QWP(16*1,$inp));
1298e1051a39Sopenharmony_ci	&movdqu	($inout2,&QWP(16*2,$inp));
1299e1051a39Sopenharmony_ci	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1300e1051a39Sopenharmony_ci	&movdqu	($inout3,&QWP(16*3,$inp));
1301e1051a39Sopenharmony_ci	&pxor	($inout1,&QWP(16*1,"esp"));
1302e1051a39Sopenharmony_ci	&movdqu	($inout4,&QWP(16*4,$inp));
1303e1051a39Sopenharmony_ci	&pxor	($inout2,&QWP(16*2,"esp"));
1304e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16*5,$inp));
1305e1051a39Sopenharmony_ci	&pxor	($inout3,&QWP(16*3,"esp"));
1306e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1307e1051a39Sopenharmony_ci	&pxor	($inout4,$inout5);
1308e1051a39Sopenharmony_ci
1309e1051a39Sopenharmony_ci	&call	("_aesni_encrypt6");		# only $inout0..$inout4 are stored afterwards
1310e1051a39Sopenharmony_ci
1311e1051a39Sopenharmony_ci	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1312e1051a39Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1313e1051a39Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
1314e1051a39Sopenharmony_ci	&xorps	($inout2,&QWP(16*2,"esp"));
1315e1051a39Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
1316e1051a39Sopenharmony_ci	&xorps	($inout3,&QWP(16*3,"esp"));
1317e1051a39Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
1318e1051a39Sopenharmony_ci	&xorps	($inout4,$tweak);
1319e1051a39Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
1320e1051a39Sopenharmony_ci	&movups	(&QWP(16*3,$out),$inout3);
1321e1051a39Sopenharmony_ci	&movups	(&QWP(16*4,$out),$inout4);
1322e1051a39Sopenharmony_ci	&lea	($out,&DWP(16*5,$out));
1323e1051a39Sopenharmony_ci	&jmp	(&label("xts_enc_done"));
1324e1051a39Sopenharmony_ci
1325e1051a39Sopenharmony_ci&set_label("xts_enc_one",16);			# one block left; its tweak is in $inout3
1326e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
1327e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16*1,$inp));
1328e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
1329e1051a39Sopenharmony_ci	if ($inline)
1330e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
1331e1051a39Sopenharmony_ci	else
1332e1051a39Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
1333e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
1334e1051a39Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
1335e1051a39Sopenharmony_ci	&lea	($out,&DWP(16*1,$out));
1336e1051a39Sopenharmony_ci
1337e1051a39Sopenharmony_ci	&movdqa	($tweak,$inout3);		# last tweak
1338e1051a39Sopenharmony_ci	&jmp	(&label("xts_enc_done"));
1339e1051a39Sopenharmony_ci
1340e1051a39Sopenharmony_ci&set_label("xts_enc_two",16);			# two blocks left; tweaks in $inout3 and $tweak
1341e1051a39Sopenharmony_ci	&movaps	($inout4,$tweak);		# put aside last tweak
1342e1051a39Sopenharmony_ci
1343e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
1344e1051a39Sopenharmony_ci	&movups	($inout1,&QWP(16*1,$inp));
1345e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16*2,$inp));
1346e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
1347e1051a39Sopenharmony_ci	&xorps	($inout1,$inout4);
1348e1051a39Sopenharmony_ci
1349e1051a39Sopenharmony_ci	&call	("_aesni_encrypt2");
1350e1051a39Sopenharmony_ci
1351e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
1352e1051a39Sopenharmony_ci	&xorps	($inout1,$inout4);
1353e1051a39Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
1354e1051a39Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
1355e1051a39Sopenharmony_ci	&lea	($out,&DWP(16*2,$out));
1356e1051a39Sopenharmony_ci
1357e1051a39Sopenharmony_ci	&movdqa	($tweak,$inout4);		# last tweak
1358e1051a39Sopenharmony_ci	&jmp	(&label("xts_enc_done"));
1359e1051a39Sopenharmony_ci
1360e1051a39Sopenharmony_ci&set_label("xts_enc_three",16);		# three blocks left; tweaks in $inout3, $inout4 and $tweak
1361e1051a39Sopenharmony_ci	&movaps	($inout5,$tweak);		# put aside last tweak
1362e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
1363e1051a39Sopenharmony_ci	&movups	($inout1,&QWP(16*1,$inp));
1364e1051a39Sopenharmony_ci	&movups	($inout2,&QWP(16*2,$inp));
1365e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16*3,$inp));
1366e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
1367e1051a39Sopenharmony_ci	&xorps	($inout1,$inout4);
1368e1051a39Sopenharmony_ci	&xorps	($inout2,$inout5);
1369e1051a39Sopenharmony_ci
1370e1051a39Sopenharmony_ci	&call	("_aesni_encrypt3");
1371e1051a39Sopenharmony_ci
1372e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
1373e1051a39Sopenharmony_ci	&xorps	($inout1,$inout4);
1374e1051a39Sopenharmony_ci	&xorps	($inout2,$inout5);
1375e1051a39Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
1376e1051a39Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
1377e1051a39Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
1378e1051a39Sopenharmony_ci	&lea	($out,&DWP(16*3,$out));
1379e1051a39Sopenharmony_ci
1380e1051a39Sopenharmony_ci	&movdqa	($tweak,$inout5);		# last tweak
1381e1051a39Sopenharmony_ci	&jmp	(&label("xts_enc_done"));
1382e1051a39Sopenharmony_ci
1383e1051a39Sopenharmony_ci&set_label("xts_enc_four",16);			# four blocks left; tweaks in stack slots 0,1, $inout5 and $tweak
1384e1051a39Sopenharmony_ci	&movaps	($inout4,$tweak);		# put aside last tweak
1385e1051a39Sopenharmony_ci
1386e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
1387e1051a39Sopenharmony_ci	&movups	($inout1,&QWP(16*1,$inp));
1388e1051a39Sopenharmony_ci	&movups	($inout2,&QWP(16*2,$inp));
1389e1051a39Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1390e1051a39Sopenharmony_ci	&movups	($inout3,&QWP(16*3,$inp));
1391e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16*4,$inp));
1392e1051a39Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
1393e1051a39Sopenharmony_ci	&xorps	($inout2,$inout5);
1394e1051a39Sopenharmony_ci	&xorps	($inout3,$inout4);
1395e1051a39Sopenharmony_ci
1396e1051a39Sopenharmony_ci	&call	("_aesni_encrypt4");
1397e1051a39Sopenharmony_ci
1398e1051a39Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1399e1051a39Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
1400e1051a39Sopenharmony_ci	&xorps	($inout2,$inout5);
1401e1051a39Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
1402e1051a39Sopenharmony_ci	&xorps	($inout3,$inout4);
1403e1051a39Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
1404e1051a39Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
1405e1051a39Sopenharmony_ci	&movups	(&QWP(16*3,$out),$inout3);
1406e1051a39Sopenharmony_ci	&lea	($out,&DWP(16*4,$out));
1407e1051a39Sopenharmony_ci
1408e1051a39Sopenharmony_ci	&movdqa	($tweak,$inout4);		# last tweak
1409e1051a39Sopenharmony_ci	&jmp	(&label("xts_enc_done"));
1410e1051a39Sopenharmony_ci
1411e1051a39Sopenharmony_ci&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
1412e1051a39Sopenharmony_ci	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1413e1051a39Sopenharmony_ci	&and	($len,15);			# bytes beyond the last whole block
1414e1051a39Sopenharmony_ci	&jz	(&label("xts_enc_ret"));	# none: finished
1415e1051a39Sopenharmony_ci	&movdqa	($inout3,$tweak);		# tweak for the stolen block
1416e1051a39Sopenharmony_ci	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1417e1051a39Sopenharmony_ci	&jmp	(&label("xts_enc_steal"));
1418e1051a39Sopenharmony_ci
1419e1051a39Sopenharmony_ci&set_label("xts_enc_done",16);			# $tweak holds the last tweak used
1420e1051a39Sopenharmony_ci	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1421e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);
1422e1051a39Sopenharmony_ci	&and	($len,15);			# bytes beyond the last whole block
1423e1051a39Sopenharmony_ci	&jz	(&label("xts_enc_ret"));	# none: finished
1424e1051a39Sopenharmony_ci
1425e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1426e1051a39Sopenharmony_ci	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1427e1051a39Sopenharmony_ci	&pshufd	($inout3,$twtmp,0x13);
1428e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1429e1051a39Sopenharmony_ci	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
1430e1051a39Sopenharmony_ci	&pxor	($inout3,$tweak);		# $inout3 = next tweak, used for the stolen block
1431e1051a39Sopenharmony_ci
1432e1051a39Sopenharmony_ci&set_label("xts_enc_steal");			# byte loop, runs $len%16 times
1433e1051a39Sopenharmony_ci	&movz	($rounds,&BP(0,$inp));		# next tail input byte ($rounds is scratch here)
1434e1051a39Sopenharmony_ci	&movz	($key,&BP(-16,$out));		# matching byte of the last block written ($key is scratch here)
1435e1051a39Sopenharmony_ci	&lea	($inp,&DWP(1,$inp));
1436e1051a39Sopenharmony_ci	&mov	(&BP(-16,$out),&LB($rounds));	# input byte replaces it in the last block
1437e1051a39Sopenharmony_ci	&mov	(&BP(0,$out),&LB($key));	# displaced byte becomes the tail output
1438e1051a39Sopenharmony_ci	&lea	($out,&DWP(1,$out));
1439e1051a39Sopenharmony_ci	&sub	($len,1);
1440e1051a39Sopenharmony_ci	&jnz	(&label("xts_enc_steal"));
1441e1051a39Sopenharmony_ci
1442e1051a39Sopenharmony_ci	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1443e1051a39Sopenharmony_ci	&mov	($key,$key_);			# restore $key
1444e1051a39Sopenharmony_ci	&mov	($rounds,$rounds_);		# restore $rounds
1445e1051a39Sopenharmony_ci
1446e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(-16,$out));	# load input
1447e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
1448e1051a39Sopenharmony_ci	if ($inline)
1449e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
1450e1051a39Sopenharmony_ci	else
1451e1051a39Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
1452e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
1453e1051a39Sopenharmony_ci	&movups	(&QWP(-16,$out),$inout0);	# write output
1454e1051a39Sopenharmony_ci
1455e1051a39Sopenharmony_ci&set_label("xts_enc_ret");
1456e1051a39Sopenharmony_ci	&pxor	("xmm0","xmm0");		# clear register bank
1457e1051a39Sopenharmony_ci	&pxor	("xmm1","xmm1");
1458e1051a39Sopenharmony_ci	&pxor	("xmm2","xmm2");
1459e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
1460e1051a39Sopenharmony_ci	&pxor	("xmm3","xmm3");
1461e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*1,"esp"),"xmm0");
1462e1051a39Sopenharmony_ci	&pxor	("xmm4","xmm4");
1463e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*2,"esp"),"xmm0");
1464e1051a39Sopenharmony_ci	&pxor	("xmm5","xmm5");
1465e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*3,"esp"),"xmm0");
1466e1051a39Sopenharmony_ci	&pxor	("xmm6","xmm6");
1467e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*4,"esp"),"xmm0");
1468e1051a39Sopenharmony_ci	&pxor	("xmm7","xmm7");
1469e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*5,"esp"),"xmm0");	# all six tweak slots are now zeroed
1470e1051a39Sopenharmony_ci	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1471e1051a39Sopenharmony_ci&function_end("aesni_xts_encrypt");
1472e1051a39Sopenharmony_ci
1473e1051a39Sopenharmony_ci&function_begin("aesni_xts_decrypt");
1474e1051a39Sopenharmony_ci	&mov	($key,&wparam(4));		# key2
1475e1051a39Sopenharmony_ci	&mov	($inp,&wparam(5));		# clear-text tweak
1476e1051a39Sopenharmony_ci
1477e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1478e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));
1479e1051a39Sopenharmony_ci	if ($inline)
1480e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
1481e1051a39Sopenharmony_ci	else
1482e1051a39Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
1483e1051a39Sopenharmony_ci
1484e1051a39Sopenharmony_ci	&mov	($inp,&wparam(0));
1485e1051a39Sopenharmony_ci	&mov	($out,&wparam(1));
1486e1051a39Sopenharmony_ci	&mov	($len,&wparam(2));
1487e1051a39Sopenharmony_ci	&mov	($key,&wparam(3));		# key1
1488e1051a39Sopenharmony_ci
1489e1051a39Sopenharmony_ci	&mov	($key_,"esp");
1490e1051a39Sopenharmony_ci	&sub	("esp",16*7+8);
1491e1051a39Sopenharmony_ci	&and	("esp",-16);			# align stack
1492e1051a39Sopenharmony_ci
1493e1051a39Sopenharmony_ci	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
1494e1051a39Sopenharmony_ci	&test	($len,15);
1495e1051a39Sopenharmony_ci	&setnz	(&LB($rounds_));
1496e1051a39Sopenharmony_ci	&shl	($rounds_,4);
1497e1051a39Sopenharmony_ci	&sub	($len,$rounds_);
1498e1051a39Sopenharmony_ci
1499e1051a39Sopenharmony_ci	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1500e1051a39Sopenharmony_ci	&mov	(&DWP(16*6+4,"esp"),0);
1501e1051a39Sopenharmony_ci	&mov	(&DWP(16*6+8,"esp"),1);
1502e1051a39Sopenharmony_ci	&mov	(&DWP(16*6+12,"esp"),0);
1503e1051a39Sopenharmony_ci	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
1504e1051a39Sopenharmony_ci	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1505e1051a39Sopenharmony_ci
1506e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1507e1051a39Sopenharmony_ci	&mov	($key_,$key);			# backup $key
1508e1051a39Sopenharmony_ci	&mov	($rounds_,$rounds);		# backup $rounds
1509e1051a39Sopenharmony_ci
1510e1051a39Sopenharmony_ci	&movdqa	($tweak,$inout0);
1511e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);
1512e1051a39Sopenharmony_ci	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1513e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1514e1051a39Sopenharmony_ci
1515e1051a39Sopenharmony_ci	&and	($len,-16);
1516e1051a39Sopenharmony_ci	&sub	($len,16*6);
1517e1051a39Sopenharmony_ci	&jc	(&label("xts_dec_short"));
1518e1051a39Sopenharmony_ci
1519e1051a39Sopenharmony_ci	&shl	($rounds,4);
1520e1051a39Sopenharmony_ci	&mov	($rounds_,16);
1521e1051a39Sopenharmony_ci	&sub	($rounds_,$rounds);
1522e1051a39Sopenharmony_ci	&lea	($key,&DWP(32,$key,$rounds));
1523e1051a39Sopenharmony_ci	&jmp	(&label("xts_dec_loop6"));
1524e1051a39Sopenharmony_ci
1525e1051a39Sopenharmony_ci&set_label("xts_dec_loop6",16);
1526e1051a39Sopenharmony_ci	for ($i=0;$i<4;$i++) {
1527e1051a39Sopenharmony_ci	    &pshufd	($twres,$twtmp,0x13);
1528e1051a39Sopenharmony_ci	    &pxor	($twtmp,$twtmp);
1529e1051a39Sopenharmony_ci	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1530e1051a39Sopenharmony_ci	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1531e1051a39Sopenharmony_ci	    &pand	($twres,$twmask);	# isolate carry and residue
1532e1051a39Sopenharmony_ci	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1533e1051a39Sopenharmony_ci	    &pxor	($tweak,$twres);
1534e1051a39Sopenharmony_ci	}
1535e1051a39Sopenharmony_ci	&pshufd	($inout5,$twtmp,0x13);
1536e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1537e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1538e1051a39Sopenharmony_ci	 &$movekey	($rndkey0,&QWP(0,$key_));
1539e1051a39Sopenharmony_ci	&pand	($inout5,$twmask);		# isolate carry and residue
1540e1051a39Sopenharmony_ci	 &movups	($inout0,&QWP(0,$inp));	# load input
1541e1051a39Sopenharmony_ci	&pxor	($inout5,$tweak);
1542e1051a39Sopenharmony_ci
1543e1051a39Sopenharmony_ci	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1544e1051a39Sopenharmony_ci	&mov	($rounds,$rounds_);
1545e1051a39Sopenharmony_ci	&movdqu	($inout1,&QWP(16*1,$inp));
1546e1051a39Sopenharmony_ci	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1547e1051a39Sopenharmony_ci	&movdqu	($inout2,&QWP(16*2,$inp));
1548e1051a39Sopenharmony_ci	 &pxor		($inout1,$rndkey0);
1549e1051a39Sopenharmony_ci	&movdqu	($inout3,&QWP(16*3,$inp));
1550e1051a39Sopenharmony_ci	 &pxor		($inout2,$rndkey0);
1551e1051a39Sopenharmony_ci	&movdqu	($inout4,&QWP(16*4,$inp));
1552e1051a39Sopenharmony_ci	 &pxor		($inout3,$rndkey0);
1553e1051a39Sopenharmony_ci	&movdqu	($rndkey1,&QWP(16*5,$inp));
1554e1051a39Sopenharmony_ci	 &pxor		($inout4,$rndkey0);
1555e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16*6,$inp));
1556e1051a39Sopenharmony_ci	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1557e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1558e1051a39Sopenharmony_ci	&pxor	($inout5,$rndkey1);
1559e1051a39Sopenharmony_ci
1560e1051a39Sopenharmony_ci	 &$movekey	($rndkey1,&QWP(16,$key_));
1561e1051a39Sopenharmony_ci	&pxor	($inout1,&QWP(16*1,"esp"));
1562e1051a39Sopenharmony_ci	&pxor	($inout2,&QWP(16*2,"esp"));
1563e1051a39Sopenharmony_ci	 &aesdec	($inout0,$rndkey1);
1564e1051a39Sopenharmony_ci	&pxor	($inout3,&QWP(16*3,"esp"));
1565e1051a39Sopenharmony_ci	&pxor	($inout4,&QWP(16*4,"esp"));
1566e1051a39Sopenharmony_ci	 &aesdec	($inout1,$rndkey1);
1567e1051a39Sopenharmony_ci	&pxor		($inout5,$rndkey0);
1568e1051a39Sopenharmony_ci	 &$movekey	($rndkey0,&QWP(32,$key_));
1569e1051a39Sopenharmony_ci	 &aesdec	($inout2,$rndkey1);
1570e1051a39Sopenharmony_ci	 &aesdec	($inout3,$rndkey1);
1571e1051a39Sopenharmony_ci	 &aesdec	($inout4,$rndkey1);
1572e1051a39Sopenharmony_ci	 &aesdec	($inout5,$rndkey1);
1573e1051a39Sopenharmony_ci	&call		(&label("_aesni_decrypt6_enter"));
1574e1051a39Sopenharmony_ci
1575e1051a39Sopenharmony_ci	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1576e1051a39Sopenharmony_ci       &pxor	($twtmp,$twtmp);
1577e1051a39Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1578e1051a39Sopenharmony_ci       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1579e1051a39Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
1580e1051a39Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
1581e1051a39Sopenharmony_ci	&xorps	($inout2,&QWP(16*2,"esp"));
1582e1051a39Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
1583e1051a39Sopenharmony_ci	&xorps	($inout3,&QWP(16*3,"esp"));
1584e1051a39Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
1585e1051a39Sopenharmony_ci	&xorps	($inout4,&QWP(16*4,"esp"));
1586e1051a39Sopenharmony_ci	&movups	(&QWP(16*3,$out),$inout3);
1587e1051a39Sopenharmony_ci	&xorps	($inout5,$tweak);
1588e1051a39Sopenharmony_ci	&movups	(&QWP(16*4,$out),$inout4);
1589e1051a39Sopenharmony_ci       &pshufd	($twres,$twtmp,0x13);
1590e1051a39Sopenharmony_ci	&movups	(&QWP(16*5,$out),$inout5);
1591e1051a39Sopenharmony_ci	&lea	($out,&DWP(16*6,$out));
1592e1051a39Sopenharmony_ci       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1593e1051a39Sopenharmony_ci
1594e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);
1595e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1596e1051a39Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
1597e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1598e1051a39Sopenharmony_ci	&pxor	($tweak,$twres);
1599e1051a39Sopenharmony_ci
1600e1051a39Sopenharmony_ci	&sub	($len,16*6);
1601e1051a39Sopenharmony_ci	&jnc	(&label("xts_dec_loop6"));
1602e1051a39Sopenharmony_ci
1603e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1604e1051a39Sopenharmony_ci	&mov	($key,$key_);			# restore $key
1605e1051a39Sopenharmony_ci	&mov	($rounds_,$rounds);
1606e1051a39Sopenharmony_ci
1607e1051a39Sopenharmony_ci&set_label("xts_dec_short");
1608e1051a39Sopenharmony_ci	&add	($len,16*6);
1609e1051a39Sopenharmony_ci	&jz	(&label("xts_dec_done6x"));
1610e1051a39Sopenharmony_ci
1611e1051a39Sopenharmony_ci	&movdqa	($inout3,$tweak);		# put aside previous tweak
1612e1051a39Sopenharmony_ci	&cmp	($len,0x20);
1613e1051a39Sopenharmony_ci	&jb	(&label("xts_dec_one"));
1614e1051a39Sopenharmony_ci
1615e1051a39Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
1616e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);
1617e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1618e1051a39Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
1619e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1620e1051a39Sopenharmony_ci	&pxor	($tweak,$twres);
1621e1051a39Sopenharmony_ci	&je	(&label("xts_dec_two"));
1622e1051a39Sopenharmony_ci
1623e1051a39Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
1624e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);
1625e1051a39Sopenharmony_ci	&movdqa	($inout4,$tweak);		# put aside previous tweak
1626e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1627e1051a39Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
1628e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1629e1051a39Sopenharmony_ci	&pxor	($tweak,$twres);
1630e1051a39Sopenharmony_ci	&cmp	($len,0x40);
1631e1051a39Sopenharmony_ci	&jb	(&label("xts_dec_three"));
1632e1051a39Sopenharmony_ci
1633e1051a39Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
1634e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);
1635e1051a39Sopenharmony_ci	&movdqa	($inout5,$tweak);		# put aside previous tweak
1636e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1637e1051a39Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
1638e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1639e1051a39Sopenharmony_ci	&pxor	($tweak,$twres);
1640e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*0,"esp"),$inout3);
1641e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*1,"esp"),$inout4);
1642e1051a39Sopenharmony_ci	&je	(&label("xts_dec_four"));
1643e1051a39Sopenharmony_ci
1644e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*2,"esp"),$inout5);
1645e1051a39Sopenharmony_ci	&pshufd	($inout5,$twtmp,0x13);
1646e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*3,"esp"),$tweak);
1647e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
1648e1051a39Sopenharmony_ci	&pand	($inout5,$twmask);		# isolate carry and residue
1649e1051a39Sopenharmony_ci	&pxor	($inout5,$tweak);
1650e1051a39Sopenharmony_ci
1651e1051a39Sopenharmony_ci	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1652e1051a39Sopenharmony_ci	&movdqu	($inout1,&QWP(16*1,$inp));
1653e1051a39Sopenharmony_ci	&movdqu	($inout2,&QWP(16*2,$inp));
1654e1051a39Sopenharmony_ci	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1655e1051a39Sopenharmony_ci	&movdqu	($inout3,&QWP(16*3,$inp));
1656e1051a39Sopenharmony_ci	&pxor	($inout1,&QWP(16*1,"esp"));
1657e1051a39Sopenharmony_ci	&movdqu	($inout4,&QWP(16*4,$inp));
1658e1051a39Sopenharmony_ci	&pxor	($inout2,&QWP(16*2,"esp"));
1659e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16*5,$inp));
1660e1051a39Sopenharmony_ci	&pxor	($inout3,&QWP(16*3,"esp"));
1661e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1662e1051a39Sopenharmony_ci	&pxor	($inout4,$inout5);
1663e1051a39Sopenharmony_ci
1664e1051a39Sopenharmony_ci	&call	("_aesni_decrypt6");
1665e1051a39Sopenharmony_ci
1666e1051a39Sopenharmony_ci	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1667e1051a39Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1668e1051a39Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
1669e1051a39Sopenharmony_ci	&xorps	($inout2,&QWP(16*2,"esp"));
1670e1051a39Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
1671e1051a39Sopenharmony_ci	&xorps	($inout3,&QWP(16*3,"esp"));
1672e1051a39Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
1673e1051a39Sopenharmony_ci	&xorps	($inout4,$tweak);
1674e1051a39Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
1675e1051a39Sopenharmony_ci	&movups	(&QWP(16*3,$out),$inout3);
1676e1051a39Sopenharmony_ci	&movups	(&QWP(16*4,$out),$inout4);
1677e1051a39Sopenharmony_ci	&lea	($out,&DWP(16*5,$out));
1678e1051a39Sopenharmony_ci	&jmp	(&label("xts_dec_done"));
1679e1051a39Sopenharmony_ci
1680e1051a39Sopenharmony_ci&set_label("xts_dec_one",16);		# tail: one whole block, its tweak is in $inout3
1681e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
1682e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16*1,$inp));		# step $inp past the block
1683e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
1684e1051a39Sopenharmony_ci	if ($inline)				# single-block decrypt: emitted inline or as a call
1685e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
1686e1051a39Sopenharmony_ci	else
1687e1051a39Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
1688e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
1689e1051a39Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
1690e1051a39Sopenharmony_ci	&lea	($out,&DWP(16*1,$out));		# step $out past the block
1691e1051a39Sopenharmony_ci
1692e1051a39Sopenharmony_ci	&movdqa	($tweak,$inout3);		# last tweak
1693e1051a39Sopenharmony_ci	&jmp	(&label("xts_dec_done"));	# xts_dec_done tests for a $len%16 remainder
1694e1051a39Sopenharmony_ci
1695e1051a39Sopenharmony_ci&set_label("xts_dec_two",16);		# tail: two whole blocks, tweaks in $inout3 and $tweak
1696e1051a39Sopenharmony_ci	&movaps	($inout4,$tweak);		# put aside last tweak
1697e1051a39Sopenharmony_ci
1698e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
1699e1051a39Sopenharmony_ci	&movups	($inout1,&QWP(16*1,$inp));
1700e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16*2,$inp));		# step $inp past both blocks
1701e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
1702e1051a39Sopenharmony_ci	&xorps	($inout1,$inout4);
1703e1051a39Sopenharmony_ci
1704e1051a39Sopenharmony_ci	&call	("_aesni_decrypt2");		# $inout3/$inout4 are read again below, so they must survive the call
1705e1051a39Sopenharmony_ci
1706e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
1707e1051a39Sopenharmony_ci	&xorps	($inout1,$inout4);
1708e1051a39Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
1709e1051a39Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
1710e1051a39Sopenharmony_ci	&lea	($out,&DWP(16*2,$out));		# step $out past both blocks
1711e1051a39Sopenharmony_ci
1712e1051a39Sopenharmony_ci	&movdqa	($tweak,$inout4);		# last tweak
1713e1051a39Sopenharmony_ci	&jmp	(&label("xts_dec_done"));	# xts_dec_done tests for a $len%16 remainder
1714e1051a39Sopenharmony_ci
1715e1051a39Sopenharmony_ci&set_label("xts_dec_three",16);		# tail: three whole blocks, tweaks in $inout3, $inout4 and $tweak
1716e1051a39Sopenharmony_ci	&movaps	($inout5,$tweak);		# put aside last tweak
1717e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
1718e1051a39Sopenharmony_ci	&movups	($inout1,&QWP(16*1,$inp));
1719e1051a39Sopenharmony_ci	&movups	($inout2,&QWP(16*2,$inp));
1720e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16*3,$inp));		# step $inp past the three blocks
1721e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
1722e1051a39Sopenharmony_ci	&xorps	($inout1,$inout4);
1723e1051a39Sopenharmony_ci	&xorps	($inout2,$inout5);
1724e1051a39Sopenharmony_ci
1725e1051a39Sopenharmony_ci	&call	("_aesni_decrypt3");		# $inout3..$inout5 are read again below, so they must survive the call
1726e1051a39Sopenharmony_ci
1727e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
1728e1051a39Sopenharmony_ci	&xorps	($inout1,$inout4);
1729e1051a39Sopenharmony_ci	&xorps	($inout2,$inout5);
1730e1051a39Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
1731e1051a39Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
1732e1051a39Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
1733e1051a39Sopenharmony_ci	&lea	($out,&DWP(16*3,$out));		# step $out past the three blocks
1734e1051a39Sopenharmony_ci
1735e1051a39Sopenharmony_ci	&movdqa	($tweak,$inout5);		# last tweak
1736e1051a39Sopenharmony_ci	&jmp	(&label("xts_dec_done"));	# xts_dec_done tests for a $len%16 remainder
1737e1051a39Sopenharmony_ci
1738e1051a39Sopenharmony_ci&set_label("xts_dec_four",16);		# tail: four whole blocks; tweaks 0,1 in the frame, 2 in $inout5, 3 in $tweak
1739e1051a39Sopenharmony_ci	&movaps	($inout4,$tweak);		# put aside last tweak
1740e1051a39Sopenharmony_ci
1741e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
1742e1051a39Sopenharmony_ci	&movups	($inout1,&QWP(16*1,$inp));
1743e1051a39Sopenharmony_ci	&movups	($inout2,&QWP(16*2,$inp));
1744e1051a39Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1745e1051a39Sopenharmony_ci	&movups	($inout3,&QWP(16*3,$inp));
1746e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16*4,$inp));		# step $inp past the four blocks
1747e1051a39Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
1748e1051a39Sopenharmony_ci	&xorps	($inout2,$inout5);
1749e1051a39Sopenharmony_ci	&xorps	($inout3,$inout4);
1750e1051a39Sopenharmony_ci
1751e1051a39Sopenharmony_ci	&call	("_aesni_decrypt4");		# $inout4/$inout5 are read again below, so they must survive the call
1752e1051a39Sopenharmony_ci
1753e1051a39Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1754e1051a39Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
1755e1051a39Sopenharmony_ci	&xorps	($inout2,$inout5);
1756e1051a39Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
1757e1051a39Sopenharmony_ci	&xorps	($inout3,$inout4);
1758e1051a39Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
1759e1051a39Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
1760e1051a39Sopenharmony_ci	&movups	(&QWP(16*3,$out),$inout3);
1761e1051a39Sopenharmony_ci	&lea	($out,&DWP(16*4,$out));		# step $out past the four blocks
1762e1051a39Sopenharmony_ci
1763e1051a39Sopenharmony_ci	&movdqa	($tweak,$inout4);		# last tweak
1764e1051a39Sopenharmony_ci	&jmp	(&label("xts_dec_done"));	# xts_dec_done tests for a $len%16 remainder
1765e1051a39Sopenharmony_ci
1766e1051a39Sopenharmony_ci&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
1767e1051a39Sopenharmony_ci	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1768e1051a39Sopenharmony_ci	&and	($len,15);			# $len%16 = bytes in a trailing partial block
1769e1051a39Sopenharmony_ci	&jz	(&label("xts_dec_ret"));	# no partial block: finished
1770e1051a39Sopenharmony_ci	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1771e1051a39Sopenharmony_ci	&jmp	(&label("xts_dec_only_one_more"));	# bypasses the tweak update done under xts_dec_done
1772e1051a39Sopenharmony_ci
1773e1051a39Sopenharmony_ci&set_label("xts_dec_done",16);		# whole blocks finished; $tweak holds the last tweak used
1774e1051a39Sopenharmony_ci	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1775e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);		# zero, operand for the signed compare below
1776e1051a39Sopenharmony_ci	&and	($len,15);			# $len%16 = bytes in a trailing partial block
1777e1051a39Sopenharmony_ci	&jz	(&label("xts_dec_ret"));	# no partial block: finished
1778e1051a39Sopenharmony_ci
1779e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1780e1051a39Sopenharmony_ci	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1781e1051a39Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);		# permute the per-dword sign masks
1782e1051a39Sopenharmony_ci	&pxor	($twtmp,$twtmp);		# zero again for the second compare
1783e1051a39Sopenharmony_ci	&movdqa	($twmask,&QWP(16*6,"esp"));	# mask constant kept in the frame
1784e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1785e1051a39Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
1786e1051a39Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1787e1051a39Sopenharmony_ci	&pxor	($tweak,$twres);		# fold carry/residue into the shifted tweak; falls through
1788e1051a39Sopenharmony_ci
1789e1051a39Sopenharmony_ci&set_label("xts_dec_only_one_more");	# derive one further tweak, decrypt the block at $inp with it
1790e1051a39Sopenharmony_ci	&pshufd	($inout3,$twtmp,0x13);		# same 0x13 permutation of $twtmp as under xts_dec_done
1791e1051a39Sopenharmony_ci	&movdqa	($inout4,$tweak);		# put aside previous tweak
1792e1051a39Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1793e1051a39Sopenharmony_ci	&pand	($inout3,$twmask);		# isolate carry and residue
1794e1051a39Sopenharmony_ci	&pxor	($inout3,$tweak);		# $inout3 = newer tweak, $inout4 = previous one
1795e1051a39Sopenharmony_ci
1796e1051a39Sopenharmony_ci	&mov	($key,$key_);			# restore $key
1797e1051a39Sopenharmony_ci	&mov	($rounds,$rounds_);		# restore $rounds
1798e1051a39Sopenharmony_ci
1799e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));		# load input
1800e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
1801e1051a39Sopenharmony_ci	if ($inline)				# single-block decrypt: emitted inline or as a call
1802e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
1803e1051a39Sopenharmony_ci	else
1804e1051a39Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
1805e1051a39Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
1806e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);		# write output
1807e1051a39Sopenharmony_ci
1808e1051a39Sopenharmony_ci&set_label("xts_dec_steal");		# byte loop: counts $len down to zero
1809e1051a39Sopenharmony_ci	&movz	($rounds,&BP(16,$inp));		# byte at $inp+16 ($rounds is scratch here)
1810e1051a39Sopenharmony_ci	&movz	($key,&BP(0,$out));		# byte at $out ($key is scratch here)
1811e1051a39Sopenharmony_ci	&lea	($inp,&DWP(1,$inp));
1812e1051a39Sopenharmony_ci	&mov	(&BP(0,$out),&LB($rounds));	# out[i] = in[16+i]
1813e1051a39Sopenharmony_ci	&mov	(&BP(16,$out),&LB($key));	# out[16+i] = former out[i]
1814e1051a39Sopenharmony_ci	&lea	($out,&DWP(1,$out));
1815e1051a39Sopenharmony_ci	&sub	($len,1);
1816e1051a39Sopenharmony_ci	&jnz	(&label("xts_dec_steal"));
1817e1051a39Sopenharmony_ci
1818e1051a39Sopenharmony_ci	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1819e1051a39Sopenharmony_ci	&mov	($key,$key_);			# restore $key
1820e1051a39Sopenharmony_ci	&mov	($rounds,$rounds_);		# restore $rounds
1821e1051a39Sopenharmony_ci
1822e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(0,$out));		# load input
1823e1051a39Sopenharmony_ci	&xorps	($inout0,$inout4);		# input^=tweak
1824e1051a39Sopenharmony_ci	if ($inline)				# single-block decrypt: emitted inline or as a call
1825e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
1826e1051a39Sopenharmony_ci	else
1827e1051a39Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
1828e1051a39Sopenharmony_ci	&xorps	($inout0,$inout4);		# output^=tweak
1829e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);		# write output
1830e1051a39Sopenharmony_ci
1831e1051a39Sopenharmony_ci&set_label("xts_dec_ret");		# common exit: scrub xmm0-xmm7 and frame slots 16*0..16*5
1832e1051a39Sopenharmony_ci	&pxor	("xmm0","xmm0");		# clear register bank
1833e1051a39Sopenharmony_ci	&pxor	("xmm1","xmm1");
1834e1051a39Sopenharmony_ci	&pxor	("xmm2","xmm2");
1835e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
1836e1051a39Sopenharmony_ci	&pxor	("xmm3","xmm3");
1837e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*1,"esp"),"xmm0");
1838e1051a39Sopenharmony_ci	&pxor	("xmm4","xmm4");
1839e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*2,"esp"),"xmm0");
1840e1051a39Sopenharmony_ci	&pxor	("xmm5","xmm5");
1841e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*3,"esp"),"xmm0");
1842e1051a39Sopenharmony_ci	&pxor	("xmm6","xmm6");
1843e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*4,"esp"),"xmm0");
1844e1051a39Sopenharmony_ci	&pxor	("xmm7","xmm7");
1845e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*5,"esp"),"xmm0");	# last slot zeroed here; 16*6 is not written
1846e1051a39Sopenharmony_ci	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1847e1051a39Sopenharmony_ci&function_end("aesni_xts_decrypt");
1848e1051a39Sopenharmony_ci}
1849e1051a39Sopenharmony_ci
1850e1051a39Sopenharmony_ci######################################################################
1851e1051a39Sopenharmony_ci# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
1852e1051a39Sopenharmony_ci#	const AES_KEY *key, unsigned int start_block_num,
1853e1051a39Sopenharmony_ci#	unsigned char offset_i[16], const unsigned char L_[][16],
1854e1051a39Sopenharmony_ci#	unsigned char checksum[16]);
1855e1051a39Sopenharmony_ci#
1856e1051a39Sopenharmony_ci{
1857e1051a39Sopenharmony_ci# offsets within stack frame
1858e1051a39Sopenharmony_cimy $checksum = 16*6;	# slot where the running checksum is spilled
1859e1051a39Sopenharmony_cimy ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));	# five consecutive dwords from 16*7
1860e1051a39Sopenharmony_ci
1861e1051a39Sopenharmony_ci# reassigned registers
1862e1051a39Sopenharmony_cimy ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);	# aliases: each new name shares the register of the old one
1863e1051a39Sopenharmony_ci# $l_, $block, $inp, $key are permanently allocated in registers;
1864e1051a39Sopenharmony_ci# the remaining non-volatile values are offloaded to the stack, where
1865e1051a39Sopenharmony_ci# they stay invariant once written.
1866e1051a39Sopenharmony_ci
1867e1051a39Sopenharmony_ci&function_begin("aesni_ocb_encrypt");
1868e1051a39Sopenharmony_ci	&mov	($rounds,&wparam(5));		# &offset_i
1869e1051a39Sopenharmony_ci	&mov	($rounds_,&wparam(7));		# &checksum
1870e1051a39Sopenharmony_ci
1871e1051a39Sopenharmony_ci	&mov	($inp,&wparam(0));
1872e1051a39Sopenharmony_ci	&mov	($out,&wparam(1));
1873e1051a39Sopenharmony_ci	&mov	($len,&wparam(2));		# number of blocks
1874e1051a39Sopenharmony_ci	&mov	($key,&wparam(3));
1875e1051a39Sopenharmony_ci	&movdqu	($rndkey0,&QWP(0,$rounds));	# load offset_i
1876e1051a39Sopenharmony_ci	&mov	($block,&wparam(4));		# start_block_num
1877e1051a39Sopenharmony_ci	&movdqu	($rndkey1,&QWP(0,$rounds_));	# load checksum
1878e1051a39Sopenharmony_ci	&mov	($l_,&wparam(6));		# L_
1879e1051a39Sopenharmony_ci
1880e1051a39Sopenharmony_ci	&mov	($rounds,"esp");		# remember incoming %esp
1881e1051a39Sopenharmony_ci	&sub	("esp",$esp_off+4);		# alloca
1882e1051a39Sopenharmony_ci	&and	("esp",-16);			# align stack
1883e1051a39Sopenharmony_ci
1884e1051a39Sopenharmony_ci	&sub	($out,$inp);			# $out = out-inp; stores are addressed as ($out,$inp)
1885e1051a39Sopenharmony_ci	&shl	($len,4);			# blocks -> bytes
1886e1051a39Sopenharmony_ci	&lea	($len,&DWP(-16*6,$inp,$len));	# end of input - 16*6
1887e1051a39Sopenharmony_ci	&mov	(&DWP($out_off,"esp"),$out);
1888e1051a39Sopenharmony_ci	&mov	(&DWP($end_off,"esp"),$len);
1889e1051a39Sopenharmony_ci	&mov	(&DWP($esp_off,"esp"),$rounds);	# incoming %esp saved in the frame
1890e1051a39Sopenharmony_ci
1891e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));	# $rounds <- dword at 240($key)
1892e1051a39Sopenharmony_ci
1893e1051a39Sopenharmony_ci	&test	($block,1);
1894e1051a39Sopenharmony_ci	&jnz	(&label("odd"));		# odd $block: no leading single block needed
1895e1051a39Sopenharmony_ci
1896e1051a39Sopenharmony_ci	&bsf		($i3,$block);			# bit index of lowest set bit; NOTE(review): bsf is undefined for 0, presumably $block >= 1 - confirm with caller
1897e1051a39Sopenharmony_ci	&add		($block,1);			# this path consumes one block, $block becomes odd
1898e1051a39Sopenharmony_ci	&shl		($i3,4);			# *16: byte offset into L_
1899e1051a39Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i3));	# L_ entry selected by that bit index
1900e1051a39Sopenharmony_ci	&mov		($i3,$key);			# put aside key
1901e1051a39Sopenharmony_ci
1902e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
1903e1051a39Sopenharmony_ci	&lea		($inp,&DWP(16,$inp));
1904e1051a39Sopenharmony_ci
1905e1051a39Sopenharmony_ci	&pxor		($inout5,$rndkey0);		# ^ last offset_i
1906e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
1907e1051a39Sopenharmony_ci	&pxor		($inout0,$inout5);		# ^ offset_i
1908e1051a39Sopenharmony_ci
1909e1051a39Sopenharmony_ci	&movdqa		($inout4,$rndkey1);		# park checksum in $inout4 across the encryption
1910e1051a39Sopenharmony_ci	if ($inline)					# single-block encrypt: emitted inline or as a call
1911e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
1912e1051a39Sopenharmony_ci	else
1913e1051a39Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
1914e1051a39Sopenharmony_ci
1915e1051a39Sopenharmony_ci	&xorps		($inout0,$inout5);		# ^ offset_i
1916e1051a39Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
1917e1051a39Sopenharmony_ci	&movdqa		($rndkey1,$inout4);		# pass the checksum
1918e1051a39Sopenharmony_ci
1919e1051a39Sopenharmony_ci	&movups		(&QWP(-16,$out,$inp),$inout0);	# store output
1920e1051a39Sopenharmony_ci
1921e1051a39Sopenharmony_ci	&mov		($rounds,&DWP(240,$i3));	# reload $rounds via the key pointer put aside in $i3
1922e1051a39Sopenharmony_ci	&mov		($key,$i3);			# restore key
1923e1051a39Sopenharmony_ci	&mov		($len,&DWP($end_off,"esp"));	# $len doubled as $i3; reload the end pointer
1924e1051a39Sopenharmony_ci
1925e1051a39Sopenharmony_ci&set_label("odd");				# $block is odd from here on
1926e1051a39Sopenharmony_ci	&shl		($rounds,4);
1927e1051a39Sopenharmony_ci	&mov		($out,16);
1928e1051a39Sopenharmony_ci	&sub		($out,$rounds);			# twisted rounds
1929e1051a39Sopenharmony_ci	&mov		(&DWP($key_off,"esp"),$key);	# keep the unmodified key pointer in the frame
1930e1051a39Sopenharmony_ci	&lea		($key,&DWP(32,$key,$rounds));	# end of key schedule
1931e1051a39Sopenharmony_ci	&mov		(&DWP($rounds_off,"esp"),$out);	# 16-16*rounds, reloaded into $rounds by the multi-block paths
1932e1051a39Sopenharmony_ci
1933e1051a39Sopenharmony_ci	&cmp		($inp,$len);
1934e1051a39Sopenharmony_ci	&ja		(&label("short"));		# $inp beyond end-16*6: fewer than six blocks left
1935e1051a39Sopenharmony_ci	&jmp		(&label("grandloop"));
1936e1051a39Sopenharmony_ci
1937e1051a39Sopenharmony_ci&set_label("grandloop",32);			# six blocks per iteration
1938e1051a39Sopenharmony_ci	&lea		($i1,&DWP(1,$block));		# $block is odd, so +1,+3,+5 are the even block numbers
1939e1051a39Sopenharmony_ci	&lea		($i3,&DWP(3,$block));
1940e1051a39Sopenharmony_ci	&lea		($i5,&DWP(5,$block));
1941e1051a39Sopenharmony_ci	&add		($block,6);
1942e1051a39Sopenharmony_ci	&bsf		($i1,$i1);			# bit index of lowest set bit
1943e1051a39Sopenharmony_ci	&bsf		($i3,$i3);
1944e1051a39Sopenharmony_ci	&bsf		($i5,$i5);
1945e1051a39Sopenharmony_ci	&shl		($i1,4);			# *16: byte offsets into L_
1946e1051a39Sopenharmony_ci	&shl		($i3,4);
1947e1051a39Sopenharmony_ci	&shl		($i5,4);
1948e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(0,$l_));		# L_[0], shared by blocks +0,+2,+4
1949e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(0,$l_,$i1));
1950e1051a39Sopenharmony_ci	&mov		($rounds,&DWP($rounds_off,"esp"));	# $rounds doubled as $i1; reload twisted rounds
1951e1051a39Sopenharmony_ci	&movdqa		($inout2,$inout0);
1952e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_,$i3));
1953e1051a39Sopenharmony_ci	&movdqa		($inout4,$inout0);
1954e1051a39Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i5));
1955e1051a39Sopenharmony_ci
1956e1051a39Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ last offset_i
1957e1051a39Sopenharmony_ci	&pxor		($inout1,$inout0);		# each offset = previous offset ^ its L_ entry
1958e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$inout0);	# spill the six offsets to frame slots 0..5
1959e1051a39Sopenharmony_ci	&pxor		($inout2,$inout1);
1960e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$inout1);
1961e1051a39Sopenharmony_ci	&pxor		($inout3,$inout2);
1962e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*2,"esp"),$inout2);
1963e1051a39Sopenharmony_ci	&pxor		($inout4,$inout3);
1964e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$inout3);
1965e1051a39Sopenharmony_ci	&pxor		($inout5,$inout4);
1966e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*4,"esp"),$inout4);
1967e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*5,"esp"),$inout5);
1968e1051a39Sopenharmony_ci
1969e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));	# with $key/$rounds as set under "odd" this is offset 0 of the key schedule
1970e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
1971e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
1972e1051a39Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
1973e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(16*3,$inp));
1974e1051a39Sopenharmony_ci	&movdqu		($inout4,&QWP(16*4,$inp));
1975e1051a39Sopenharmony_ci	&movdqu		($inout5,&QWP(16*5,$inp));
1976e1051a39Sopenharmony_ci	&lea		($inp,&DWP(16*6,$inp));		# advance now; stores below use negative displacements
1977e1051a39Sopenharmony_ci
1978e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
1979e1051a39Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
1980e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout1);
1981e1051a39Sopenharmony_ci	&pxor		($inout1,$rndkey0);
1982e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout2);
1983e1051a39Sopenharmony_ci	&pxor		($inout2,$rndkey0);
1984e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout3);
1985e1051a39Sopenharmony_ci	&pxor		($inout3,$rndkey0);
1986e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout4);
1987e1051a39Sopenharmony_ci	&pxor		($inout4,$rndkey0);
1988e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout5);
1989e1051a39Sopenharmony_ci	&pxor		($inout5,$rndkey0);
1990e1051a39Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);	# spill checksum; $rndkey1 is reused for round keys
1991e1051a39Sopenharmony_ci
1992e1051a39Sopenharmony_ci	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));	# next 16 bytes of the key schedule
1993e1051a39Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
1994e1051a39Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
1995e1051a39Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
1996e1051a39Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
1997e1051a39Sopenharmony_ci	&pxor		($inout4,&QWP(16*4,"esp"));
1998e1051a39Sopenharmony_ci	&pxor		($inout5,&QWP(16*5,"esp"));
1999e1051a39Sopenharmony_ci
2000e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
2001e1051a39Sopenharmony_ci	&aesenc		($inout0,$rndkey1);		# first round is issued here, before the call
2002e1051a39Sopenharmony_ci	&aesenc		($inout1,$rndkey1);
2003e1051a39Sopenharmony_ci	&aesenc		($inout2,$rndkey1);
2004e1051a39Sopenharmony_ci	&aesenc		($inout3,$rndkey1);
2005e1051a39Sopenharmony_ci	&aesenc		($inout4,$rndkey1);
2006e1051a39Sopenharmony_ci	&aesenc		($inout5,$rndkey1);
2007e1051a39Sopenharmony_ci
2008e1051a39Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));	# $out doubled as $i5; reload
2009e1051a39Sopenharmony_ci	&mov		($len,&DWP($end_off,"esp"));	# $len doubled as $i3; reload
2010e1051a39Sopenharmony_ci	&call		("_aesni_encrypt6_enter");	# presumably runs the remaining rounds - defined outside this section
2011e1051a39Sopenharmony_ci
2012e1051a39Sopenharmony_ci	&movdqa		($rndkey0,&QWP(16*5,"esp"));	# pass last offset_i
2013e1051a39Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2014e1051a39Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
2015e1051a39Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
2016e1051a39Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
2017e1051a39Sopenharmony_ci	&pxor		($inout4,&QWP(16*4,"esp"));
2018e1051a39Sopenharmony_ci	&pxor		($inout5,$rndkey0);
2019e1051a39Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2020e1051a39Sopenharmony_ci
2021e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16*6,$out,$inp),$inout0);# store output
2022e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16*5,$out,$inp),$inout1);
2023e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16*4,$out,$inp),$inout2);
2024e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16*3,$out,$inp),$inout3);
2025e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16*2,$out,$inp),$inout4);
2026e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16*1,$out,$inp),$inout5);
2027e1051a39Sopenharmony_ci	&cmp		($inp,$len);			# done yet?
2028e1051a39Sopenharmony_ci	&jbe		(&label("grandloop"));		# at least six more blocks remain
2029e1051a39Sopenharmony_ci
2030e1051a39Sopenharmony_ci&set_label("short");				# fewer than six blocks left: dispatch on the remaining byte count
2031e1051a39Sopenharmony_ci	&add		($len,16*6);			# back to the real end of input
2032e1051a39Sopenharmony_ci	&sub		($len,$inp);			# bytes remaining
2033e1051a39Sopenharmony_ci	&jz		(&label("done"));
2034e1051a39Sopenharmony_ci
2035e1051a39Sopenharmony_ci	&cmp		($len,16*2);
2036e1051a39Sopenharmony_ci	&jb		(&label("one"));
2037e1051a39Sopenharmony_ci	&je		(&label("two"));
2038e1051a39Sopenharmony_ci
2039e1051a39Sopenharmony_ci	&cmp		($len,16*4);
2040e1051a39Sopenharmony_ci	&jb		(&label("three"));
2041e1051a39Sopenharmony_ci	&je		(&label("four"));
2042e1051a39Sopenharmony_ci
2043e1051a39Sopenharmony_ci	&lea		($i1,&DWP(1,$block));		# five blocks remain: handled below in a six-lane pass
2044e1051a39Sopenharmony_ci	&lea		($i3,&DWP(3,$block));
2045e1051a39Sopenharmony_ci	&bsf		($i1,$i1);			# bit index of lowest set bit
2046e1051a39Sopenharmony_ci	&bsf		($i3,$i3);
2047e1051a39Sopenharmony_ci	&shl		($i1,4);			# *16: byte offsets into L_
2048e1051a39Sopenharmony_ci	&shl		($i3,4);
2049e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(0,$l_));		# L_[0], shared by blocks +0,+2,+4
2050e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(0,$l_,$i1));
2051e1051a39Sopenharmony_ci	&mov		($rounds,&DWP($rounds_off,"esp"));	# $rounds doubled as $i1; reload twisted rounds
2052e1051a39Sopenharmony_ci	&movdqa		($inout2,$inout0);
2053e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_,$i3));
2054e1051a39Sopenharmony_ci	&movdqa		($inout4,$inout0);
2055e1051a39Sopenharmony_ci
2056e1051a39Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ last offset_i
2057e1051a39Sopenharmony_ci	&pxor		($inout1,$inout0);		# each offset = previous offset ^ its L_ entry
2058e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$inout0);	# spill the five offsets to frame slots 0..4
2059e1051a39Sopenharmony_ci	&pxor		($inout2,$inout1);
2060e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$inout1);
2061e1051a39Sopenharmony_ci	&pxor		($inout3,$inout2);
2062e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*2,"esp"),$inout2);
2063e1051a39Sopenharmony_ci	&pxor		($inout4,$inout3);
2064e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$inout3);
2065e1051a39Sopenharmony_ci	&pxor		($inout5,$inout4);		# NOTE(review): result appears unused - $inout5 is zeroed below before its next read
2066e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*4,"esp"),$inout4);
2067e1051a39Sopenharmony_ci
2068e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));	# with $key/$rounds as set under "odd" this is offset 0 of the key schedule
2069e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2070e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
2071e1051a39Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
2072e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(16*3,$inp));
2073e1051a39Sopenharmony_ci	&movdqu		($inout4,&QWP(16*4,$inp));
2074e1051a39Sopenharmony_ci	&pxor		($inout5,$inout5);		# sixth lane carries a zero block; its result is never stored
2075e1051a39Sopenharmony_ci
2076e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
2077e1051a39Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
2078e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout1);
2079e1051a39Sopenharmony_ci	&pxor		($inout1,$rndkey0);
2080e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout2);
2081e1051a39Sopenharmony_ci	&pxor		($inout2,$rndkey0);
2082e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout3);
2083e1051a39Sopenharmony_ci	&pxor		($inout3,$rndkey0);
2084e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout4);
2085e1051a39Sopenharmony_ci	&pxor		($inout4,$rndkey0);
2086e1051a39Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);	# spill checksum; $rndkey1 is reused for round keys
2087e1051a39Sopenharmony_ci
2088e1051a39Sopenharmony_ci	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));	# next 16 bytes of the key schedule
2089e1051a39Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2090e1051a39Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
2091e1051a39Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
2092e1051a39Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
2093e1051a39Sopenharmony_ci	&pxor		($inout4,&QWP(16*4,"esp"));
2094e1051a39Sopenharmony_ci
2095e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
2096e1051a39Sopenharmony_ci	&aesenc		($inout0,$rndkey1);		# first round is issued here, before the call
2097e1051a39Sopenharmony_ci	&aesenc		($inout1,$rndkey1);
2098e1051a39Sopenharmony_ci	&aesenc		($inout2,$rndkey1);
2099e1051a39Sopenharmony_ci	&aesenc		($inout3,$rndkey1);
2100e1051a39Sopenharmony_ci	&aesenc		($inout4,$rndkey1);
2101e1051a39Sopenharmony_ci	&aesenc		($inout5,$rndkey1);
2102e1051a39Sopenharmony_ci
2103e1051a39Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));	# $out may hold other data here (e.g. twisted rounds when arriving from "odd")
2104e1051a39Sopenharmony_ci	&call		("_aesni_encrypt6_enter");	# presumably runs the remaining rounds - defined outside this section
2105e1051a39Sopenharmony_ci
2106e1051a39Sopenharmony_ci	&movdqa		($rndkey0,&QWP(16*4,"esp"));	# pass last offset_i
2107e1051a39Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2108e1051a39Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
2109e1051a39Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
2110e1051a39Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
2111e1051a39Sopenharmony_ci	&pxor		($inout4,$rndkey0);
2112e1051a39Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2113e1051a39Sopenharmony_ci
2114e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*0,$out,$inp),$inout0);	# store output
2115e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*1,$out,$inp),$inout1);
2116e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*2,$out,$inp),$inout2);
2117e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*3,$out,$inp),$inout3);
2118e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*4,$out,$inp),$inout4);
2119e1051a39Sopenharmony_ci
2120e1051a39Sopenharmony_ci	&jmp		(&label("done"));
2121e1051a39Sopenharmony_ci
2122e1051a39Sopenharmony_ci&set_label("one",16);				# tail: one block left
2123e1051a39Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_));		# L_[0]
2124e1051a39Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
2125e1051a39Sopenharmony_ci
2126e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2127e1051a39Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));	# $rounds <- dword at 240($key)
2128e1051a39Sopenharmony_ci
2129e1051a39Sopenharmony_ci	&pxor		($inout5,$rndkey0);		# ^ last offset_i
2130e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
2131e1051a39Sopenharmony_ci	&pxor		($inout0,$inout5);		# ^ offset_i
2132e1051a39Sopenharmony_ci
2133e1051a39Sopenharmony_ci	&movdqa		($inout4,$rndkey1);		# park checksum in $inout4 across the encryption
2134e1051a39Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));	# reload out-inp saved by the prologue
2135e1051a39Sopenharmony_ci	if ($inline)					# single-block encrypt: emitted inline or as a call
2136e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
2137e1051a39Sopenharmony_ci	else
2138e1051a39Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
2139e1051a39Sopenharmony_ci
2140e1051a39Sopenharmony_ci	&xorps		($inout0,$inout5);		# ^ offset_i
2141e1051a39Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2142e1051a39Sopenharmony_ci	&movdqa		($rndkey1,$inout4);		# pass the checksum
2143e1051a39Sopenharmony_ci	&movups		(&QWP(0,$out,$inp),$inout0);	# store output ($inp is not advanced on this path)
2144e1051a39Sopenharmony_ci
2145e1051a39Sopenharmony_ci	&jmp		(&label("done"));
2146e1051a39Sopenharmony_ci
2147e1051a39Sopenharmony_ci&set_label("two",16);				# tail: two blocks left
2148e1051a39Sopenharmony_ci	&lea		($i1,&DWP(1,$block));		# number of the second block
2149e1051a39Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
2150e1051a39Sopenharmony_ci	&bsf		($i1,$i1);			# bit index of lowest set bit
2151e1051a39Sopenharmony_ci	&shl		($i1,4);			# *16: byte offset into L_
2152e1051a39Sopenharmony_ci	&movdqu		($inout4,&QWP(0,$l_));		# L_[0] for the first block
2153e1051a39Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i1));	# L_ entry for the second block
2154e1051a39Sopenharmony_ci
2155e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2156e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
2157e1051a39Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));	# $rounds doubled as $i1; reload from 240($key)
2158e1051a39Sopenharmony_ci
2159e1051a39Sopenharmony_ci	&pxor		($inout4,$rndkey0);		# ^ last offset_i
2160e1051a39Sopenharmony_ci	&pxor		($inout5,$inout4);		# second offset = first offset ^ its L_ entry
2161e1051a39Sopenharmony_ci
2162e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
2163e1051a39Sopenharmony_ci	&pxor		($inout0,$inout4);		# ^ offset_i
2164e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout1);
2165e1051a39Sopenharmony_ci	&pxor		($inout1,$inout5);
2166e1051a39Sopenharmony_ci
2167e1051a39Sopenharmony_ci	&movdqa		($inout3,$rndkey1);		# park checksum in $inout3 across the call
2168e1051a39Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));	# reload out-inp saved by the prologue
2169e1051a39Sopenharmony_ci	&call		("_aesni_encrypt2");
2170e1051a39Sopenharmony_ci
2171e1051a39Sopenharmony_ci	&xorps		($inout0,$inout4);		# ^ offset_i
2172e1051a39Sopenharmony_ci	&xorps		($inout1,$inout5);
2173e1051a39Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2174e1051a39Sopenharmony_ci	&movdqa		($rndkey1,$inout3);		# pass the checksum
2175e1051a39Sopenharmony_ci	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
2176e1051a39Sopenharmony_ci	&movups		(&QWP(16*1,$out,$inp),$inout1);
2177e1051a39Sopenharmony_ci
2178e1051a39Sopenharmony_ci	&jmp		(&label("done"));
2179e1051a39Sopenharmony_ci
2180e1051a39Sopenharmony_ci&set_label("three",16);				# tail: three blocks left
2181e1051a39Sopenharmony_ci	&lea		($i1,&DWP(1,$block));		# number of the second block
2182e1051a39Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
2183e1051a39Sopenharmony_ci	&bsf		($i1,$i1);			# bit index of lowest set bit
2184e1051a39Sopenharmony_ci	&shl		($i1,4);			# *16: byte offset into L_
2185e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_));		# L_[0] for the first block
2186e1051a39Sopenharmony_ci	&movdqu		($inout4,&QWP(0,$l_,$i1));	# L_ entry for the second block
2187e1051a39Sopenharmony_ci	&movdqa		($inout5,$inout3);		# L_[0] again for the third block
2188e1051a39Sopenharmony_ci
2189e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2190e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
2191e1051a39Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
2192e1051a39Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));	# $rounds doubled as $i1; reload from 240($key)
2193e1051a39Sopenharmony_ci
2194e1051a39Sopenharmony_ci	&pxor		($inout3,$rndkey0);		# ^ last offset_i
2195e1051a39Sopenharmony_ci	&pxor		($inout4,$inout3);		# each offset = previous offset ^ its L_ entry
2196e1051a39Sopenharmony_ci	&pxor		($inout5,$inout4);
2197e1051a39Sopenharmony_ci
2198e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
2199e1051a39Sopenharmony_ci	&pxor		($inout0,$inout3);		# ^ offset_i
2200e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout1);
2201e1051a39Sopenharmony_ci	&pxor		($inout1,$inout4);
2202e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout2);
2203e1051a39Sopenharmony_ci	&pxor		($inout2,$inout5);
2204e1051a39Sopenharmony_ci
2205e1051a39Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);	# spill checksum to the frame across the call
2206e1051a39Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));	# reload out-inp saved by the prologue
2207e1051a39Sopenharmony_ci	&call		("_aesni_encrypt3");
2208e1051a39Sopenharmony_ci
2209e1051a39Sopenharmony_ci	&xorps		($inout0,$inout3);		# ^ offset_i
2210e1051a39Sopenharmony_ci	&xorps		($inout1,$inout4);
2211e1051a39Sopenharmony_ci	&xorps		($inout2,$inout5);
2212e1051a39Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2213e1051a39Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2214e1051a39Sopenharmony_ci	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
2215e1051a39Sopenharmony_ci	&movups		(&QWP(16*1,$out,$inp),$inout1);
2216e1051a39Sopenharmony_ci	&movups		(&QWP(16*2,$out,$inp),$inout2);
2217e1051a39Sopenharmony_ci
2218e1051a39Sopenharmony_ci	&jmp		(&label("done"));
2219e1051a39Sopenharmony_ci
2220e1051a39Sopenharmony_ci&set_label("four",16);				# tail: four blocks left; falls through to "done"
2221e1051a39Sopenharmony_ci	&lea		($i1,&DWP(1,$block));		# numbers of the second and fourth block
2222e1051a39Sopenharmony_ci	&lea		($i3,&DWP(3,$block));
2223e1051a39Sopenharmony_ci	&bsf		($i1,$i1);			# bit index of lowest set bit
2224e1051a39Sopenharmony_ci	&bsf		($i3,$i3);
2225e1051a39Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
2226e1051a39Sopenharmony_ci	&shl		($i1,4);			# *16: byte offsets into L_
2227e1051a39Sopenharmony_ci	&shl		($i3,4);
2228e1051a39Sopenharmony_ci	&movdqu		($inout2,&QWP(0,$l_));		# L_[0] for the first block
2229e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_,$i1));
2230e1051a39Sopenharmony_ci	&movdqa		($inout4,$inout2);		# L_[0] again for the third block
2231e1051a39Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i3));
2232e1051a39Sopenharmony_ci
2233e1051a39Sopenharmony_ci	&pxor		($inout2,$rndkey0);		# ^ last offset_i
2234e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2235e1051a39Sopenharmony_ci	&pxor		($inout3,$inout2);		# each offset = previous offset ^ its L_ entry
2236e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
2237e1051a39Sopenharmony_ci	&pxor		($inout4,$inout3);
2238e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$inout2);	# offsets 0,1 go to the frame: their registers receive input next
2239e1051a39Sopenharmony_ci	&pxor		($inout5,$inout4);
2240e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$inout3);
2241e1051a39Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
2242e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(16*3,$inp));
2243e1051a39Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));	# $rounds doubled as $i1; reload from 240($key)
2244e1051a39Sopenharmony_ci
2245e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
2246e1051a39Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2247e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout1);
2248e1051a39Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
2249e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout2);
2250e1051a39Sopenharmony_ci	&pxor		($inout2,$inout4);
2251e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout3);
2252e1051a39Sopenharmony_ci	&pxor		($inout3,$inout5);
2253e1051a39Sopenharmony_ci
2254e1051a39Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);	# spill checksum to the frame across the call
2255e1051a39Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));	# $out/$len doubled as $i5/$i3; reload out-inp
2256e1051a39Sopenharmony_ci	&call		("_aesni_encrypt4");
2257e1051a39Sopenharmony_ci
2258e1051a39Sopenharmony_ci	&xorps		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2259e1051a39Sopenharmony_ci	&xorps		($inout1,&QWP(16*1,"esp"));
2260e1051a39Sopenharmony_ci	&xorps		($inout2,$inout4);
2261e1051a39Sopenharmony_ci	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
2262e1051a39Sopenharmony_ci	&xorps		($inout3,$inout5);
2263e1051a39Sopenharmony_ci	&movups		(&QWP(16*1,$out,$inp),$inout1);
2264e1051a39Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2265e1051a39Sopenharmony_ci	&movups		(&QWP(16*2,$out,$inp),$inout2);
2266e1051a39Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2267e1051a39Sopenharmony_ci	&movups		(&QWP(16*3,$out,$inp),$inout3);
2268e1051a39Sopenharmony_ci
2269e1051a39Sopenharmony_ci&set_label("done");
2270e1051a39Sopenharmony_ci	&mov	($key,&DWP($esp_off,"esp"));
2271e1051a39Sopenharmony_ci	&pxor	($inout0,$inout0);		# clear register bank
2272e1051a39Sopenharmony_ci	&pxor	($inout1,$inout1);
2273e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*0,"esp"),$inout0);	# clear stack
2274e1051a39Sopenharmony_ci	&pxor	($inout2,$inout2);
2275e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*1,"esp"),$inout0);
2276e1051a39Sopenharmony_ci	&pxor	($inout3,$inout3);
2277e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*2,"esp"),$inout0);
2278e1051a39Sopenharmony_ci	&pxor	($inout4,$inout4);
2279e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*3,"esp"),$inout0);
2280e1051a39Sopenharmony_ci	&pxor	($inout5,$inout5);
2281e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*4,"esp"),$inout0);
2282e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*5,"esp"),$inout0);
2283e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*6,"esp"),$inout0);
2284e1051a39Sopenharmony_ci
2285e1051a39Sopenharmony_ci	&lea	("esp",&DWP(0,$key));
2286e1051a39Sopenharmony_ci	&mov	($rounds,&wparam(5));		# &offset_i
2287e1051a39Sopenharmony_ci	&mov	($rounds_,&wparam(7));		# &checksum
2288e1051a39Sopenharmony_ci	&movdqu	(&QWP(0,$rounds),$rndkey0);
2289e1051a39Sopenharmony_ci	&pxor	($rndkey0,$rndkey0);
2290e1051a39Sopenharmony_ci	&movdqu	(&QWP(0,$rounds_),$rndkey1);
2291e1051a39Sopenharmony_ci	&pxor	($rndkey1,$rndkey1);
2292e1051a39Sopenharmony_ci&function_end("aesni_ocb_encrypt");
2293e1051a39Sopenharmony_ci
######################################################################
# aesni_ocb_decrypt -- OCB-mode bulk decryption, whole 16-byte blocks only.
#
# Stack arguments, in wparam() order as read below:
#   0  input pointer
#   1  output pointer
#   2  block count (scaled by 16 below to obtain the end of input)
#   3  key schedule; the round count is read from offset 240
#   4  start_block_num
#   5  &offset_i  -- loaded on entry, updated value stored back on exit
#   6  L_ table   -- indexed with 16*bsf(block number)
#   7  &checksum  -- loaded on entry, updated value stored back on exit
#
# Between sections $rndkey0 carries the most recent offset_i and $rndkey1
# the running checksum.  The checksum is accumulated over the decrypted
# output blocks (after the offset has been XORed back out).
#
# NOTE(review): $out/$len/$rounds are reloaded from the stack right after
# the $i1/$i3/$i5 index computations, which suggests those index variables
# share registers with them -- confirm against the declarations earlier in
# the file.
2294e1051a39Sopenharmony_ci&function_begin("aesni_ocb_decrypt");
2295e1051a39Sopenharmony_ci	&mov	($rounds,&wparam(5));		# &offset_i
2296e1051a39Sopenharmony_ci	&mov	($rounds_,&wparam(7));		# &checksum
2297e1051a39Sopenharmony_ci
2298e1051a39Sopenharmony_ci	&mov	($inp,&wparam(0));
2299e1051a39Sopenharmony_ci	&mov	($out,&wparam(1));
2300e1051a39Sopenharmony_ci	&mov	($len,&wparam(2));
2301e1051a39Sopenharmony_ci	&mov	($key,&wparam(3));
2302e1051a39Sopenharmony_ci	&movdqu	($rndkey0,&QWP(0,$rounds));	# load offset_i
2303e1051a39Sopenharmony_ci	&mov	($block,&wparam(4));		# start_block_num
2304e1051a39Sopenharmony_ci	&movdqu	($rndkey1,&QWP(0,$rounds_));	# load checksum
2305e1051a39Sopenharmony_ci	&mov	($l_,&wparam(6));		# L_
2306e1051a39Sopenharmony_ci
	# Carve out a 16-byte aligned scratch frame.  The caller's %esp is
	# held in $rounds until it is saved at $esp_off a few lines below.
2307e1051a39Sopenharmony_ci	&mov	($rounds,"esp");
2308e1051a39Sopenharmony_ci	&sub	("esp",$esp_off+4);		# alloca
2309e1051a39Sopenharmony_ci	&and	("esp",-16);			# align stack
2310e1051a39Sopenharmony_ci
	# $out becomes (out - inp), so every store can be addressed relative
	# to the moving $inp as ($out,$inp).
2311e1051a39Sopenharmony_ci	&sub	($out,$inp);
2312e1051a39Sopenharmony_ci	&shl	($len,4);
2313e1051a39Sopenharmony_ci	&lea	($len,&DWP(-16*6,$inp,$len));	# end of input - 16*6
2314e1051a39Sopenharmony_ci	&mov	(&DWP($out_off,"esp"),$out);
2315e1051a39Sopenharmony_ci	&mov	(&DWP($end_off,"esp"),$len);
2316e1051a39Sopenharmony_ci	&mov	(&DWP($esp_off,"esp"),$rounds);
2317e1051a39Sopenharmony_ci
2318e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
2319e1051a39Sopenharmony_ci
	# If start_block_num is even, decrypt a single block up front using
	# L_[bsf($block)] and bump $block, so that $block is odd on every path
	# that reaches "odd".  The multi-block sections below rely on that:
	# they use L_[0] for every other block and bsf($block+1/3/5) for the
	# rest.
2320e1051a39Sopenharmony_ci	&test	($block,1);
2321e1051a39Sopenharmony_ci	&jnz	(&label("odd"));
2322e1051a39Sopenharmony_ci
2323e1051a39Sopenharmony_ci	&bsf		($i3,$block);
2324e1051a39Sopenharmony_ci	&add		($block,1);
2325e1051a39Sopenharmony_ci	&shl		($i3,4);
2326e1051a39Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i3));
2327e1051a39Sopenharmony_ci	&mov		($i3,$key);			# put aside key
2328e1051a39Sopenharmony_ci
2329e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2330e1051a39Sopenharmony_ci	&lea		($inp,&DWP(16,$inp));
2331e1051a39Sopenharmony_ci
2332e1051a39Sopenharmony_ci	&pxor		($inout5,$rndkey0);		# ^ last offset_i
2333e1051a39Sopenharmony_ci	&pxor		($inout0,$inout5);		# ^ offset_i
2334e1051a39Sopenharmony_ci
	# The checksum is kept in $inout4 across the single-block decryption
	# and moved back into $rndkey1 afterwards.
2335e1051a39Sopenharmony_ci	&movdqa		($inout4,$rndkey1);
2336e1051a39Sopenharmony_ci	if ($inline)
2337e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
2338e1051a39Sopenharmony_ci	else
2339e1051a39Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
2340e1051a39Sopenharmony_ci
2341e1051a39Sopenharmony_ci	&xorps		($inout0,$inout5);		# ^ offset_i
2342e1051a39Sopenharmony_ci	&movaps		($rndkey1,$inout4);		# pass the checksum
2343e1051a39Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2344e1051a39Sopenharmony_ci	&xorps		($rndkey1,$inout0);		# checksum
	# $inp was already advanced by 16, hence the -16 displacement.
2345e1051a39Sopenharmony_ci	&movups		(&QWP(-16,$out,$inp),$inout0);	# store output
2346e1051a39Sopenharmony_ci
2347e1051a39Sopenharmony_ci	&mov		($rounds,&DWP(240,$i3));
2348e1051a39Sopenharmony_ci	&mov		($key,$i3);			# restore key
2349e1051a39Sopenharmony_ci	&mov		($len,&DWP($end_off,"esp"));
2350e1051a39Sopenharmony_ci
2351e1051a39Sopenharmony_ci&set_label("odd");
	# "Twisted" round addressing: $key is advanced to key+32+16*rounds and
	# 16-16*rounds is saved at $rounds_off.  With $rounds reloaded from
	# that slot, -48($key,$rounds) resolves to key+0, -32 to key+16 and
	# -16 to key+32.  The original key pointer is kept at $key_off.
2352e1051a39Sopenharmony_ci	&shl		($rounds,4);
2353e1051a39Sopenharmony_ci	&mov		($out,16);
2354e1051a39Sopenharmony_ci	&sub		($out,$rounds);			# twisted rounds
2355e1051a39Sopenharmony_ci	&mov		(&DWP($key_off,"esp"),$key);
2356e1051a39Sopenharmony_ci	&lea		($key,&DWP(32,$key,$rounds));	# end of key schedule
2357e1051a39Sopenharmony_ci	&mov		(&DWP($rounds_off,"esp"),$out);
2358e1051a39Sopenharmony_ci
	# $len holds (end of input - 16*6): fewer than six blocks left means
	# $inp is already above it.
2359e1051a39Sopenharmony_ci	&cmp		($inp,$len);
2360e1051a39Sopenharmony_ci	&ja		(&label("short"));
2361e1051a39Sopenharmony_ci	&jmp		(&label("grandloop"));
2362e1051a39Sopenharmony_ci
	# Main loop: six blocks per iteration.
2363e1051a39Sopenharmony_ci&set_label("grandloop",32);
2364e1051a39Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
2365e1051a39Sopenharmony_ci	&lea		($i3,&DWP(3,$block));
2366e1051a39Sopenharmony_ci	&lea		($i5,&DWP(5,$block));
2367e1051a39Sopenharmony_ci	&add		($block,6);
2368e1051a39Sopenharmony_ci	&bsf		($i1,$i1);
2369e1051a39Sopenharmony_ci	&bsf		($i3,$i3);
2370e1051a39Sopenharmony_ci	&bsf		($i5,$i5);
2371e1051a39Sopenharmony_ci	&shl		($i1,4);
2372e1051a39Sopenharmony_ci	&shl		($i3,4);
2373e1051a39Sopenharmony_ci	&shl		($i5,4);
2374e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(0,$l_));
2375e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(0,$l_,$i1));
2376e1051a39Sopenharmony_ci	&mov		($rounds,&DWP($rounds_off,"esp"));
2377e1051a39Sopenharmony_ci	&movdqa		($inout2,$inout0);
2378e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_,$i3));
2379e1051a39Sopenharmony_ci	&movdqa		($inout4,$inout0);
2380e1051a39Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i5));
2381e1051a39Sopenharmony_ci
	# Chain the six offsets (each is the previous offset XOR an L_ entry)
	# and save them at 16*0..16*5(%esp); the registers are then reused
	# for the input blocks.
2382e1051a39Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ last offset_i
2383e1051a39Sopenharmony_ci	&pxor		($inout1,$inout0);
2384e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$inout0);
2385e1051a39Sopenharmony_ci	&pxor		($inout2,$inout1);
2386e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$inout1);
2387e1051a39Sopenharmony_ci	&pxor		($inout3,$inout2);
2388e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*2,"esp"),$inout2);
2389e1051a39Sopenharmony_ci	&pxor		($inout4,$inout3);
2390e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$inout3);
2391e1051a39Sopenharmony_ci	&pxor		($inout5,$inout4);
2392e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*4,"esp"),$inout4);
2393e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*5,"esp"),$inout5);
2394e1051a39Sopenharmony_ci
2395e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
2396e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2397e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
2398e1051a39Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
2399e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(16*3,$inp));
2400e1051a39Sopenharmony_ci	&movdqu		($inout4,&QWP(16*4,$inp));
2401e1051a39Sopenharmony_ci	&movdqu		($inout5,&QWP(16*5,$inp));
2402e1051a39Sopenharmony_ci	&lea		($inp,&DWP(16*6,$inp));
2403e1051a39Sopenharmony_ci
	# $rndkey1 is about to be reused as a round-key register, so the
	# checksum is parked on the stack until after the call.
2404e1051a39Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
2405e1051a39Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
2406e1051a39Sopenharmony_ci	&pxor		($inout1,$rndkey0);
2407e1051a39Sopenharmony_ci	&pxor		($inout2,$rndkey0);
2408e1051a39Sopenharmony_ci	&pxor		($inout3,$rndkey0);
2409e1051a39Sopenharmony_ci	&pxor		($inout4,$rndkey0);
2410e1051a39Sopenharmony_ci	&pxor		($inout5,$rndkey0);
2411e1051a39Sopenharmony_ci
2412e1051a39Sopenharmony_ci	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
2413e1051a39Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2414e1051a39Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
2415e1051a39Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
2416e1051a39Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
2417e1051a39Sopenharmony_ci	&pxor		($inout4,&QWP(16*4,"esp"));
2418e1051a39Sopenharmony_ci	&pxor		($inout5,&QWP(16*5,"esp"));
2419e1051a39Sopenharmony_ci
	# The first aesdec round is issued here; _aesni_decrypt6_enter is
	# presumably the mid-body entry of the 6-way routine that finishes
	# the remaining rounds -- confirm against its definition.
2420e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
2421e1051a39Sopenharmony_ci	&aesdec		($inout0,$rndkey1);
2422e1051a39Sopenharmony_ci	&aesdec		($inout1,$rndkey1);
2423e1051a39Sopenharmony_ci	&aesdec		($inout2,$rndkey1);
2424e1051a39Sopenharmony_ci	&aesdec		($inout3,$rndkey1);
2425e1051a39Sopenharmony_ci	&aesdec		($inout4,$rndkey1);
2426e1051a39Sopenharmony_ci	&aesdec		($inout5,$rndkey1);
2427e1051a39Sopenharmony_ci
	# Reload the store base and the loop bound used further down.
2428e1051a39Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
2429e1051a39Sopenharmony_ci	&mov		($len,&DWP($end_off,"esp"));
2430e1051a39Sopenharmony_ci	&call		("_aesni_decrypt6_enter");
2431e1051a39Sopenharmony_ci
2432e1051a39Sopenharmony_ci	&movdqa		($rndkey0,&QWP(16*5,"esp"));	# pass last offset_i
2433e1051a39Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2434e1051a39Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));
2435e1051a39Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
2436e1051a39Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
2437e1051a39Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
2438e1051a39Sopenharmony_ci	&pxor		($inout4,&QWP(16*4,"esp"));
2439e1051a39Sopenharmony_ci	&pxor		($inout5,$rndkey0);
2440e1051a39Sopenharmony_ci
	# Fold the six output blocks into the checksum while storing them.
	# $inp was already advanced by 16*6, hence the negative displacements.
2441e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
2442e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16*6,$out,$inp),$inout0);# store output
2443e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout1);
2444e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16*5,$out,$inp),$inout1);
2445e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout2);
2446e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16*4,$out,$inp),$inout2);
2447e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout3);
2448e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16*3,$out,$inp),$inout3);
2449e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout4);
2450e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16*2,$out,$inp),$inout4);
2451e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout5);
2452e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16*1,$out,$inp),$inout5);
2453e1051a39Sopenharmony_ci	&cmp		($inp,$len);			# done yet?
2454e1051a39Sopenharmony_ci	&jbe		(&label("grandloop"));
2455e1051a39Sopenharmony_ci
	# Tail dispatch.  $len is turned back into the number of bytes left
	# (end of input - $inp).  Zero goes to "done", one to four blocks go to
	# the matching label, and anything else falls through to the
	# five-block path below.
2456e1051a39Sopenharmony_ci&set_label("short");
2457e1051a39Sopenharmony_ci	&add		($len,16*6);
2458e1051a39Sopenharmony_ci	&sub		($len,$inp);
2459e1051a39Sopenharmony_ci	&jz		(&label("done"));
2460e1051a39Sopenharmony_ci
2461e1051a39Sopenharmony_ci	&cmp		($len,16*2);
2462e1051a39Sopenharmony_ci	&jb		(&label("one"));
2463e1051a39Sopenharmony_ci	&je		(&label("two"));
2464e1051a39Sopenharmony_ci
2465e1051a39Sopenharmony_ci	&cmp		($len,16*4);
2466e1051a39Sopenharmony_ci	&jb		(&label("three"));
2467e1051a39Sopenharmony_ci	&je		(&label("four"));
2468e1051a39Sopenharmony_ci
	# Five blocks: same scheme as the main loop, with the sixth lane of
	# the 6-way routine fed a zeroed dummy whose result is never stored.
2469e1051a39Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
2470e1051a39Sopenharmony_ci	&lea		($i3,&DWP(3,$block));
2471e1051a39Sopenharmony_ci	&bsf		($i1,$i1);
2472e1051a39Sopenharmony_ci	&bsf		($i3,$i3);
2473e1051a39Sopenharmony_ci	&shl		($i1,4);
2474e1051a39Sopenharmony_ci	&shl		($i3,4);
2475e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(0,$l_));
2476e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(0,$l_,$i1));
2477e1051a39Sopenharmony_ci	&mov		($rounds,&DWP($rounds_off,"esp"));
2478e1051a39Sopenharmony_ci	&movdqa		($inout2,$inout0);
2479e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_,$i3));
2480e1051a39Sopenharmony_ci	&movdqa		($inout4,$inout0);
2481e1051a39Sopenharmony_ci
2482e1051a39Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ last offset_i
2483e1051a39Sopenharmony_ci	&pxor		($inout1,$inout0);
2484e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$inout0);
2485e1051a39Sopenharmony_ci	&pxor		($inout2,$inout1);
2486e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$inout1);
2487e1051a39Sopenharmony_ci	&pxor		($inout3,$inout2);
2488e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*2,"esp"),$inout2);
2489e1051a39Sopenharmony_ci	&pxor		($inout4,$inout3);
2490e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$inout3);
	# NOTE(review): the result of this XOR appears unused -- $inout5 is
	# cleared a few lines below before it is consumed.
2491e1051a39Sopenharmony_ci	&pxor		($inout5,$inout4);
2492e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*4,"esp"),$inout4);
2493e1051a39Sopenharmony_ci
2494e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
2495e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2496e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
2497e1051a39Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
2498e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(16*3,$inp));
2499e1051a39Sopenharmony_ci	&movdqu		($inout4,&QWP(16*4,$inp));
2500e1051a39Sopenharmony_ci	&pxor		($inout5,$inout5);
2501e1051a39Sopenharmony_ci
2502e1051a39Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
2503e1051a39Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
2504e1051a39Sopenharmony_ci	&pxor		($inout1,$rndkey0);
2505e1051a39Sopenharmony_ci	&pxor		($inout2,$rndkey0);
2506e1051a39Sopenharmony_ci	&pxor		($inout3,$rndkey0);
2507e1051a39Sopenharmony_ci	&pxor		($inout4,$rndkey0);
2508e1051a39Sopenharmony_ci
2509e1051a39Sopenharmony_ci	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
2510e1051a39Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2511e1051a39Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
2512e1051a39Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
2513e1051a39Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
2514e1051a39Sopenharmony_ci	&pxor		($inout4,&QWP(16*4,"esp"));
2515e1051a39Sopenharmony_ci
2516e1051a39Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
2517e1051a39Sopenharmony_ci	&aesdec		($inout0,$rndkey1);
2518e1051a39Sopenharmony_ci	&aesdec		($inout1,$rndkey1);
2519e1051a39Sopenharmony_ci	&aesdec		($inout2,$rndkey1);
2520e1051a39Sopenharmony_ci	&aesdec		($inout3,$rndkey1);
2521e1051a39Sopenharmony_ci	&aesdec		($inout4,$rndkey1);
2522e1051a39Sopenharmony_ci	&aesdec		($inout5,$rndkey1);
2523e1051a39Sopenharmony_ci
2524e1051a39Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
2525e1051a39Sopenharmony_ci	&call		("_aesni_decrypt6_enter");
2526e1051a39Sopenharmony_ci
2527e1051a39Sopenharmony_ci	&movdqa		($rndkey0,&QWP(16*4,"esp"));	# pass last offset_i
2528e1051a39Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2529e1051a39Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));
2530e1051a39Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
2531e1051a39Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
2532e1051a39Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
2533e1051a39Sopenharmony_ci	&pxor		($inout4,$rndkey0);
2534e1051a39Sopenharmony_ci
	# $inp is not advanced in the tail paths, so the stores use positive
	# displacements from ($out,$inp).
2535e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
2536e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*0,$out,$inp),$inout0);	# store output
2537e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout1);
2538e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*1,$out,$inp),$inout1);
2539e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout2);
2540e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*2,$out,$inp),$inout2);
2541e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout3);
2542e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*3,$out,$inp),$inout3);
2543e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout4);
2544e1051a39Sopenharmony_ci	&movdqu		(&QWP(16*4,$out,$inp),$inout4);
2545e1051a39Sopenharmony_ci
2546e1051a39Sopenharmony_ci	&jmp		(&label("done"));
2547e1051a39Sopenharmony_ci
	# One block left: offset is L_[0] ^ last offset_i.  The untwisted key
	# pointer and the plain round count are restored for the 1-block
	# routine; the checksum rides in $inout4 across it.
2548e1051a39Sopenharmony_ci&set_label("one",16);
2549e1051a39Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_));
2550e1051a39Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
2551e1051a39Sopenharmony_ci
2552e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2553e1051a39Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));
2554e1051a39Sopenharmony_ci
2555e1051a39Sopenharmony_ci	&pxor		($inout5,$rndkey0);		# ^ last offset_i
2556e1051a39Sopenharmony_ci	&pxor		($inout0,$inout5);		# ^ offset_i
2557e1051a39Sopenharmony_ci
2558e1051a39Sopenharmony_ci	&movdqa		($inout4,$rndkey1);
2559e1051a39Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
2560e1051a39Sopenharmony_ci	if ($inline)
2561e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
2562e1051a39Sopenharmony_ci	else
2563e1051a39Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
2564e1051a39Sopenharmony_ci
2565e1051a39Sopenharmony_ci	&xorps		($inout0,$inout5);		# ^ offset_i
2566e1051a39Sopenharmony_ci	&movaps		($rndkey1,$inout4);		# pass the checksum
2567e1051a39Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2568e1051a39Sopenharmony_ci	&xorps		($rndkey1,$inout0);		# checksum
2569e1051a39Sopenharmony_ci	&movups		(&QWP(0,$out,$inp),$inout0);
2570e1051a39Sopenharmony_ci
2571e1051a39Sopenharmony_ci	&jmp		(&label("done"));
2572e1051a39Sopenharmony_ci
	# Two blocks left: offsets live in $inout4/$inout5 and the checksum in
	# $inout3 across the call to the 2-block routine.
2573e1051a39Sopenharmony_ci&set_label("two",16);
2574e1051a39Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
2575e1051a39Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
2576e1051a39Sopenharmony_ci	&bsf		($i1,$i1);
2577e1051a39Sopenharmony_ci	&shl		($i1,4);
2578e1051a39Sopenharmony_ci	&movdqu		($inout4,&QWP(0,$l_));
2579e1051a39Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i1));
2580e1051a39Sopenharmony_ci
2581e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2582e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
2583e1051a39Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));
2584e1051a39Sopenharmony_ci
2585e1051a39Sopenharmony_ci	&movdqa		($inout3,$rndkey1);
2586e1051a39Sopenharmony_ci	&pxor		($inout4,$rndkey0);		# ^ last offset_i
2587e1051a39Sopenharmony_ci	&pxor		($inout5,$inout4);
2588e1051a39Sopenharmony_ci
2589e1051a39Sopenharmony_ci	&pxor		($inout0,$inout4);		# ^ offset_i
2590e1051a39Sopenharmony_ci	&pxor		($inout1,$inout5);
2591e1051a39Sopenharmony_ci
2592e1051a39Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
2593e1051a39Sopenharmony_ci	&call		("_aesni_decrypt2");
2594e1051a39Sopenharmony_ci
2595e1051a39Sopenharmony_ci	&xorps		($inout0,$inout4);		# ^ offset_i
2596e1051a39Sopenharmony_ci	&xorps		($inout1,$inout5);
2597e1051a39Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2598e1051a39Sopenharmony_ci	&xorps		($inout3,$inout0);		# checksum
2599e1051a39Sopenharmony_ci	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
2600e1051a39Sopenharmony_ci	&xorps		($inout3,$inout1);
2601e1051a39Sopenharmony_ci	&movups		(&QWP(16*1,$out,$inp),$inout1);
2602e1051a39Sopenharmony_ci	&movaps		($rndkey1,$inout3);		# pass the checksum
2603e1051a39Sopenharmony_ci
2604e1051a39Sopenharmony_ci	&jmp		(&label("done"));
2605e1051a39Sopenharmony_ci
	# Three blocks left: offsets live in $inout3..$inout5; the checksum is
	# parked in the $checksum stack slot across the call.
2606e1051a39Sopenharmony_ci&set_label("three",16);
2607e1051a39Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
2608e1051a39Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
2609e1051a39Sopenharmony_ci	&bsf		($i1,$i1);
2610e1051a39Sopenharmony_ci	&shl		($i1,4);
2611e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_));
2612e1051a39Sopenharmony_ci	&movdqu		($inout4,&QWP(0,$l_,$i1));
2613e1051a39Sopenharmony_ci	&movdqa		($inout5,$inout3);
2614e1051a39Sopenharmony_ci
2615e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2616e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
2617e1051a39Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
2618e1051a39Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));
2619e1051a39Sopenharmony_ci
2620e1051a39Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
2621e1051a39Sopenharmony_ci	&pxor		($inout3,$rndkey0);		# ^ last offset_i
2622e1051a39Sopenharmony_ci	&pxor		($inout4,$inout3);
2623e1051a39Sopenharmony_ci	&pxor		($inout5,$inout4);
2624e1051a39Sopenharmony_ci
2625e1051a39Sopenharmony_ci	&pxor		($inout0,$inout3);		# ^ offset_i
2626e1051a39Sopenharmony_ci	&pxor		($inout1,$inout4);
2627e1051a39Sopenharmony_ci	&pxor		($inout2,$inout5);
2628e1051a39Sopenharmony_ci
2629e1051a39Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
2630e1051a39Sopenharmony_ci	&call		("_aesni_decrypt3");
2631e1051a39Sopenharmony_ci
2632e1051a39Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2633e1051a39Sopenharmony_ci	&xorps		($inout0,$inout3);		# ^ offset_i
2634e1051a39Sopenharmony_ci	&xorps		($inout1,$inout4);
2635e1051a39Sopenharmony_ci	&xorps		($inout2,$inout5);
2636e1051a39Sopenharmony_ci	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
2637e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
2638e1051a39Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2639e1051a39Sopenharmony_ci	&movups		(&QWP(16*1,$out,$inp),$inout1);
2640e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout1);
2641e1051a39Sopenharmony_ci	&movups		(&QWP(16*2,$out,$inp),$inout2);
2642e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout2);
2643e1051a39Sopenharmony_ci
2644e1051a39Sopenharmony_ci	&jmp		(&label("done"));
2645e1051a39Sopenharmony_ci
	# Four blocks left: the first two offsets are parked at 16*0/16*1(%esp)
	# because $inout2/$inout3 are reused for input blocks 2 and 3; the last
	# two stay in $inout4/$inout5.  Falls through into "done".
2646e1051a39Sopenharmony_ci&set_label("four",16);
2647e1051a39Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
2648e1051a39Sopenharmony_ci	&lea		($i3,&DWP(3,$block));
2649e1051a39Sopenharmony_ci	&bsf		($i1,$i1);
2650e1051a39Sopenharmony_ci	&bsf		($i3,$i3);
2651e1051a39Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
2652e1051a39Sopenharmony_ci	&shl		($i1,4);
2653e1051a39Sopenharmony_ci	&shl		($i3,4);
2654e1051a39Sopenharmony_ci	&movdqu		($inout2,&QWP(0,$l_));
2655e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_,$i1));
2656e1051a39Sopenharmony_ci	&movdqa		($inout4,$inout2);
2657e1051a39Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i3));
2658e1051a39Sopenharmony_ci
2659e1051a39Sopenharmony_ci	&pxor		($inout2,$rndkey0);		# ^ last offset_i
2660e1051a39Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2661e1051a39Sopenharmony_ci	&pxor		($inout3,$inout2);
2662e1051a39Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
2663e1051a39Sopenharmony_ci	&pxor		($inout4,$inout3);
2664e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$inout2);
2665e1051a39Sopenharmony_ci	&pxor		($inout5,$inout4);
2666e1051a39Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$inout3);
2667e1051a39Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
2668e1051a39Sopenharmony_ci	&movdqu		($inout3,&QWP(16*3,$inp));
2669e1051a39Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));
2670e1051a39Sopenharmony_ci
2671e1051a39Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
2672e1051a39Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2673e1051a39Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
2674e1051a39Sopenharmony_ci	&pxor		($inout2,$inout4);
2675e1051a39Sopenharmony_ci	&pxor		($inout3,$inout5);
2676e1051a39Sopenharmony_ci
2677e1051a39Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
2678e1051a39Sopenharmony_ci	&call		("_aesni_decrypt4");
2679e1051a39Sopenharmony_ci
2680e1051a39Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2681e1051a39Sopenharmony_ci	&xorps		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2682e1051a39Sopenharmony_ci	&xorps		($inout1,&QWP(16*1,"esp"));
2683e1051a39Sopenharmony_ci	&xorps		($inout2,$inout4);
2684e1051a39Sopenharmony_ci	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
2685e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
2686e1051a39Sopenharmony_ci	&xorps		($inout3,$inout5);
2687e1051a39Sopenharmony_ci	&movups		(&QWP(16*1,$out,$inp),$inout1);
2688e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout1);
2689e1051a39Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2690e1051a39Sopenharmony_ci	&movups		(&QWP(16*2,$out,$inp),$inout2);
2691e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout2);
2692e1051a39Sopenharmony_ci	&movups		(&QWP(16*3,$out,$inp),$inout3);
2693e1051a39Sopenharmony_ci	&pxor		($rndkey1,$inout3);
2694e1051a39Sopenharmony_ci
	# Epilogue: fetch the caller's %esp saved at $esp_off, zero the six
	# data registers and the seven 16-byte scratch slots at 16*0..16*6(%esp),
	# drop the frame, then write the final offset_i and checksum back
	# through the pointers passed as arguments 5 and 7 and clear the two
	# registers that carried them.
2695e1051a39Sopenharmony_ci&set_label("done");
2696e1051a39Sopenharmony_ci	&mov	($key,&DWP($esp_off,"esp"));
2697e1051a39Sopenharmony_ci	&pxor	($inout0,$inout0);		# clear register bank
2698e1051a39Sopenharmony_ci	&pxor	($inout1,$inout1);
2699e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*0,"esp"),$inout0);	# clear stack
2700e1051a39Sopenharmony_ci	&pxor	($inout2,$inout2);
2701e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*1,"esp"),$inout0);
2702e1051a39Sopenharmony_ci	&pxor	($inout3,$inout3);
2703e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*2,"esp"),$inout0);
2704e1051a39Sopenharmony_ci	&pxor	($inout4,$inout4);
2705e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*3,"esp"),$inout0);
2706e1051a39Sopenharmony_ci	&pxor	($inout5,$inout5);
2707e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*4,"esp"),$inout0);
2708e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*5,"esp"),$inout0);
2709e1051a39Sopenharmony_ci	&movdqa	(&QWP(16*6,"esp"),$inout0);
2710e1051a39Sopenharmony_ci
2711e1051a39Sopenharmony_ci	&lea	("esp",&DWP(0,$key));
2712e1051a39Sopenharmony_ci	&mov	($rounds,&wparam(5));		# &offset_i
2713e1051a39Sopenharmony_ci	&mov	($rounds_,&wparam(7));		# &checksum
2714e1051a39Sopenharmony_ci	&movdqu	(&QWP(0,$rounds),$rndkey0);
2715e1051a39Sopenharmony_ci	&pxor	($rndkey0,$rndkey0);
2716e1051a39Sopenharmony_ci	&movdqu	(&QWP(0,$rounds_),$rndkey1);
2717e1051a39Sopenharmony_ci	&pxor	($rndkey1,$rndkey1);
2718e1051a39Sopenharmony_ci&function_end("aesni_ocb_decrypt");
2719e1051a39Sopenharmony_ci}
2720e1051a39Sopenharmony_ci}
2721e1051a39Sopenharmony_ci
2722e1051a39Sopenharmony_ci######################################################################
2723e1051a39Sopenharmony_ci# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2724e1051a39Sopenharmony_ci#                           size_t length, const AES_KEY *key,
2725e1051a39Sopenharmony_ci#                           unsigned char *ivp,const int enc);
2726e1051a39Sopenharmony_ci&function_begin("${PREFIX}_cbc_encrypt");
2727e1051a39Sopenharmony_ci	&mov	($inp,&wparam(0));
2728e1051a39Sopenharmony_ci	&mov	($rounds_,"esp");
2729e1051a39Sopenharmony_ci	&mov	($out,&wparam(1));
2730e1051a39Sopenharmony_ci	&sub	($rounds_,24);
2731e1051a39Sopenharmony_ci	&mov	($len,&wparam(2));
2732e1051a39Sopenharmony_ci	&and	($rounds_,-16);
2733e1051a39Sopenharmony_ci	&mov	($key,&wparam(3));
2734e1051a39Sopenharmony_ci	&mov	($key_,&wparam(4));
2735e1051a39Sopenharmony_ci	&test	($len,$len);
2736e1051a39Sopenharmony_ci	&jz	(&label("cbc_abort"));
2737e1051a39Sopenharmony_ci
2738e1051a39Sopenharmony_ci	&cmp	(&wparam(5),0);
2739e1051a39Sopenharmony_ci	&xchg	($rounds_,"esp");		# alloca
2740e1051a39Sopenharmony_ci	&movups	($ivec,&QWP(0,$key_));		# load IV
2741e1051a39Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
2742e1051a39Sopenharmony_ci	&mov	($key_,$key);			# backup $key
2743e1051a39Sopenharmony_ci	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
2744e1051a39Sopenharmony_ci	&mov	($rounds_,$rounds);		# backup $rounds
2745e1051a39Sopenharmony_ci	&je	(&label("cbc_decrypt"));
2746e1051a39Sopenharmony_ci
2747e1051a39Sopenharmony_ci	&movaps	($inout0,$ivec);
2748e1051a39Sopenharmony_ci	&cmp	($len,16);
2749e1051a39Sopenharmony_ci	&jb	(&label("cbc_enc_tail"));
2750e1051a39Sopenharmony_ci	&sub	($len,16);
2751e1051a39Sopenharmony_ci	&jmp	(&label("cbc_enc_loop"));
2752e1051a39Sopenharmony_ci
2753e1051a39Sopenharmony_ci&set_label("cbc_enc_loop",16);
2754e1051a39Sopenharmony_ci	&movups	($ivec,&QWP(0,$inp));		# input actually
2755e1051a39Sopenharmony_ci	&lea	($inp,&DWP(16,$inp));
2756e1051a39Sopenharmony_ci	if ($inline)
2757e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
2758e1051a39Sopenharmony_ci	else
2759e1051a39Sopenharmony_ci	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
2760e1051a39Sopenharmony_ci	&mov	($rounds,$rounds_);	# restore $rounds
2761e1051a39Sopenharmony_ci	&mov	($key,$key_);		# restore $key
2762e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);	# store output
2763e1051a39Sopenharmony_ci	&lea	($out,&DWP(16,$out));
2764e1051a39Sopenharmony_ci	&sub	($len,16);
2765e1051a39Sopenharmony_ci	&jnc	(&label("cbc_enc_loop"));
2766e1051a39Sopenharmony_ci	&add	($len,16);
2767e1051a39Sopenharmony_ci	&jnz	(&label("cbc_enc_tail"));
2768e1051a39Sopenharmony_ci	&movaps	($ivec,$inout0);
2769e1051a39Sopenharmony_ci	&pxor	($inout0,$inout0);
2770e1051a39Sopenharmony_ci	&jmp	(&label("cbc_ret"));
2771e1051a39Sopenharmony_ci
2772e1051a39Sopenharmony_ci&set_label("cbc_enc_tail");
2773e1051a39Sopenharmony_ci	&mov	("ecx",$len);		# zaps $rounds
2774e1051a39Sopenharmony_ci	&data_word(0xA4F3F689);		# rep movsb
2775e1051a39Sopenharmony_ci	&mov	("ecx",16);		# zero tail
2776e1051a39Sopenharmony_ci	&sub	("ecx",$len);
2777e1051a39Sopenharmony_ci	&xor	("eax","eax");		# zaps $len
2778e1051a39Sopenharmony_ci	&data_word(0xAAF3F689);		# rep stosb
2779e1051a39Sopenharmony_ci	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
2780e1051a39Sopenharmony_ci	&mov	($rounds,$rounds_);	# restore $rounds
2781e1051a39Sopenharmony_ci	&mov	($inp,$out);		# $inp and $out are the same
2782e1051a39Sopenharmony_ci	&mov	($key,$key_);		# restore $key
2783e1051a39Sopenharmony_ci	&jmp	(&label("cbc_enc_loop"));
2784e1051a39Sopenharmony_ci######################################################################
2785e1051a39Sopenharmony_ci&set_label("cbc_decrypt",16);
2786e1051a39Sopenharmony_ci	&cmp	($len,0x50);
2787e1051a39Sopenharmony_ci	&jbe	(&label("cbc_dec_tail"));
2788e1051a39Sopenharmony_ci	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
2789e1051a39Sopenharmony_ci	&sub	($len,0x50);
2790e1051a39Sopenharmony_ci	&jmp	(&label("cbc_dec_loop6_enter"));
2791e1051a39Sopenharmony_ci
2792e1051a39Sopenharmony_ci&set_label("cbc_dec_loop6",16);
2793e1051a39Sopenharmony_ci	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
2794e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout5);
2795e1051a39Sopenharmony_ci	&lea	($out,&DWP(0x10,$out));
2796e1051a39Sopenharmony_ci&set_label("cbc_dec_loop6_enter");
2797e1051a39Sopenharmony_ci	&movdqu	($inout0,&QWP(0,$inp));
2798e1051a39Sopenharmony_ci	&movdqu	($inout1,&QWP(0x10,$inp));
2799e1051a39Sopenharmony_ci	&movdqu	($inout2,&QWP(0x20,$inp));
2800e1051a39Sopenharmony_ci	&movdqu	($inout3,&QWP(0x30,$inp));
2801e1051a39Sopenharmony_ci	&movdqu	($inout4,&QWP(0x40,$inp));
2802e1051a39Sopenharmony_ci	&movdqu	($inout5,&QWP(0x50,$inp));
2803e1051a39Sopenharmony_ci
2804e1051a39Sopenharmony_ci	&call	("_aesni_decrypt6");
2805e1051a39Sopenharmony_ci
2806e1051a39Sopenharmony_ci	&movups	($rndkey1,&QWP(0,$inp));
2807e1051a39Sopenharmony_ci	&movups	($rndkey0,&QWP(0x10,$inp));
2808e1051a39Sopenharmony_ci	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
2809e1051a39Sopenharmony_ci	&xorps	($inout1,$rndkey1);
2810e1051a39Sopenharmony_ci	&movups	($rndkey1,&QWP(0x20,$inp));
2811e1051a39Sopenharmony_ci	&xorps	($inout2,$rndkey0);
2812e1051a39Sopenharmony_ci	&movups	($rndkey0,&QWP(0x30,$inp));
2813e1051a39Sopenharmony_ci	&xorps	($inout3,$rndkey1);
2814e1051a39Sopenharmony_ci	&movups	($rndkey1,&QWP(0x40,$inp));
2815e1051a39Sopenharmony_ci	&xorps	($inout4,$rndkey0);
2816e1051a39Sopenharmony_ci	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
2817e1051a39Sopenharmony_ci	&xorps	($inout5,$rndkey1);
2818e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
2819e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
2820e1051a39Sopenharmony_ci	&lea	($inp,&DWP(0x60,$inp));
2821e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
2822e1051a39Sopenharmony_ci	&mov	($rounds,$rounds_);		# restore $rounds
2823e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
2824e1051a39Sopenharmony_ci	&mov	($key,$key_);			# restore $key
2825e1051a39Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
2826e1051a39Sopenharmony_ci	&lea	($out,&DWP(0x50,$out));
2827e1051a39Sopenharmony_ci	&sub	($len,0x60);
2828e1051a39Sopenharmony_ci	&ja	(&label("cbc_dec_loop6"));
2829e1051a39Sopenharmony_ci
2830e1051a39Sopenharmony_ci	&movaps	($inout0,$inout5);
2831e1051a39Sopenharmony_ci	&movaps	($ivec,$rndkey0);
2832e1051a39Sopenharmony_ci	&add	($len,0x50);
2833e1051a39Sopenharmony_ci	&jle	(&label("cbc_dec_clear_tail_collected"));
2834e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
2835e1051a39Sopenharmony_ci	&lea	($out,&DWP(0x10,$out));
2836e1051a39Sopenharmony_ci&set_label("cbc_dec_tail");
2837e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));
2838e1051a39Sopenharmony_ci	&movaps	($in0,$inout0);
2839e1051a39Sopenharmony_ci	&cmp	($len,0x10);
2840e1051a39Sopenharmony_ci	&jbe	(&label("cbc_dec_one"));
2841e1051a39Sopenharmony_ci
2842e1051a39Sopenharmony_ci	&movups	($inout1,&QWP(0x10,$inp));
2843e1051a39Sopenharmony_ci	&movaps	($in1,$inout1);
2844e1051a39Sopenharmony_ci	&cmp	($len,0x20);
2845e1051a39Sopenharmony_ci	&jbe	(&label("cbc_dec_two"));
2846e1051a39Sopenharmony_ci
2847e1051a39Sopenharmony_ci	&movups	($inout2,&QWP(0x20,$inp));
2848e1051a39Sopenharmony_ci	&cmp	($len,0x30);
2849e1051a39Sopenharmony_ci	&jbe	(&label("cbc_dec_three"));
2850e1051a39Sopenharmony_ci
2851e1051a39Sopenharmony_ci	&movups	($inout3,&QWP(0x30,$inp));
2852e1051a39Sopenharmony_ci	&cmp	($len,0x40);
2853e1051a39Sopenharmony_ci	&jbe	(&label("cbc_dec_four"));
2854e1051a39Sopenharmony_ci
2855e1051a39Sopenharmony_ci	&movups	($inout4,&QWP(0x40,$inp));
2856e1051a39Sopenharmony_ci	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
2857e1051a39Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));
2858e1051a39Sopenharmony_ci	&xorps	($inout5,$inout5);
2859e1051a39Sopenharmony_ci	&call	("_aesni_decrypt6");
2860e1051a39Sopenharmony_ci	&movups	($rndkey1,&QWP(0,$inp));
2861e1051a39Sopenharmony_ci	&movups	($rndkey0,&QWP(0x10,$inp));
2862e1051a39Sopenharmony_ci	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
2863e1051a39Sopenharmony_ci	&xorps	($inout1,$rndkey1);
2864e1051a39Sopenharmony_ci	&movups	($rndkey1,&QWP(0x20,$inp));
2865e1051a39Sopenharmony_ci	&xorps	($inout2,$rndkey0);
2866e1051a39Sopenharmony_ci	&movups	($rndkey0,&QWP(0x30,$inp));
2867e1051a39Sopenharmony_ci	&xorps	($inout3,$rndkey1);
2868e1051a39Sopenharmony_ci	&movups	($ivec,&QWP(0x40,$inp));	# IV
2869e1051a39Sopenharmony_ci	&xorps	($inout4,$rndkey0);
2870e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
2871e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
2872e1051a39Sopenharmony_ci	&pxor	($inout1,$inout1);
2873e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
2874e1051a39Sopenharmony_ci	&pxor	($inout2,$inout2);
2875e1051a39Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
2876e1051a39Sopenharmony_ci	&pxor	($inout3,$inout3);
2877e1051a39Sopenharmony_ci	&lea	($out,&DWP(0x40,$out));
2878e1051a39Sopenharmony_ci	&movaps	($inout0,$inout4);
2879e1051a39Sopenharmony_ci	&pxor	($inout4,$inout4);
2880e1051a39Sopenharmony_ci	&sub	($len,0x50);
2881e1051a39Sopenharmony_ci	&jmp	(&label("cbc_dec_tail_collected"));
2882e1051a39Sopenharmony_ci
2883e1051a39Sopenharmony_ci&set_label("cbc_dec_one",16);
2884e1051a39Sopenharmony_ci	if ($inline)
2885e1051a39Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
2886e1051a39Sopenharmony_ci	else
2887e1051a39Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
2888e1051a39Sopenharmony_ci	&xorps	($inout0,$ivec);
2889e1051a39Sopenharmony_ci	&movaps	($ivec,$in0);
2890e1051a39Sopenharmony_ci	&sub	($len,0x10);
2891e1051a39Sopenharmony_ci	&jmp	(&label("cbc_dec_tail_collected"));
2892e1051a39Sopenharmony_ci
2893e1051a39Sopenharmony_ci&set_label("cbc_dec_two",16);
2894e1051a39Sopenharmony_ci	&call	("_aesni_decrypt2");
2895e1051a39Sopenharmony_ci	&xorps	($inout0,$ivec);
2896e1051a39Sopenharmony_ci	&xorps	($inout1,$in0);
2897e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
2898e1051a39Sopenharmony_ci	&movaps	($inout0,$inout1);
2899e1051a39Sopenharmony_ci	&pxor	($inout1,$inout1);
2900e1051a39Sopenharmony_ci	&lea	($out,&DWP(0x10,$out));
2901e1051a39Sopenharmony_ci	&movaps	($ivec,$in1);
2902e1051a39Sopenharmony_ci	&sub	($len,0x20);
2903e1051a39Sopenharmony_ci	&jmp	(&label("cbc_dec_tail_collected"));
2904e1051a39Sopenharmony_ci
2905e1051a39Sopenharmony_ci&set_label("cbc_dec_three",16);
2906e1051a39Sopenharmony_ci	&call	("_aesni_decrypt3");
2907e1051a39Sopenharmony_ci	&xorps	($inout0,$ivec);
2908e1051a39Sopenharmony_ci	&xorps	($inout1,$in0);
2909e1051a39Sopenharmony_ci	&xorps	($inout2,$in1);
2910e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
2911e1051a39Sopenharmony_ci	&movaps	($inout0,$inout2);
2912e1051a39Sopenharmony_ci	&pxor	($inout2,$inout2);
2913e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
2914e1051a39Sopenharmony_ci	&pxor	($inout1,$inout1);
2915e1051a39Sopenharmony_ci	&lea	($out,&DWP(0x20,$out));
2916e1051a39Sopenharmony_ci	&movups	($ivec,&QWP(0x20,$inp));
2917e1051a39Sopenharmony_ci	&sub	($len,0x30);
2918e1051a39Sopenharmony_ci	&jmp	(&label("cbc_dec_tail_collected"));
2919e1051a39Sopenharmony_ci
2920e1051a39Sopenharmony_ci&set_label("cbc_dec_four",16);
2921e1051a39Sopenharmony_ci	&call	("_aesni_decrypt4");
2922e1051a39Sopenharmony_ci	&movups	($rndkey1,&QWP(0x10,$inp));
2923e1051a39Sopenharmony_ci	&movups	($rndkey0,&QWP(0x20,$inp));
2924e1051a39Sopenharmony_ci	&xorps	($inout0,$ivec);
2925e1051a39Sopenharmony_ci	&movups	($ivec,&QWP(0x30,$inp));
2926e1051a39Sopenharmony_ci	&xorps	($inout1,$in0);
2927e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
2928e1051a39Sopenharmony_ci	&xorps	($inout2,$rndkey1);
2929e1051a39Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
2930e1051a39Sopenharmony_ci	&pxor	($inout1,$inout1);
2931e1051a39Sopenharmony_ci	&xorps	($inout3,$rndkey0);
2932e1051a39Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
2933e1051a39Sopenharmony_ci	&pxor	($inout2,$inout2);
2934e1051a39Sopenharmony_ci	&lea	($out,&DWP(0x30,$out));
2935e1051a39Sopenharmony_ci	&movaps	($inout0,$inout3);
2936e1051a39Sopenharmony_ci	&pxor	($inout3,$inout3);
2937e1051a39Sopenharmony_ci	&sub	($len,0x40);
2938e1051a39Sopenharmony_ci	&jmp	(&label("cbc_dec_tail_collected"));
2939e1051a39Sopenharmony_ci
2940e1051a39Sopenharmony_ci&set_label("cbc_dec_clear_tail_collected",16);
2941e1051a39Sopenharmony_ci	&pxor	($inout1,$inout1);
2942e1051a39Sopenharmony_ci	&pxor	($inout2,$inout2);
2943e1051a39Sopenharmony_ci	&pxor	($inout3,$inout3);
2944e1051a39Sopenharmony_ci	&pxor	($inout4,$inout4);
2945e1051a39Sopenharmony_ci&set_label("cbc_dec_tail_collected");
2946e1051a39Sopenharmony_ci	&and	($len,15);
2947e1051a39Sopenharmony_ci	&jnz	(&label("cbc_dec_tail_partial"));
2948e1051a39Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
2949e1051a39Sopenharmony_ci	&pxor	($rndkey0,$rndkey0);
2950e1051a39Sopenharmony_ci	&jmp	(&label("cbc_ret"));
2951e1051a39Sopenharmony_ci
2952e1051a39Sopenharmony_ci&set_label("cbc_dec_tail_partial",16);
2953e1051a39Sopenharmony_ci	&movaps	(&QWP(0,"esp"),$inout0);
2954e1051a39Sopenharmony_ci	&pxor	($rndkey0,$rndkey0);
2955e1051a39Sopenharmony_ci	&mov	("ecx",16);
2956e1051a39Sopenharmony_ci	&mov	($inp,"esp");
2957e1051a39Sopenharmony_ci	&sub	("ecx",$len);
2958e1051a39Sopenharmony_ci	&data_word(0xA4F3F689);		# rep movsb
2959e1051a39Sopenharmony_ci	&movdqa	(&QWP(0,"esp"),$inout0);
2960e1051a39Sopenharmony_ci
2961e1051a39Sopenharmony_ci&set_label("cbc_ret");
2962e1051a39Sopenharmony_ci	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
2963e1051a39Sopenharmony_ci	&mov	($key_,&wparam(4));
2964e1051a39Sopenharmony_ci	&pxor	($inout0,$inout0);
2965e1051a39Sopenharmony_ci	&pxor	($rndkey1,$rndkey1);
2966e1051a39Sopenharmony_ci	&movups	(&QWP(0,$key_),$ivec);	# output IV
2967e1051a39Sopenharmony_ci	&pxor	($ivec,$ivec);
2968e1051a39Sopenharmony_ci&set_label("cbc_abort");
2969e1051a39Sopenharmony_ci&function_end("${PREFIX}_cbc_encrypt");
2970e1051a39Sopenharmony_ci
2971e1051a39Sopenharmony_ci######################################################################
2972e1051a39Sopenharmony_ci# Mechanical port from aesni-x86_64.pl.
2973e1051a39Sopenharmony_ci#
2974e1051a39Sopenharmony_ci# _aesni_set_encrypt_key is private interface,
2975e1051a39Sopenharmony_ci# input:
2976e1051a39Sopenharmony_ci#	"eax"	const unsigned char *userKey
2977e1051a39Sopenharmony_ci#	$rounds	int bits
2978e1051a39Sopenharmony_ci#	$key	AES_KEY *key
2979e1051a39Sopenharmony_ci# output:
2980e1051a39Sopenharmony_ci#	"eax"	return code
2981e1051a39Sopenharmony_ci#	$rounds	rounds-1 (9, 11 or 13), the same value that is stored in the schedule
2982e1051a39Sopenharmony_ci
2983e1051a39Sopenharmony_ci&function_begin_B("_aesni_set_encrypt_key");
	# Private key-expansion routine: expands *userKey ("eax") of $rounds
	# bits into the schedule at $key. For each key size there are two
	# code paths: one built on aeskeygenassist, and an "_alt" one built on
	# pshufb+aesenclast that is taken when the masked capability word in
	# ebp equals exactly 1<<28 (see the and/cmp pairs below).
	# Returns 0 in eax on success, -1 for a NULL pointer, -2 for an
	# unsupported bit length. ebp and ebx are saved here and restored on
	# every exit path.
2984e1051a39Sopenharmony_ci	&push	("ebp");
2985e1051a39Sopenharmony_ci	&push	("ebx");
2986e1051a39Sopenharmony_ci	&test	("eax","eax");		# userKey == NULL?
2987e1051a39Sopenharmony_ci	&jz	(&label("bad_pointer"));
2988e1051a39Sopenharmony_ci	&test	($key,$key);		# key == NULL?
2989e1051a39Sopenharmony_ci	&jz	(&label("bad_pointer"));
2990e1051a39Sopenharmony_ci
	# call/pop to the next instruction, then add (key_const - pic) so that
	# ebx holds the run-time address of the key_const table.
	# NOTE(review): blindpop is defined outside this chunk; presumably it
	# pops the return address pushed by the call above - confirm.
2991e1051a39Sopenharmony_ci	&call	(&label("pic"));
2992e1051a39Sopenharmony_ci&set_label("pic");
2993e1051a39Sopenharmony_ci	&blindpop("ebx");
2994e1051a39Sopenharmony_ci	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
2995e1051a39Sopenharmony_ci
	# NOTE(review): picmeup is defined outside this chunk; it appears to
	# leave the address of OPENSSL_ia32cap_P in ebp - confirm.
2996e1051a39Sopenharmony_ci	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
2997e1051a39Sopenharmony_ci	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
2998e1051a39Sopenharmony_ci	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
2999e1051a39Sopenharmony_ci	&mov	("ebp",&DWP(4,"ebp"));	# second dword of the capability vector
3000e1051a39Sopenharmony_ci	&lea	($key,&DWP(16,$key));	# bias $key by one slot; round 0 goes to -16($key)
3001e1051a39Sopenharmony_ci	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
	# On entry $rounds holds the key length in bits; dispatch on it.
3002e1051a39Sopenharmony_ci	&cmp	($rounds,256);
3003e1051a39Sopenharmony_ci	&je	(&label("14rounds"));
3004e1051a39Sopenharmony_ci	&cmp	($rounds,192);
3005e1051a39Sopenharmony_ci	&je	(&label("12rounds"));
3006e1051a39Sopenharmony_ci	&cmp	($rounds,128);
3007e1051a39Sopenharmony_ci	&jne	(&label("bad_keybits"));
3008e1051a39Sopenharmony_ci
3009e1051a39Sopenharmony_ci&set_label("10rounds",16);
	# 128-bit key. Bit 28 set with bit 11 clear selects the _alt path.
3010e1051a39Sopenharmony_ci	&cmp		("ebp",1<<28);
3011e1051a39Sopenharmony_ci	&je		(&label("10rounds_alt"));
3012e1051a39Sopenharmony_ci
3013e1051a39Sopenharmony_ci	&mov		($rounds,9);			# value stored in the schedule below
3014e1051a39Sopenharmony_ci	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
3015e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
3016e1051a39Sopenharmony_ci	&call		(&label("key_128_cold"));
3017e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
3018e1051a39Sopenharmony_ci	&call		(&label("key_128"));
3019e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
3020e1051a39Sopenharmony_ci	&call		(&label("key_128"));
3021e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
3022e1051a39Sopenharmony_ci	&call		(&label("key_128"));
3023e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
3024e1051a39Sopenharmony_ci	&call		(&label("key_128"));
3025e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
3026e1051a39Sopenharmony_ci	&call		(&label("key_128"));
3027e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
3028e1051a39Sopenharmony_ci	&call		(&label("key_128"));
3029e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
3030e1051a39Sopenharmony_ci	&call		(&label("key_128"));
3031e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
3032e1051a39Sopenharmony_ci	&call		(&label("key_128"));
3033e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
3034e1051a39Sopenharmony_ci	&call		(&label("key_128"));
	# key_128 ran 9 times (+16 each) on top of the initial +16 bias, so
	# $key is 160 bytes past the start of the schedule here: the last
	# round key lands at byte 160 and the rounds value at byte 240.
3035e1051a39Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
3036e1051a39Sopenharmony_ci	&mov		(&DWP(80,$key),$rounds);
3037e1051a39Sopenharmony_ci
3038e1051a39Sopenharmony_ci	&jmp	(&label("good_key"));
3039e1051a39Sopenharmony_ci
3040e1051a39Sopenharmony_ci&set_label("key_128",16);
	# Helper: store the current round key (xmm0), advance $key by 16, then
	# derive the next round key from xmm0 and the aeskeygenassist result in
	# xmm1. The key_128_cold entry skips the store/advance.
3041e1051a39Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
3042e1051a39Sopenharmony_ci	&lea		($key,&DWP(16,$key));
3043e1051a39Sopenharmony_ci&set_label("key_128_cold");
3044e1051a39Sopenharmony_ci	&shufps		("xmm4","xmm0",0b00010000);
3045e1051a39Sopenharmony_ci	&xorps		("xmm0","xmm4");
3046e1051a39Sopenharmony_ci	&shufps		("xmm4","xmm0",0b10001100);
3047e1051a39Sopenharmony_ci	&xorps		("xmm0","xmm4");
3048e1051a39Sopenharmony_ci	&shufps		("xmm1","xmm1",0b11111111);	# critical path
3049e1051a39Sopenharmony_ci	&xorps		("xmm0","xmm1");
3050e1051a39Sopenharmony_ci	&ret();
3051e1051a39Sopenharmony_ci
3052e1051a39Sopenharmony_ci&set_label("10rounds_alt",16);
	# 128-bit key, pshufb+aesenclast variant. xmm5 = shuffle mask at
	# key_const+0x00, xmm4 = round constant at key_const+0x20, xmm2 = copy
	# of the previous round key.
3053e1051a39Sopenharmony_ci	&movdqa		("xmm5",&QWP(0x00,"ebx"));
3054e1051a39Sopenharmony_ci	&mov		($rounds,8);			# loop counter: 8 iterations
3055e1051a39Sopenharmony_ci	&movdqa		("xmm4",&QWP(0x20,"ebx"));
3056e1051a39Sopenharmony_ci	&movdqa		("xmm2","xmm0");
3057e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16,$key),"xmm0");	# round 0
3058e1051a39Sopenharmony_ci
3059e1051a39Sopenharmony_ci&set_label("loop_key128");
3060e1051a39Sopenharmony_ci	&pshufb		("xmm0","xmm5");
3061e1051a39Sopenharmony_ci	&aesenclast	("xmm0","xmm4");
3062e1051a39Sopenharmony_ci	&pslld		("xmm4",1);			# double the round constant for the next iteration
3063e1051a39Sopenharmony_ci	&lea		($key,&DWP(16,$key));
3064e1051a39Sopenharmony_ci
	# Accumulate the previous round key (xmm2) with copies of itself
	# shifted left by 4, 8 and 12 bytes.
3065e1051a39Sopenharmony_ci	&movdqa		("xmm3","xmm2");
3066e1051a39Sopenharmony_ci	&pslldq		("xmm2",4);
3067e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm2");
3068e1051a39Sopenharmony_ci	&pslldq		("xmm2",4);
3069e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm2");
3070e1051a39Sopenharmony_ci	&pslldq		("xmm2",4);
3071e1051a39Sopenharmony_ci	&pxor		("xmm2","xmm3");
3072e1051a39Sopenharmony_ci
3073e1051a39Sopenharmony_ci	&pxor		("xmm0","xmm2");
3074e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16,$key),"xmm0");	# store the new round key
3075e1051a39Sopenharmony_ci	&movdqa		("xmm2","xmm0");
3076e1051a39Sopenharmony_ci
3077e1051a39Sopenharmony_ci	&dec		($rounds);
3078e1051a39Sopenharmony_ci	&jnz		(&label("loop_key128"));
3079e1051a39Sopenharmony_ci
	# The last two round keys use the constant at key_const+0x30 (0x1b)
	# and its doubling, matching the 0x1b/0x36 immediates of the
	# aeskeygenassist path above.
3080e1051a39Sopenharmony_ci	&movdqa		("xmm4",&QWP(0x30,"ebx"));
3081e1051a39Sopenharmony_ci
3082e1051a39Sopenharmony_ci	&pshufb		("xmm0","xmm5");
3083e1051a39Sopenharmony_ci	&aesenclast	("xmm0","xmm4");
3084e1051a39Sopenharmony_ci	&pslld		("xmm4",1);
3085e1051a39Sopenharmony_ci
3086e1051a39Sopenharmony_ci	&movdqa		("xmm3","xmm2");
3087e1051a39Sopenharmony_ci	&pslldq		("xmm2",4);
3088e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm2");
3089e1051a39Sopenharmony_ci	&pslldq		("xmm2",4);
3090e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm2");
3091e1051a39Sopenharmony_ci	&pslldq		("xmm2",4);
3092e1051a39Sopenharmony_ci	&pxor		("xmm2","xmm3");
3093e1051a39Sopenharmony_ci
3094e1051a39Sopenharmony_ci	&pxor		("xmm0","xmm2");
3095e1051a39Sopenharmony_ci	&movdqu		(&QWP(0,$key),"xmm0");
3096e1051a39Sopenharmony_ci
3097e1051a39Sopenharmony_ci	&movdqa		("xmm2","xmm0");
3098e1051a39Sopenharmony_ci	&pshufb		("xmm0","xmm5");
3099e1051a39Sopenharmony_ci	&aesenclast	("xmm0","xmm4");
3100e1051a39Sopenharmony_ci
3101e1051a39Sopenharmony_ci	&movdqa		("xmm3","xmm2");
3102e1051a39Sopenharmony_ci	&pslldq		("xmm2",4);
3103e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm2");
3104e1051a39Sopenharmony_ci	&pslldq		("xmm2",4);
3105e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm2");
3106e1051a39Sopenharmony_ci	&pslldq		("xmm2",4);
3107e1051a39Sopenharmony_ci	&pxor		("xmm2","xmm3");
3108e1051a39Sopenharmony_ci
3109e1051a39Sopenharmony_ci	&pxor		("xmm0","xmm2");
3110e1051a39Sopenharmony_ci	&movdqu		(&QWP(16,$key),"xmm0");
3111e1051a39Sopenharmony_ci
	# $rounds was consumed as the loop counter; reload the value to store.
3112e1051a39Sopenharmony_ci	&mov		($rounds,9);
3113e1051a39Sopenharmony_ci	&mov		(&DWP(96,$key),$rounds);
3114e1051a39Sopenharmony_ci
3115e1051a39Sopenharmony_ci	&jmp	(&label("good_key"));
3116e1051a39Sopenharmony_ci
3117e1051a39Sopenharmony_ci&set_label("12rounds",16);
	# 192-bit key.
3118e1051a39Sopenharmony_ci	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
3119e1051a39Sopenharmony_ci	&cmp		("ebp",1<<28);
3120e1051a39Sopenharmony_ci	&je		(&label("12rounds_alt"));
3121e1051a39Sopenharmony_ci
3122e1051a39Sopenharmony_ci	&mov		($rounds,11);
3123e1051a39Sopenharmony_ci	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
3124e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
3125e1051a39Sopenharmony_ci	&call		(&label("key_192a_cold"));
3126e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
3127e1051a39Sopenharmony_ci	&call		(&label("key_192b"));
3128e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
3129e1051a39Sopenharmony_ci	&call		(&label("key_192a"));
3130e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
3131e1051a39Sopenharmony_ci	&call		(&label("key_192b"));
3132e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
3133e1051a39Sopenharmony_ci	&call		(&label("key_192a"));
3134e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
3135e1051a39Sopenharmony_ci	&call		(&label("key_192b"));
3136e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
3137e1051a39Sopenharmony_ci	&call		(&label("key_192a"));
3138e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
3139e1051a39Sopenharmony_ci	&call		(&label("key_192b"));
3140e1051a39Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
3141e1051a39Sopenharmony_ci	&mov		(&DWP(48,$key),$rounds);
3142e1051a39Sopenharmony_ci
3143e1051a39Sopenharmony_ci	&jmp	(&label("good_key"));
3144e1051a39Sopenharmony_ci
3145e1051a39Sopenharmony_ci&set_label("key_192a",16);
	# Helper: store xmm0 as one 16-byte slot and advance $key by 16. The
	# key_192a_cold entry skips the store. Both save xmm2 in xmm5, which
	# key_192b later combines with xmm0 for its first stored slot.
3146e1051a39Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
3147e1051a39Sopenharmony_ci	&lea		($key,&DWP(16,$key));
3148e1051a39Sopenharmony_ci&set_label("key_192a_cold",16);
3149e1051a39Sopenharmony_ci	&movaps		("xmm5","xmm2");
3150e1051a39Sopenharmony_ci&set_label("key_192b_warm");
	# Shared tail: derive the next 192 bits of key material in xmm0 (full)
	# and xmm2 from the aeskeygenassist result in xmm1.
3151e1051a39Sopenharmony_ci	&shufps		("xmm4","xmm0",0b00010000);
3152e1051a39Sopenharmony_ci	&movdqa		("xmm3","xmm2");
3153e1051a39Sopenharmony_ci	&xorps		("xmm0","xmm4");
3154e1051a39Sopenharmony_ci	&shufps		("xmm4","xmm0",0b10001100);
3155e1051a39Sopenharmony_ci	&pslldq		("xmm3",4);
3156e1051a39Sopenharmony_ci	&xorps		("xmm0","xmm4");
3157e1051a39Sopenharmony_ci	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
3158e1051a39Sopenharmony_ci	&pxor		("xmm2","xmm3");
3159e1051a39Sopenharmony_ci	&pxor		("xmm0","xmm1");
3160e1051a39Sopenharmony_ci	&pshufd		("xmm3","xmm0",0b11111111);
3161e1051a39Sopenharmony_ci	&pxor		("xmm2","xmm3");
3162e1051a39Sopenharmony_ci	&ret();
3163e1051a39Sopenharmony_ci
3164e1051a39Sopenharmony_ci&set_label("key_192b",16);
	# Helper: store two 16-byte slots assembled by shufps from
	# xmm5/xmm0 and xmm0/xmm2, advance $key by 32, then continue in the
	# shared tail at key_192b_warm.
3165e1051a39Sopenharmony_ci	&movaps		("xmm3","xmm0");
3166e1051a39Sopenharmony_ci	&shufps		("xmm5","xmm0",0b01000100);
3167e1051a39Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm5");
3168e1051a39Sopenharmony_ci	&shufps		("xmm3","xmm2",0b01001110);
3169e1051a39Sopenharmony_ci	&$movekey	(&QWP(16,$key),"xmm3");
3170e1051a39Sopenharmony_ci	&lea		($key,&DWP(32,$key));
3171e1051a39Sopenharmony_ci	&jmp		(&label("key_192b_warm"));
3172e1051a39Sopenharmony_ci
3173e1051a39Sopenharmony_ci&set_label("12rounds_alt",16);
	# 192-bit key, pshufb+aesenclast variant. xmm5 = shuffle mask at
	# key_const+0x10, xmm4 = round constant at key_const+0x20. Each of the
	# 8 iterations emits 24 bytes of schedule.
3174e1051a39Sopenharmony_ci	&movdqa		("xmm5",&QWP(0x10,"ebx"));
3175e1051a39Sopenharmony_ci	&movdqa		("xmm4",&QWP(0x20,"ebx"));
3176e1051a39Sopenharmony_ci	&mov		($rounds,8);			# loop counter
3177e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16,$key),"xmm0");
3178e1051a39Sopenharmony_ci
3179e1051a39Sopenharmony_ci&set_label("loop_key192");
3180e1051a39Sopenharmony_ci	&movq		(&QWP(0,$key),"xmm2");		# store low 64 bits of xmm2
3181e1051a39Sopenharmony_ci	&movdqa		("xmm1","xmm2");
3182e1051a39Sopenharmony_ci	&pshufb		("xmm2","xmm5");
3183e1051a39Sopenharmony_ci	&aesenclast	("xmm2","xmm4");
3184e1051a39Sopenharmony_ci	&pslld		("xmm4",1);			# double the round constant
3185e1051a39Sopenharmony_ci	&lea		($key,&DWP(24,$key));
3186e1051a39Sopenharmony_ci
3187e1051a39Sopenharmony_ci	&movdqa		("xmm3","xmm0");
3188e1051a39Sopenharmony_ci	&pslldq		("xmm0",4);
3189e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm0");
3190e1051a39Sopenharmony_ci	&pslldq		("xmm0",4);
3191e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm0");
3192e1051a39Sopenharmony_ci	&pslldq		("xmm0",4);
3193e1051a39Sopenharmony_ci	&pxor		("xmm0","xmm3");
3194e1051a39Sopenharmony_ci
3195e1051a39Sopenharmony_ci	&pshufd		("xmm3","xmm0",0xff);
3196e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm1");
3197e1051a39Sopenharmony_ci	&pslldq		("xmm1",4);
3198e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm1");
3199e1051a39Sopenharmony_ci
3200e1051a39Sopenharmony_ci	&pxor		("xmm0","xmm2");
3201e1051a39Sopenharmony_ci	&pxor		("xmm2","xmm3");
3202e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16,$key),"xmm0");
3203e1051a39Sopenharmony_ci
3204e1051a39Sopenharmony_ci	&dec		($rounds);
3205e1051a39Sopenharmony_ci	&jnz		(&label("loop_key192"));
3206e1051a39Sopenharmony_ci
	# $rounds was consumed as the loop counter; reload the value to store.
3207e1051a39Sopenharmony_ci	&mov	($rounds,11);
3208e1051a39Sopenharmony_ci	&mov	(&DWP(32,$key),$rounds);
3209e1051a39Sopenharmony_ci
3210e1051a39Sopenharmony_ci	&jmp	(&label("good_key"));
3211e1051a39Sopenharmony_ci
3212e1051a39Sopenharmony_ci&set_label("14rounds",16);
	# 256-bit key. $key is biased by a second slot so that the two halves
	# of the user key are stored at -32($key) and -16($key).
3213e1051a39Sopenharmony_ci	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
3214e1051a39Sopenharmony_ci	&lea		($key,&DWP(16,$key));
3215e1051a39Sopenharmony_ci	&cmp		("ebp",1<<28);
3216e1051a39Sopenharmony_ci	&je		(&label("14rounds_alt"));
3217e1051a39Sopenharmony_ci
3218e1051a39Sopenharmony_ci	&mov		($rounds,13);
3219e1051a39Sopenharmony_ci	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
3220e1051a39Sopenharmony_ci	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
3221e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
3222e1051a39Sopenharmony_ci	&call		(&label("key_256a_cold"));
3223e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
3224e1051a39Sopenharmony_ci	&call		(&label("key_256b"));
3225e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
3226e1051a39Sopenharmony_ci	&call		(&label("key_256a"));
3227e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
3228e1051a39Sopenharmony_ci	&call		(&label("key_256b"));
3229e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
3230e1051a39Sopenharmony_ci	&call		(&label("key_256a"));
3231e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
3232e1051a39Sopenharmony_ci	&call		(&label("key_256b"));
3233e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
3234e1051a39Sopenharmony_ci	&call		(&label("key_256a"));
3235e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
3236e1051a39Sopenharmony_ci	&call		(&label("key_256b"));
3237e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
3238e1051a39Sopenharmony_ci	&call		(&label("key_256a"));
3239e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
3240e1051a39Sopenharmony_ci	&call		(&label("key_256b"));
3241e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
3242e1051a39Sopenharmony_ci	&call		(&label("key_256a"));
3243e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
3244e1051a39Sopenharmony_ci	&call		(&label("key_256b"));
3245e1051a39Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
3246e1051a39Sopenharmony_ci	&call		(&label("key_256a"));
3247e1051a39Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
3248e1051a39Sopenharmony_ci	&mov		(&DWP(16,$key),$rounds);
3249e1051a39Sopenharmony_ci	&xor		("eax","eax");			# also done at good_key; redundant but harmless
3250e1051a39Sopenharmony_ci
3251e1051a39Sopenharmony_ci	&jmp	(&label("good_key"));
3252e1051a39Sopenharmony_ci
3253e1051a39Sopenharmony_ci&set_label("key_256a",16);
	# Helper: store xmm2, advance $key by 16, then update xmm0 from xmm1.
	# The key_256a_cold entry skips the store/advance.
3254e1051a39Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm2");
3255e1051a39Sopenharmony_ci	&lea		($key,&DWP(16,$key));
3256e1051a39Sopenharmony_ci&set_label("key_256a_cold");
3257e1051a39Sopenharmony_ci	&shufps		("xmm4","xmm0",0b00010000);
3258e1051a39Sopenharmony_ci	&xorps		("xmm0","xmm4");
3259e1051a39Sopenharmony_ci	&shufps		("xmm4","xmm0",0b10001100);
3260e1051a39Sopenharmony_ci	&xorps		("xmm0","xmm4");
3261e1051a39Sopenharmony_ci	&shufps		("xmm1","xmm1",0b11111111);	# critical path
3262e1051a39Sopenharmony_ci	&xorps		("xmm0","xmm1");
3263e1051a39Sopenharmony_ci	&ret();
3264e1051a39Sopenharmony_ci
3265e1051a39Sopenharmony_ci&set_label("key_256b",16);
	# Helper: store xmm0, advance $key by 16, then update xmm2 from xmm1
	# (note the different dword broadcast from xmm1 than in key_256a).
3266e1051a39Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
3267e1051a39Sopenharmony_ci	&lea		($key,&DWP(16,$key));
3268e1051a39Sopenharmony_ci
3269e1051a39Sopenharmony_ci	&shufps		("xmm4","xmm2",0b00010000);
3270e1051a39Sopenharmony_ci	&xorps		("xmm2","xmm4");
3271e1051a39Sopenharmony_ci	&shufps		("xmm4","xmm2",0b10001100);
3272e1051a39Sopenharmony_ci	&xorps		("xmm2","xmm4");
3273e1051a39Sopenharmony_ci	&shufps		("xmm1","xmm1",0b10101010);	# critical path
3274e1051a39Sopenharmony_ci	&xorps		("xmm2","xmm1");
3275e1051a39Sopenharmony_ci	&ret();
3276e1051a39Sopenharmony_ci
3277e1051a39Sopenharmony_ci&set_label("14rounds_alt",16);
	# 256-bit key, pshufb+aesenclast variant. xmm5 = shuffle mask at
	# key_const+0x00, xmm4 = round constant at key_const+0x20. xmm0 and
	# xmm1 carry the two most recent 16-byte round keys.
3278e1051a39Sopenharmony_ci	&movdqa		("xmm5",&QWP(0x00,"ebx"));
3279e1051a39Sopenharmony_ci	&movdqa		("xmm4",&QWP(0x20,"ebx"));
3280e1051a39Sopenharmony_ci	&mov		($rounds,7);			# loop counter
3281e1051a39Sopenharmony_ci	&movdqu		(&QWP(-32,$key),"xmm0");
3282e1051a39Sopenharmony_ci	&movdqa		("xmm1","xmm2");
3283e1051a39Sopenharmony_ci	&movdqu		(&QWP(-16,$key),"xmm2");
3284e1051a39Sopenharmony_ci
3285e1051a39Sopenharmony_ci&set_label("loop_key256");
3286e1051a39Sopenharmony_ci	&pshufb		("xmm2","xmm5");
3287e1051a39Sopenharmony_ci	&aesenclast	("xmm2","xmm4");
3288e1051a39Sopenharmony_ci
3289e1051a39Sopenharmony_ci	&movdqa		("xmm3","xmm0");
3290e1051a39Sopenharmony_ci	&pslldq		("xmm0",4);
3291e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm0");
3292e1051a39Sopenharmony_ci	&pslldq		("xmm0",4);
3293e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm0");
3294e1051a39Sopenharmony_ci	&pslldq		("xmm0",4);
3295e1051a39Sopenharmony_ci	&pxor		("xmm0","xmm3");
3296e1051a39Sopenharmony_ci	&pslld		("xmm4",1);			# double the round constant
3297e1051a39Sopenharmony_ci
3298e1051a39Sopenharmony_ci	&pxor		("xmm0","xmm2");
3299e1051a39Sopenharmony_ci	&movdqu		(&QWP(0,$key),"xmm0");
3300e1051a39Sopenharmony_ci
	# The 7th pass produces only the first of the two round keys, then exits.
3301e1051a39Sopenharmony_ci	&dec		($rounds);
3302e1051a39Sopenharmony_ci	&jz		(&label("done_key256"));
3303e1051a39Sopenharmony_ci
	# Second round key of the pair: aesenclast with an all-zero round key
	# (xmm3) on the broadcast top dword of xmm0, no round constant.
3304e1051a39Sopenharmony_ci	&pshufd		("xmm2","xmm0",0xff);
3305e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm3");
3306e1051a39Sopenharmony_ci	&aesenclast	("xmm2","xmm3");
3307e1051a39Sopenharmony_ci
3308e1051a39Sopenharmony_ci	&movdqa		("xmm3","xmm1");
3309e1051a39Sopenharmony_ci	&pslldq		("xmm1",4);
3310e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm1");
3311e1051a39Sopenharmony_ci	&pslldq		("xmm1",4);
3312e1051a39Sopenharmony_ci	&pxor		("xmm3","xmm1");
3313e1051a39Sopenharmony_ci	&pslldq		("xmm1",4);
3314e1051a39Sopenharmony_ci	&pxor		("xmm1","xmm3");
3315e1051a39Sopenharmony_ci
3316e1051a39Sopenharmony_ci	&pxor		("xmm2","xmm1");
3317e1051a39Sopenharmony_ci	&movdqu		(&QWP(16,$key),"xmm2");
3318e1051a39Sopenharmony_ci	&lea		($key,&DWP(32,$key));
3319e1051a39Sopenharmony_ci	&movdqa		("xmm1","xmm2");
3320e1051a39Sopenharmony_ci	&jmp		(&label("loop_key256"));
3321e1051a39Sopenharmony_ci
3322e1051a39Sopenharmony_ci&set_label("done_key256");
	# $rounds was consumed as the loop counter; reload the value to store.
3323e1051a39Sopenharmony_ci	&mov		($rounds,13);
3324e1051a39Sopenharmony_ci	&mov		(&DWP(16,$key),$rounds);
3325e1051a39Sopenharmony_ci
3326e1051a39Sopenharmony_ci&set_label("good_key");
	# Success exit: zero xmm0-xmm5, which held key material and
	# intermediates, return 0 in eax and restore the saved registers.
3327e1051a39Sopenharmony_ci	&pxor	("xmm0","xmm0");
3328e1051a39Sopenharmony_ci	&pxor	("xmm1","xmm1");
3329e1051a39Sopenharmony_ci	&pxor	("xmm2","xmm2");
3330e1051a39Sopenharmony_ci	&pxor	("xmm3","xmm3");
3331e1051a39Sopenharmony_ci	&pxor	("xmm4","xmm4");
3332e1051a39Sopenharmony_ci	&pxor	("xmm5","xmm5");
3333e1051a39Sopenharmony_ci	&xor	("eax","eax");
3334e1051a39Sopenharmony_ci	&pop	("ebx");
3335e1051a39Sopenharmony_ci	&pop	("ebp");
3336e1051a39Sopenharmony_ci	&ret	();
3337e1051a39Sopenharmony_ci
3338e1051a39Sopenharmony_ci&set_label("bad_pointer",4);
	# NULL userKey or key: return -1. No xmm register has been loaded yet.
3339e1051a39Sopenharmony_ci	&mov	("eax",-1);
3340e1051a39Sopenharmony_ci	&pop	("ebx");
3341e1051a39Sopenharmony_ci	&pop	("ebp");
3342e1051a39Sopenharmony_ci	&ret	();
3343e1051a39Sopenharmony_ci&set_label("bad_keybits",4);
	# Unsupported bit length: return -2. xmm0 already holds the first 128
	# bits of the user key at this point, so it is cleared first.
3344e1051a39Sopenharmony_ci	&pxor	("xmm0","xmm0");
3345e1051a39Sopenharmony_ci	&mov	("eax",-2);
3346e1051a39Sopenharmony_ci	&pop	("ebx");
3347e1051a39Sopenharmony_ci	&pop	("ebp");
3348e1051a39Sopenharmony_ci	&ret	();
3349e1051a39Sopenharmony_ci&function_end_B("_aesni_set_encrypt_key");
3350e1051a39Sopenharmony_ci
3351e1051a39Sopenharmony_ci# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
3352e1051a39Sopenharmony_ci#                              AES_KEY *key)
3353e1051a39Sopenharmony_ci&function_begin_B("${PREFIX}_set_encrypt_key");
	# Public entry point: move the three stack arguments into the
	# registers the private routine expects and return its eax unchanged
	# (0 on success, -1 for a NULL pointer, -2 for an unsupported bit
	# length).
3354e1051a39Sopenharmony_ci	&mov	("eax",&wparam(0));	# userKey
3355e1051a39Sopenharmony_ci	&mov	($rounds,&wparam(1));	# bits
3356e1051a39Sopenharmony_ci	&mov	($key,&wparam(2));	# key
3357e1051a39Sopenharmony_ci	&call	("_aesni_set_encrypt_key");
3358e1051a39Sopenharmony_ci	&ret	();
3359e1051a39Sopenharmony_ci&function_end_B("${PREFIX}_set_encrypt_key");
3360e1051a39Sopenharmony_ci
3361e1051a39Sopenharmony_ci# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
3362e1051a39Sopenharmony_ci#                              AES_KEY *key)
3363e1051a39Sopenharmony_ci&function_begin_B("${PREFIX}_set_decrypt_key");
	# Public entry point: build the encryption schedule, then convert it in
	# place into a decryption schedule by reversing the order of the round
	# keys and applying aesimc to every round key except the first and the
	# last. If the expansion fails, its error code in eax is returned
	# without running the conversion.
3364e1051a39Sopenharmony_ci	&mov	("eax",&wparam(0));	# userKey
3365e1051a39Sopenharmony_ci	&mov	($rounds,&wparam(1));	# bits
3366e1051a39Sopenharmony_ci	&mov	($key,&wparam(2));	# key
3367e1051a39Sopenharmony_ci	&call	("_aesni_set_encrypt_key");
3368e1051a39Sopenharmony_ci	&mov	($key,&wparam(2));	# reload: the callee advanced $key
3369e1051a39Sopenharmony_ci	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
3370e1051a39Sopenharmony_ci	&test	("eax","eax");		# non-zero means the expansion failed
3371e1051a39Sopenharmony_ci	&jnz	(&label("dec_key_ret"));
3372e1051a39Sopenharmony_ci	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
3373e1051a39Sopenharmony_ci
	# $key walks up from the first round key, eax walks down from the last.
3374e1051a39Sopenharmony_ci	&$movekey	("xmm0",&QWP(0,$key));	# just swap
3375e1051a39Sopenharmony_ci	&$movekey	("xmm1",&QWP(0,"eax"));
3376e1051a39Sopenharmony_ci	&$movekey	(&QWP(0,"eax"),"xmm0");
3377e1051a39Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm1");
3378e1051a39Sopenharmony_ci	&lea		($key,&DWP(16,$key));
3379e1051a39Sopenharmony_ci	&lea		("eax",&DWP(-16,"eax"));
3380e1051a39Sopenharmony_ci
3381e1051a39Sopenharmony_ci&set_label("dec_key_inverse");
3382e1051a39Sopenharmony_ci	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
3383e1051a39Sopenharmony_ci	&$movekey	("xmm1",&QWP(0,"eax"));
3384e1051a39Sopenharmony_ci	&aesimc		("xmm0","xmm0");
3385e1051a39Sopenharmony_ci	&aesimc		("xmm1","xmm1");
3386e1051a39Sopenharmony_ci	&lea		($key,&DWP(16,$key));
3387e1051a39Sopenharmony_ci	&lea		("eax",&DWP(-16,"eax"));
	# Both pointers were already stepped, hence the +16/-16 displacements.
3388e1051a39Sopenharmony_ci	&$movekey	(&QWP(16,"eax"),"xmm0");
3389e1051a39Sopenharmony_ci	&$movekey	(&QWP(-16,$key),"xmm1");
3390e1051a39Sopenharmony_ci	&cmp		("eax",$key);
3391e1051a39Sopenharmony_ci	&ja		(&label("dec_key_inverse"));	# loop until the pointers meet
3392e1051a39Sopenharmony_ci
3393e1051a39Sopenharmony_ci	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
3394e1051a39Sopenharmony_ci	&aesimc		("xmm0","xmm0");
3395e1051a39Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
3396e1051a39Sopenharmony_ci
	# Clear the registers that held round-key material.
3397e1051a39Sopenharmony_ci	&pxor		("xmm0","xmm0");
3398e1051a39Sopenharmony_ci	&pxor		("xmm1","xmm1");
3399e1051a39Sopenharmony_ci	&xor		("eax","eax");		# return success
3400e1051a39Sopenharmony_ci&set_label("dec_key_ret");
3401e1051a39Sopenharmony_ci	&ret	();
3402e1051a39Sopenharmony_ci&function_end_B("${PREFIX}_set_decrypt_key");
3403e1051a39Sopenharmony_ci
3404e1051a39Sopenharmony_ci&set_label("key_const",64);
# Constant table used by the "_alt" key-expansion paths, addressed
# relative to ebx in _aesni_set_encrypt_key.
# +0x00: pshufb mask loaded into xmm5 by 10rounds_alt and 14rounds_alt
3405e1051a39Sopenharmony_ci&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
# +0x10: pshufb mask loaded into xmm5 by 12rounds_alt
3406e1051a39Sopenharmony_ci&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
# +0x20: initial round constant loaded into xmm4; doubled with pslld in the loops
3407e1051a39Sopenharmony_ci&data_word(1,1,1,1);
# +0x30: round constant loaded by 10rounds_alt after its 8-iteration loop
3408e1051a39Sopenharmony_ci&data_word(0x1b,0x1b,0x1b,0x1b);
# Identification string emitted after the table.
3409e1051a39Sopenharmony_ci&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
3410e1051a39Sopenharmony_ci
# NOTE(review): asm_finish is defined outside this chunk; presumably it
# emits the accumulated assembly - confirm.
3411e1051a39Sopenharmony_ci&asm_finish();
3412e1051a39Sopenharmony_ci
# Fail if closing STDOUT reports an error.
3413e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";
3414