11cb0ef41Sopenharmony_ci#! /usr/bin/env perl
21cb0ef41Sopenharmony_ci# Copyright 2009-2022 The OpenSSL Project Authors. All Rights Reserved.
31cb0ef41Sopenharmony_ci#
41cb0ef41Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
51cb0ef41Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
61cb0ef41Sopenharmony_ci# in the file LICENSE in the source distribution or at
71cb0ef41Sopenharmony_ci# https://www.openssl.org/source/license.html
81cb0ef41Sopenharmony_ci
91cb0ef41Sopenharmony_ci
101cb0ef41Sopenharmony_ci# ====================================================================
111cb0ef41Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
121cb0ef41Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
131cb0ef41Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
141cb0ef41Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
151cb0ef41Sopenharmony_ci# ====================================================================
161cb0ef41Sopenharmony_ci#
171cb0ef41Sopenharmony_ci# This module implements support for Intel AES-NI extension. In
181cb0ef41Sopenharmony_ci# OpenSSL context it's used with Intel engine, but can also be used as
191cb0ef41Sopenharmony_ci# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
201cb0ef41Sopenharmony_ci# details].
211cb0ef41Sopenharmony_ci#
221cb0ef41Sopenharmony_ci# Performance.
231cb0ef41Sopenharmony_ci#
# To start with, see the corresponding paragraph in aesni-x86_64.pl...
# Instead of filling in a table similar to the one found there, I've
# chosen to summarize *comparison* results for raw ECB, CTR and CBC
# benchmarks. The simplified table below shows 32-bit performance
# relative to 64-bit performance at each given point. Ratios vary
# between encryption modes, hence the values are given as intervals.
301cb0ef41Sopenharmony_ci#
311cb0ef41Sopenharmony_ci#	16-byte     64-byte     256-byte    1-KB        8-KB
321cb0ef41Sopenharmony_ci#	53-67%      67-84%      91-94%      95-98%      97-99.5%
331cb0ef41Sopenharmony_ci#
341cb0ef41Sopenharmony_ci# Lower ratios for smaller block sizes are perfectly understandable,
351cb0ef41Sopenharmony_ci# because function call overhead is higher in 32-bit mode. Largest
361cb0ef41Sopenharmony_ci# 8-KB block performance is virtually same: 32-bit code is less than
371cb0ef41Sopenharmony_ci# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
381cb0ef41Sopenharmony_ci
391cb0ef41Sopenharmony_ci# January 2011
401cb0ef41Sopenharmony_ci#
411cb0ef41Sopenharmony_ci# See aesni-x86_64.pl for details. Unlike x86_64 version this module
421cb0ef41Sopenharmony_ci# interleaves at most 6 aes[enc|dec] instructions, because there are
431cb0ef41Sopenharmony_ci# not enough registers for 8x interleave [which should be optimal for
441cb0ef41Sopenharmony_ci# Sandy Bridge]. Actually, performance results for 6x interleave
451cb0ef41Sopenharmony_ci# factor presented in aesni-x86_64.pl (except for CTR) are for this
461cb0ef41Sopenharmony_ci# module.
471cb0ef41Sopenharmony_ci
481cb0ef41Sopenharmony_ci# April 2011
491cb0ef41Sopenharmony_ci#
501cb0ef41Sopenharmony_ci# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
511cb0ef41Sopenharmony_ci# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
521cb0ef41Sopenharmony_ci
531cb0ef41Sopenharmony_ci# November 2015
541cb0ef41Sopenharmony_ci#
551cb0ef41Sopenharmony_ci# Add aesni_ocb_[en|de]crypt.
561cb0ef41Sopenharmony_ci
571cb0ef41Sopenharmony_ci######################################################################
581cb0ef41Sopenharmony_ci# Current large-block performance in cycles per byte processed with
591cb0ef41Sopenharmony_ci# 128-bit key (less is better).
601cb0ef41Sopenharmony_ci#
611cb0ef41Sopenharmony_ci#		CBC en-/decrypt	CTR	XTS	ECB	OCB
621cb0ef41Sopenharmony_ci# Westmere	3.77/1.37	1.37	1.52	1.27
631cb0ef41Sopenharmony_ci# * Bridge	5.07/0.98	0.99	1.09	0.91	1.10
641cb0ef41Sopenharmony_ci# Haswell	4.44/0.80	0.97	1.03	0.72	0.76
651cb0ef41Sopenharmony_ci# Skylake	2.68/0.65	0.65	0.66	0.64	0.66
661cb0ef41Sopenharmony_ci# Silvermont	5.77/3.56	3.67	4.03	3.46	4.03
671cb0ef41Sopenharmony_ci# Goldmont	3.84/1.39	1.39	1.63	1.31	1.70
681cb0ef41Sopenharmony_ci# Bulldozer	5.80/0.98	1.05	1.24	0.93	1.23
691cb0ef41Sopenharmony_ci
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

# Make x86asm.pl loadable: search the directory this script was started
# from as well as ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, if any, names the output file and
# STDOUT is redirected to it. A failed open is fatal: carrying on would
# lose the generated code without any diagnostic.
if ($output = pop) {
    open(STDOUT,">",$output) or die "can't open $output: $!";
}

# First remaining argument is handed to perlasm unchanged
# (presumably the assembler flavour - confirm against x86asm.pl).
&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# $movekey is the instruction used to load round keys. Both flavours
# currently select movups; the conditional is the single place to
# change should they ever need to differ.
if ($PREFIX eq "aesni")	{ $movekey=\&movups; }
else			{ $movekey=\&movups; }

# General-purpose register assignment.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

# XMM register assignment. Note that $in1/$in0/$ivec are aliases for
# the same registers as $inout3/$inout4/$inout5.
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";	$ivec="xmm7";
# AESNI extension
#
# The instructions below are emitted as raw bytes through data_byte.
# Only the register-to-register form with xmm0-xmm7 operands can be
# encoded.

# aeskeygenassist(dst,src,imm): emit 66 0F 3A DF /r ib with a
# register-direct ModR/M byte (0xc0|dst<<3|src) followed by $imm.
# Dies if the operands are not both xmm0-xmm7, rather than silently
# leaving the instruction out of the generated code.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
    else
    {	die "aeskeygenassist: cannot encode operands '$dst','$src'";	}
}
# aescommon(opcodelet,dst,src): emit 66 0F 38 <opcodelet> /r with a
# register-direct ModR/M byte (0xc0|dst<<3|src). Shared encoder for the
# one-byte-opcode AES instructions defined right below.
# Dies if the operands are not both xmm0-xmm7, rather than silently
# leaving the instruction out of the generated code.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
    else
    {	die sprintf("aes opcode 0x%02x: cannot encode operands '%s','%s'",
		    $opcodelet,$dst,$src);
    }
}
# Mnemonic wrappers, all called as (dst,src); they differ only in the
# opcode byte passed to aescommon.
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }
sub aesdec	{ aescommon(0xde,@_); }
sub aesdeclast	{ aescommon(0xdf,@_); }
1221cb0ef41Sopenharmony_ci
# Inline version of internal aesni_[en|de]crypt1
#
# aesni_inline_generate1(p[,inout[,ivec]]) emits, at the point of the
# call, a rolled one-block cipher loop instead of a call to
# _aesni_[en|de]crypt1.
#
#   p      "enc" or "dec"; selects aes<p> / aes<p>last
#   inout  register holding the block, defaults to $inout0
#   ivec   optional register; when given, round key 0 is xor-ed into
#          it and the result is xor-ed into inout (so ivec is modified)
#
# The emitted code advances $key and counts $rounds down to zero.
#
# The AES emitters are resolved to code references once, so that an
# error raised while emitting an instruction propagates to the caller
# instead of being discarded by a string eval.
{ my $expansions;	# copies emitted so far, keeps loop labels unique
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my $loop;

    $inout=$inout0	unless (defined($inout));
    $loop="${p}1_loop_".(++$expansions);

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    if (defined($ivec))
    {	&xorps		($ivec,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&xorps		($inout,$ivec);
    }
    else
    {	&lea		($key,&DWP(32,$key));
	&xorps		($inout,$rndkey0);
    }
    &set_label($loop);
	$round->($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));	# lea leaves flags from dec intact
    &jnz		(&label($loop));
    $final->($inout,$rndkey1);
}}
1431cb0ef41Sopenharmony_ci
# aesni_generate1(p[,inout]) emits the private function
# _aesni_<p>rypt1, a fully unrolled one-block cipher.
#
#   p      "enc" or "dec"; selects aes<p> / aes<p>last
#   inout  register holding the block, defaults to $inout0
#
# The emitted code compares $rounds with 11 and enters the unrolled
# sequence at the "<p>128" label (below), the "<p>192" label (equal) or
# at its very top (above). $key is advanced so that all three paths
# share the same tail with identical displacements.
#
# The AES emitters are resolved to code references once, so that an
# error raised while emitting an instruction propagates to the caller
# instead of being discarded by a string eval.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @bank=($rndkey1,$rndkey0);	# round key registers, used in turn
  my $turn=0;
  # One round with the pending key register, which is then refilled
  # from displacement $off relative to $key.
  my $step=sub
  { my $off=shift;
    my $rk=$bank[$turn];
	$turn^=1;
	$round->($inout,$rk);
	&$movekey	($rk,&QWP($off,$key));
  };

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));	# lea keeps flags for je
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	$step->(-0x40);
	$step->(-0x30);
    &set_label("${p}192");
	$step->(-0x20);
	$step->(-0x10);
    &set_label("${p}128");
	foreach my $slot (0..7)
	{   $step->(0x10*$slot);	}
	$round->($inout,$rndkey1);
    $final->($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
1891cb0ef41Sopenharmony_ci
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry points. Both are emitted from the same template,
# encrypt first. Each loads the block from inp into $inout0, reads the
# round count from offset 240 of key, runs the one-block cipher (inlined
# or via _aesni_[en|de]crypt1, depending on $inline), stores the result
# to out and clears the XMM registers it used before returning.
foreach my $dir ("enc","dec")
{
    # out-of-line one-block helper is needed only when not inlining
    &aesni_generate1($dir)	unless ($inline);

    &function_begin_B("${PREFIX}_${dir}rypt");
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));		# out
	if ($inline)
	{   &aesni_inline_generate1($dir);	}
	else
	{   &call	("_aesni_${dir}rypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
    &function_end_B("${PREFIX}_${dir}rypt");
}
2271cb0ef41Sopenharmony_ci
# _aesni_[en|de]cryptN are private interfaces, N denotes the interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. the subroutine's throughput is virtually the same
# as that of the non-interleaved subroutine [for up to 3 input blocks].
# This is why it originally made no sense to implement a 2x subroutine.
# But times change and it became appropriate to spend an extra 192
# bytes on a 2x subroutine on account of Atom Silvermont. For processors
# that can schedule aes[enc|dec] every cycle, the optimal interleave
# factor equals the latency of the corresponding instruction. 8x is
# optimal for * Bridge, but it's unfeasible to accommodate such an
# implementation in the XMM registers addressable in 32-bit mode and
# therefore a maximum of 6x is used instead...
2421cb0ef41Sopenharmony_ci
# aesni_generate2(p) emits the private function _aesni_<p>rypt2, which
# processes the two blocks in $inout0,$inout1 interleaved.
#
#   p   "enc" or "dec"; selects aes<p> / aes<p>last
#
# In the emitted code $rounds is scaled by 16 and negated so that it
# serves as a byte index that runs up to zero relative to $key, which
# is moved past the key schedule; the loop consumes two round keys per
# iteration. Both $key and $rounds are modified.
#
# The AES emitters are resolved to code references once, so that an
# error raised while emitting an instruction propagates to the caller
# instead of being discarded by a string eval.
sub aesni_generate2
{ my $p=shift;
  my @blk=($inout0,$inout1);
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my $loop="${p}2_loop";
  # apply one emitter with one round key to every block in turn
  my $all=sub
  { my ($op,$rk)=@_;
    foreach my $reg (@blk) { $op->($reg,$rk); }
  };

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label($loop);
	$all->($round,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$all->($round,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label($loop));
    $all->($round,$rndkey1);
    $all->($final,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}
2731cb0ef41Sopenharmony_ci
# aesni_generate3(p) emits the private function _aesni_<p>rypt3, which
# processes the three blocks in $inout0..$inout2 interleaved.
#
#   p   "enc" or "dec"; selects aes<p> / aes<p>last
#
# In the emitted code $rounds is scaled by 16 and negated so that it
# serves as a byte index that runs up to zero relative to $key, which
# is moved past the key schedule; the loop consumes two round keys per
# iteration. Both $key and $rounds are modified.
#
# The AES emitters are resolved to code references once, so that an
# error raised while emitting an instruction propagates to the caller
# instead of being discarded by a string eval.
sub aesni_generate3
{ my $p=shift;
  my @blk=($inout0,$inout1,$inout2);
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my $loop="${p}3_loop";
  # apply one emitter with one round key to every block in turn
  my $all=sub
  { my ($op,$rk)=@_;
    foreach my $reg (@blk) { $op->($reg,$rk); }
  };

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label($loop);
	$all->($round,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$all->($round,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label($loop));
    $all->($round,$rndkey1);
    $all->($final,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
3091cb0ef41Sopenharmony_ci
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] for 4 blocks, by ~30%. One can argue
# that 5x should have been implemented as well, but the improvement
# would be <20%, so it's not worth it...
# aesni_generate4(p) emits the private function _aesni_<p>rypt4, which
# processes the four blocks in $inout0..$inout3 interleaved.
#
#   p   "enc" or "dec"; selects aes<p> / aes<p>last
#
# In the emitted code $rounds is scaled by 16 and negated so that it
# serves as a byte index that runs up to zero relative to $key, which
# is moved past the key schedule; the loop consumes two round keys per
# iteration. Both $key and $rounds are modified.
#
# The AES emitters are resolved to code references once, so that an
# error raised while emitting an instruction propagates to the caller
# instead of being discarded by a string eval.
sub aesni_generate4
{ my $p=shift;
  my @blk=($inout0,$inout1,$inout2,$inout3);
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my $loop="${p}4_loop";
  # apply one emitter with one round key to every block in turn
  my $all=sub
  { my ($op,$rk)=@_;
    foreach my $reg (@blk) { $op->($reg,$rk); }
  };

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&data_byte	(0x0f,0x1f,0x40,0x00);	# 4-byte nop, presumably loop padding
	&add		($rounds,16);

    &set_label($loop);
	$all->($round,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$all->($round,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label($loop));

    $all->($round,$rndkey1);
    $all->($final,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
3561cb0ef41Sopenharmony_ci
# aesni_generate6(p) emits the private function _aesni_<p>rypt6, which
# processes the six blocks in $inout0..$inout5 interleaved.
#
#   p   "enc" or "dec"; selects aes<p> / aes<p>last
#
# The prologue already starts the first round on $inout0..$inout2
# while the remaining blocks are still being xor-ed with round key 0,
# then jumps into the middle of the loop at _aesni_<p>rypt6_inner to
# finish that round for $inout3..$inout5. _aesni_<p>rypt6_enter is
# additionally declared through static_label.
#
# In the emitted code $rounds is scaled by 16 and negated so that it
# serves as a byte index that runs up to zero relative to $key, which
# is moved past the key schedule. Both $key and $rounds are modified.
#
# The AES emitters are resolved to code references once, so that an
# error raised while emitting an instruction propagates to the caller
# instead of being discarded by a string eval.
sub aesni_generate6
{ my $p=shift;
  my @blk=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  # apply one emitter with one round key to the listed blocks in turn
  my $apply=sub
  { my ($op,$rk,@regs)=@_;
    foreach my $reg (@regs) { $op->($reg,$rk); }
  };

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	$round->($inout0,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	$round->($inout1,$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	$round->($inout2,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	$apply->($round,$rndkey1,@blk[0..2]);
    &set_label("_aesni_${p}rypt6_inner");
	$apply->($round,$rndkey1,@blk[3..5]);
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$apply->($round,$rndkey0,@blk);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));

    $apply->($round,$rndkey1,@blk);
    $apply->($final,$rndkey0,@blk);
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Emit the interleaved helpers in order of interleave factor. The
# decrypt flavour is always emitted, the encrypt flavour (first of
# each pair) only when $PREFIX is "aesni".
foreach my $generate (\&aesni_generate2,\&aesni_generate3,
		      \&aesni_generate4,\&aesni_generate6)
{
    $generate->("enc")	if ($PREFIX eq "aesni");
    $generate->("dec");
}
4231cb0ef41Sopenharmony_ci
4241cb0ef41Sopenharmony_ciif ($PREFIX eq "aesni") {
4251cb0ef41Sopenharmony_ci######################################################################
4261cb0ef41Sopenharmony_ci# void aesni_ecb_encrypt (const void *in, void *out,
4271cb0ef41Sopenharmony_ci#                         size_t length, const AES_KEY *key,
4281cb0ef41Sopenharmony_ci#                         int enc);
4291cb0ef41Sopenharmony_ci&function_begin("aesni_ecb_encrypt");
4301cb0ef41Sopenharmony_ci	&mov	($inp,&wparam(0));
4311cb0ef41Sopenharmony_ci	&mov	($out,&wparam(1));
4321cb0ef41Sopenharmony_ci	&mov	($len,&wparam(2));
4331cb0ef41Sopenharmony_ci	&mov	($key,&wparam(3));
4341cb0ef41Sopenharmony_ci	&mov	($rounds_,&wparam(4));
4351cb0ef41Sopenharmony_ci	&and	($len,-16);
4361cb0ef41Sopenharmony_ci	&jz	(&label("ecb_ret"));
4371cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
4381cb0ef41Sopenharmony_ci	&test	($rounds_,$rounds_);
4391cb0ef41Sopenharmony_ci	&jz	(&label("ecb_decrypt"));
4401cb0ef41Sopenharmony_ci
4411cb0ef41Sopenharmony_ci	&mov	($key_,$key);		# backup $key
4421cb0ef41Sopenharmony_ci	&mov	($rounds_,$rounds);	# backup $rounds
4431cb0ef41Sopenharmony_ci	&cmp	($len,0x60);
4441cb0ef41Sopenharmony_ci	&jb	(&label("ecb_enc_tail"));
4451cb0ef41Sopenharmony_ci
4461cb0ef41Sopenharmony_ci	&movdqu	($inout0,&QWP(0,$inp));
4471cb0ef41Sopenharmony_ci	&movdqu	($inout1,&QWP(0x10,$inp));
4481cb0ef41Sopenharmony_ci	&movdqu	($inout2,&QWP(0x20,$inp));
4491cb0ef41Sopenharmony_ci	&movdqu	($inout3,&QWP(0x30,$inp));
4501cb0ef41Sopenharmony_ci	&movdqu	($inout4,&QWP(0x40,$inp));
4511cb0ef41Sopenharmony_ci	&movdqu	($inout5,&QWP(0x50,$inp));
4521cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(0x60,$inp));
4531cb0ef41Sopenharmony_ci	&sub	($len,0x60);
4541cb0ef41Sopenharmony_ci	&jmp	(&label("ecb_enc_loop6_enter"));
4551cb0ef41Sopenharmony_ci
4561cb0ef41Sopenharmony_ci&set_label("ecb_enc_loop6",16);
4571cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
4581cb0ef41Sopenharmony_ci	&movdqu	($inout0,&QWP(0,$inp));
4591cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
4601cb0ef41Sopenharmony_ci	&movdqu	($inout1,&QWP(0x10,$inp));
4611cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
4621cb0ef41Sopenharmony_ci	&movdqu	($inout2,&QWP(0x20,$inp));
4631cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
4641cb0ef41Sopenharmony_ci	&movdqu	($inout3,&QWP(0x30,$inp));
4651cb0ef41Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
4661cb0ef41Sopenharmony_ci	&movdqu	($inout4,&QWP(0x40,$inp));
4671cb0ef41Sopenharmony_ci	&movups	(&QWP(0x50,$out),$inout5);
4681cb0ef41Sopenharmony_ci	&lea	($out,&DWP(0x60,$out));
4691cb0ef41Sopenharmony_ci	&movdqu	($inout5,&QWP(0x50,$inp));
4701cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(0x60,$inp));
4711cb0ef41Sopenharmony_ci&set_label("ecb_enc_loop6_enter");
4721cb0ef41Sopenharmony_ci
4731cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt6");
4741cb0ef41Sopenharmony_ci
4751cb0ef41Sopenharmony_ci	&mov	($key,$key_);		# restore $key
4761cb0ef41Sopenharmony_ci	&mov	($rounds,$rounds_);	# restore $rounds
4771cb0ef41Sopenharmony_ci	&sub	($len,0x60);
4781cb0ef41Sopenharmony_ci	&jnc	(&label("ecb_enc_loop6"));
4791cb0ef41Sopenharmony_ci
4801cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
4811cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
4821cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
4831cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
4841cb0ef41Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
4851cb0ef41Sopenharmony_ci	&movups	(&QWP(0x50,$out),$inout5);
4861cb0ef41Sopenharmony_ci	&lea	($out,&DWP(0x60,$out));
4871cb0ef41Sopenharmony_ci	&add	($len,0x60);
4881cb0ef41Sopenharmony_ci	&jz	(&label("ecb_ret"));
4891cb0ef41Sopenharmony_ci
4901cb0ef41Sopenharmony_ci&set_label("ecb_enc_tail");
4911cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));
4921cb0ef41Sopenharmony_ci	&cmp	($len,0x20);
4931cb0ef41Sopenharmony_ci	&jb	(&label("ecb_enc_one"));
4941cb0ef41Sopenharmony_ci	&movups	($inout1,&QWP(0x10,$inp));
4951cb0ef41Sopenharmony_ci	&je	(&label("ecb_enc_two"));
4961cb0ef41Sopenharmony_ci	&movups	($inout2,&QWP(0x20,$inp));
4971cb0ef41Sopenharmony_ci	&cmp	($len,0x40);
4981cb0ef41Sopenharmony_ci	&jb	(&label("ecb_enc_three"));
4991cb0ef41Sopenharmony_ci	&movups	($inout3,&QWP(0x30,$inp));
5001cb0ef41Sopenharmony_ci	&je	(&label("ecb_enc_four"));
5011cb0ef41Sopenharmony_ci	&movups	($inout4,&QWP(0x40,$inp));
5021cb0ef41Sopenharmony_ci	&xorps	($inout5,$inout5);
5031cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt6");
5041cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
5051cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
5061cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
5071cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
5081cb0ef41Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
5091cb0ef41Sopenharmony_ci	jmp	(&label("ecb_ret"));
5101cb0ef41Sopenharmony_ci
5111cb0ef41Sopenharmony_ci&set_label("ecb_enc_one",16);
5121cb0ef41Sopenharmony_ci	if ($inline)
5131cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
5141cb0ef41Sopenharmony_ci	else
5151cb0ef41Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
5161cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
5171cb0ef41Sopenharmony_ci	&jmp	(&label("ecb_ret"));
5181cb0ef41Sopenharmony_ci
5191cb0ef41Sopenharmony_ci&set_label("ecb_enc_two",16);
5201cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt2");
5211cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
5221cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
5231cb0ef41Sopenharmony_ci	&jmp	(&label("ecb_ret"));
5241cb0ef41Sopenharmony_ci
5251cb0ef41Sopenharmony_ci&set_label("ecb_enc_three",16);
5261cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt3");
5271cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
5281cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
5291cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
5301cb0ef41Sopenharmony_ci	&jmp	(&label("ecb_ret"));
5311cb0ef41Sopenharmony_ci
5321cb0ef41Sopenharmony_ci&set_label("ecb_enc_four",16);
5331cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt4");
5341cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
5351cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
5361cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
5371cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
5381cb0ef41Sopenharmony_ci	&jmp	(&label("ecb_ret"));
5391cb0ef41Sopenharmony_ci######################################################################
5401cb0ef41Sopenharmony_ci&set_label("ecb_decrypt",16);
5411cb0ef41Sopenharmony_ci	&mov	($key_,$key);		# backup $key
5421cb0ef41Sopenharmony_ci	&mov	($rounds_,$rounds);	# backup $rounds
5431cb0ef41Sopenharmony_ci	&cmp	($len,0x60);
5441cb0ef41Sopenharmony_ci	&jb	(&label("ecb_dec_tail"));
5451cb0ef41Sopenharmony_ci
5461cb0ef41Sopenharmony_ci	&movdqu	($inout0,&QWP(0,$inp));
5471cb0ef41Sopenharmony_ci	&movdqu	($inout1,&QWP(0x10,$inp));
5481cb0ef41Sopenharmony_ci	&movdqu	($inout2,&QWP(0x20,$inp));
5491cb0ef41Sopenharmony_ci	&movdqu	($inout3,&QWP(0x30,$inp));
5501cb0ef41Sopenharmony_ci	&movdqu	($inout4,&QWP(0x40,$inp));
5511cb0ef41Sopenharmony_ci	&movdqu	($inout5,&QWP(0x50,$inp));
5521cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(0x60,$inp));
5531cb0ef41Sopenharmony_ci	&sub	($len,0x60);
5541cb0ef41Sopenharmony_ci	&jmp	(&label("ecb_dec_loop6_enter"));
5551cb0ef41Sopenharmony_ci
5561cb0ef41Sopenharmony_ci&set_label("ecb_dec_loop6",16);
5571cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
5581cb0ef41Sopenharmony_ci	&movdqu	($inout0,&QWP(0,$inp));
5591cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
5601cb0ef41Sopenharmony_ci	&movdqu	($inout1,&QWP(0x10,$inp));
5611cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
5621cb0ef41Sopenharmony_ci	&movdqu	($inout2,&QWP(0x20,$inp));
5631cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
5641cb0ef41Sopenharmony_ci	&movdqu	($inout3,&QWP(0x30,$inp));
5651cb0ef41Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
5661cb0ef41Sopenharmony_ci	&movdqu	($inout4,&QWP(0x40,$inp));
5671cb0ef41Sopenharmony_ci	&movups	(&QWP(0x50,$out),$inout5);
5681cb0ef41Sopenharmony_ci	&lea	($out,&DWP(0x60,$out));
5691cb0ef41Sopenharmony_ci	&movdqu	($inout5,&QWP(0x50,$inp));
5701cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(0x60,$inp));
5711cb0ef41Sopenharmony_ci&set_label("ecb_dec_loop6_enter");
5721cb0ef41Sopenharmony_ci
5731cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt6");
5741cb0ef41Sopenharmony_ci
5751cb0ef41Sopenharmony_ci	&mov	($key,$key_);		# restore $key
5761cb0ef41Sopenharmony_ci	&mov	($rounds,$rounds_);	# restore $rounds
5771cb0ef41Sopenharmony_ci	&sub	($len,0x60);
5781cb0ef41Sopenharmony_ci	&jnc	(&label("ecb_dec_loop6"));
5791cb0ef41Sopenharmony_ci
5801cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
5811cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
5821cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
5831cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
5841cb0ef41Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
5851cb0ef41Sopenharmony_ci	&movups	(&QWP(0x50,$out),$inout5);
5861cb0ef41Sopenharmony_ci	&lea	($out,&DWP(0x60,$out));
5871cb0ef41Sopenharmony_ci	&add	($len,0x60);
5881cb0ef41Sopenharmony_ci	&jz	(&label("ecb_ret"));
5891cb0ef41Sopenharmony_ci
5901cb0ef41Sopenharmony_ci&set_label("ecb_dec_tail");
5911cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));
5921cb0ef41Sopenharmony_ci	&cmp	($len,0x20);
5931cb0ef41Sopenharmony_ci	&jb	(&label("ecb_dec_one"));
5941cb0ef41Sopenharmony_ci	&movups	($inout1,&QWP(0x10,$inp));
5951cb0ef41Sopenharmony_ci	&je	(&label("ecb_dec_two"));
5961cb0ef41Sopenharmony_ci	&movups	($inout2,&QWP(0x20,$inp));
5971cb0ef41Sopenharmony_ci	&cmp	($len,0x40);
5981cb0ef41Sopenharmony_ci	&jb	(&label("ecb_dec_three"));
5991cb0ef41Sopenharmony_ci	&movups	($inout3,&QWP(0x30,$inp));
6001cb0ef41Sopenharmony_ci	&je	(&label("ecb_dec_four"));
6011cb0ef41Sopenharmony_ci	&movups	($inout4,&QWP(0x40,$inp));
6021cb0ef41Sopenharmony_ci	&xorps	($inout5,$inout5);
6031cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt6");
6041cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
6051cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
6061cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
6071cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
6081cb0ef41Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
6091cb0ef41Sopenharmony_ci	&jmp	(&label("ecb_ret"));
6101cb0ef41Sopenharmony_ci
6111cb0ef41Sopenharmony_ci&set_label("ecb_dec_one",16);
6121cb0ef41Sopenharmony_ci	if ($inline)
6131cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
6141cb0ef41Sopenharmony_ci	else
6151cb0ef41Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
6161cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
6171cb0ef41Sopenharmony_ci	&jmp	(&label("ecb_ret"));
6181cb0ef41Sopenharmony_ci
6191cb0ef41Sopenharmony_ci&set_label("ecb_dec_two",16);
6201cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt2");
6211cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
6221cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
6231cb0ef41Sopenharmony_ci	&jmp	(&label("ecb_ret"));
6241cb0ef41Sopenharmony_ci
6251cb0ef41Sopenharmony_ci&set_label("ecb_dec_three",16);
6261cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt3");
6271cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
6281cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
6291cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
6301cb0ef41Sopenharmony_ci	&jmp	(&label("ecb_ret"));
6311cb0ef41Sopenharmony_ci
6321cb0ef41Sopenharmony_ci&set_label("ecb_dec_four",16);
6331cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt4");
6341cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
6351cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
6361cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
6371cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
6381cb0ef41Sopenharmony_ci
6391cb0ef41Sopenharmony_ci&set_label("ecb_ret");
6401cb0ef41Sopenharmony_ci	&pxor	("xmm0","xmm0");		# clear register bank
6411cb0ef41Sopenharmony_ci	&pxor	("xmm1","xmm1");
6421cb0ef41Sopenharmony_ci	&pxor	("xmm2","xmm2");
6431cb0ef41Sopenharmony_ci	&pxor	("xmm3","xmm3");
6441cb0ef41Sopenharmony_ci	&pxor	("xmm4","xmm4");
6451cb0ef41Sopenharmony_ci	&pxor	("xmm5","xmm5");
6461cb0ef41Sopenharmony_ci	&pxor	("xmm6","xmm6");
6471cb0ef41Sopenharmony_ci	&pxor	("xmm7","xmm7");
6481cb0ef41Sopenharmony_ci&function_end("aesni_ecb_encrypt");
6491cb0ef41Sopenharmony_ci
6501cb0ef41Sopenharmony_ci######################################################################
6511cb0ef41Sopenharmony_ci# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
6521cb0ef41Sopenharmony_ci#                         size_t blocks, const AES_KEY *key,
6531cb0ef41Sopenharmony_ci#                         const char *ivec,char *cmac);
6541cb0ef41Sopenharmony_ci#
6551cb0ef41Sopenharmony_ci# Handles only complete blocks, operates on a 64-bit counter and
6561cb0ef41Sopenharmony_ci# does not update *ivec! Nor does it finalize the CMAC value
6571cb0ef41Sopenharmony_ci# (see engine/eng_aesni.c for details).
6581cb0ef41Sopenharmony_ci#
6591cb0ef41Sopenharmony_ci{ my $cmac=$inout1;
# aesni_ccm64_encrypt_blocks(in, out, blocks, key, ivec, cmac)
#
# Emits a loop that, for every 16-byte block, runs two AES encryptions
# in lock-step with the same round keys: the counter block ($inout0)
# and the running MAC accumulator ($cmac, declared as an alias of
# $inout1 just above).  The block count in $len is decremented only
# after a block has been processed, so a count of zero is not handled
# here.
#
# Stack frame, relative to the 16-byte aligned %esp:
#	0	pshufb mask that reverses all 16 bytes
#	16	counter increment (1 in the low 64-bit lane)
#	48	saved %esp
6601cb0ef41Sopenharmony_ci&function_begin("aesni_ccm64_encrypt_blocks");
6611cb0ef41Sopenharmony_ci	&mov	($inp,&wparam(0));
6621cb0ef41Sopenharmony_ci	&mov	($out,&wparam(1));
6631cb0ef41Sopenharmony_ci	&mov	($len,&wparam(2));
6641cb0ef41Sopenharmony_ci	&mov	($key,&wparam(3));
	# $rounds_ and $rounds temporarily carry the ivec and cmac pointers
	# (arguments 4 and 5) until both vectors have been loaded.
6651cb0ef41Sopenharmony_ci	&mov	($rounds_,&wparam(4));
6661cb0ef41Sopenharmony_ci	&mov	($rounds,&wparam(5));
	# Remember the caller's %esp in $key_, then carve out an aligned
	# frame so that the movdqa/paddq accesses below are 16-byte aligned.
6671cb0ef41Sopenharmony_ci	&mov	($key_,"esp");
6681cb0ef41Sopenharmony_ci	&sub	("esp",60);
6691cb0ef41Sopenharmony_ci	&and	("esp",-16);			# align stack
6701cb0ef41Sopenharmony_ci	&mov	(&DWP(48,"esp"),$key_);
6711cb0ef41Sopenharmony_ci
6721cb0ef41Sopenharmony_ci	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
6731cb0ef41Sopenharmony_ci	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	# round count, read from offset 240 of the key schedule
6741cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
6751cb0ef41Sopenharmony_ci
6761cb0ef41Sopenharmony_ci	# compose byte-swap control mask for pshufb on stack
6771cb0ef41Sopenharmony_ci	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
6781cb0ef41Sopenharmony_ci	&mov	(&DWP(4,"esp"),0x08090a0b);
6791cb0ef41Sopenharmony_ci	&mov	(&DWP(8,"esp"),0x04050607);
6801cb0ef41Sopenharmony_ci	&mov	(&DWP(12,"esp"),0x00010203);
6811cb0ef41Sopenharmony_ci
6821cb0ef41Sopenharmony_ci	# compose counter increment vector on stack
6831cb0ef41Sopenharmony_ci	&mov	($rounds_,1);
6841cb0ef41Sopenharmony_ci	&xor	($key_,$key_);
6851cb0ef41Sopenharmony_ci	&mov	(&DWP(16,"esp"),$rounds_);
6861cb0ef41Sopenharmony_ci	&mov	(&DWP(20,"esp"),$key_);
6871cb0ef41Sopenharmony_ci	&mov	(&DWP(24,"esp"),$key_);
6881cb0ef41Sopenharmony_ci	&mov	(&DWP(28,"esp"),$key_);
6891cb0ef41Sopenharmony_ci
	# "Twist" the round count: $key is advanced to 32+16*rounds bytes
	# past the start of the schedule and $rounds_ becomes the negative
	# offset 16-16*rounds, so the inner loop can address round keys as
	# ($key,$rounds) and stop when the offset reaches zero.  $key_ keeps
	# the original schedule pointer.  $inout0 holds the counter block in
	# its original byte order, $ivec the byte-reversed copy that paddq
	# increments.
6901cb0ef41Sopenharmony_ci	&shl	($rounds,4);
6911cb0ef41Sopenharmony_ci	&mov	($rounds_,16);
6921cb0ef41Sopenharmony_ci	&lea	($key_,&DWP(0,$key));
6931cb0ef41Sopenharmony_ci	&movdqa	($inout3,&QWP(0,"esp"));
6941cb0ef41Sopenharmony_ci	&movdqa	($inout0,$ivec);
6951cb0ef41Sopenharmony_ci	&lea	($key,&DWP(32,$key,$rounds));
6961cb0ef41Sopenharmony_ci	&sub	($rounds_,$rounds);
6971cb0ef41Sopenharmony_ci	&pshufb	($ivec,$inout3);
6981cb0ef41Sopenharmony_ci
# Per-block loop.  Round 0 is folded in here: key[0] is xored into the
# counter block and, through $rndkey0, into input^cmac; key[1] and key[2]
# are preloaded for the inner loop.
6991cb0ef41Sopenharmony_ci&set_label("ccm64_enc_outer");
7001cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(0,$key_));
7011cb0ef41Sopenharmony_ci	&mov		($rounds,$rounds_);
7021cb0ef41Sopenharmony_ci	&movups		($in0,&QWP(0,$inp));
7031cb0ef41Sopenharmony_ci
7041cb0ef41Sopenharmony_ci	&xorps		($inout0,$rndkey0);
7051cb0ef41Sopenharmony_ci	&$movekey	($rndkey1,&QWP(16,$key_));
7061cb0ef41Sopenharmony_ci	&xorps		($rndkey0,$in0);
7071cb0ef41Sopenharmony_ci	&xorps		($cmac,$rndkey0);		# cmac^=inp
7081cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(32,$key_));
7091cb0ef41Sopenharmony_ci
# Two rounds per iteration on both lanes.  The jnz tests the result of
# the add; the round-key load in between is assumed to leave EFLAGS
# untouched ($movekey is defined outside this section).
7101cb0ef41Sopenharmony_ci&set_label("ccm64_enc2_loop");
7111cb0ef41Sopenharmony_ci	&aesenc		($inout0,$rndkey1);
7121cb0ef41Sopenharmony_ci	&aesenc		($cmac,$rndkey1);
7131cb0ef41Sopenharmony_ci	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
7141cb0ef41Sopenharmony_ci	&add		($rounds,32);
7151cb0ef41Sopenharmony_ci	&aesenc		($inout0,$rndkey0);
7161cb0ef41Sopenharmony_ci	&aesenc		($cmac,$rndkey0);
7171cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
7181cb0ef41Sopenharmony_ci	&jnz		(&label("ccm64_enc2_loop"));
	# Final two rounds, interleaved with the counter increment.  The dec
	# sets ZF for the jnz that closes the outer loop; only SSE
	# instructions and lea follow, none of which write EFLAGS.
7191cb0ef41Sopenharmony_ci	&aesenc		($inout0,$rndkey1);
7201cb0ef41Sopenharmony_ci	&aesenc		($cmac,$rndkey1);
7211cb0ef41Sopenharmony_ci	&paddq		($ivec,&QWP(16,"esp"));
7221cb0ef41Sopenharmony_ci	&dec		($len);
7231cb0ef41Sopenharmony_ci	&aesenclast	($inout0,$rndkey0);
7241cb0ef41Sopenharmony_ci	&aesenclast	($cmac,$rndkey0);
7251cb0ef41Sopenharmony_ci
	# Produce the output block and stage the next counter block: copy the
	# incremented $ivec and reverse it back to its original byte order.
7261cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16,$inp));
7271cb0ef41Sopenharmony_ci	&xorps	($in0,$inout0);			# inp^=E(ivec)
7281cb0ef41Sopenharmony_ci	&movdqa	($inout0,$ivec);
7291cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$in0);		# save output
7301cb0ef41Sopenharmony_ci	&pshufb	($inout0,$inout3);
7311cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16,$out));
7321cb0ef41Sopenharmony_ci	&jnz	(&label("ccm64_enc_outer"));
7331cb0ef41Sopenharmony_ci
	# Drop the frame, then write the updated MAC accumulator back through
	# the cmac pointer (argument 5).
	# NOTE(review): presumably wparam() is %esp-relative, which is why
	# %esp is restored before it is used again -- confirm against the
	# perlasm helper that defines it.
7341cb0ef41Sopenharmony_ci	&mov	("esp",&DWP(48,"esp"));
7351cb0ef41Sopenharmony_ci	&mov	($out,&wparam(5));
7361cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$cmac);
7371cb0ef41Sopenharmony_ci
7381cb0ef41Sopenharmony_ci	&pxor	("xmm0","xmm0");		# clear register bank
7391cb0ef41Sopenharmony_ci	&pxor	("xmm1","xmm1");
7401cb0ef41Sopenharmony_ci	&pxor	("xmm2","xmm2");
7411cb0ef41Sopenharmony_ci	&pxor	("xmm3","xmm3");
7421cb0ef41Sopenharmony_ci	&pxor	("xmm4","xmm4");
7431cb0ef41Sopenharmony_ci	&pxor	("xmm5","xmm5");
7441cb0ef41Sopenharmony_ci	&pxor	("xmm6","xmm6");
7451cb0ef41Sopenharmony_ci	&pxor	("xmm7","xmm7");
7461cb0ef41Sopenharmony_ci&function_end("aesni_ccm64_encrypt_blocks");
7471cb0ef41Sopenharmony_ci
# aesni_ccm64_decrypt_blocks(in, out, blocks, key, ivec, cmac)
#
# Decrypt direction of the CCM bulk routine.  The MAC is accumulated
# over the recovered plaintext, so the MAC lane ($cmac, declared as an
# alias of $inout1 just above) always runs one block behind the
# keystream lane ($inout0): each pass through ccm64_dec_outer outputs
# block i, then folds that block into the MAC while the counter for
# block i+1 is being encrypted.  The first keystream block is produced
# before the loop and the last MAC update after it.  The block count is
# tested only after a block has been written, so a count of zero is not
# handled here.
#
# Stack frame, relative to the 16-byte aligned %esp:
#	0	pshufb mask that reverses all 16 bytes
#	16	counter increment (1 in the low 64-bit lane)
#	48	saved %esp
7481cb0ef41Sopenharmony_ci&function_begin("aesni_ccm64_decrypt_blocks");
7491cb0ef41Sopenharmony_ci	&mov	($inp,&wparam(0));
7501cb0ef41Sopenharmony_ci	&mov	($out,&wparam(1));
7511cb0ef41Sopenharmony_ci	&mov	($len,&wparam(2));
7521cb0ef41Sopenharmony_ci	&mov	($key,&wparam(3));
	# $rounds_ and $rounds temporarily carry the ivec and cmac pointers
	# (arguments 4 and 5) until both vectors have been loaded.
7531cb0ef41Sopenharmony_ci	&mov	($rounds_,&wparam(4));
7541cb0ef41Sopenharmony_ci	&mov	($rounds,&wparam(5));
	# Remember the caller's %esp in $key_ and build an aligned frame.
7551cb0ef41Sopenharmony_ci	&mov	($key_,"esp");
7561cb0ef41Sopenharmony_ci	&sub	("esp",60);
7571cb0ef41Sopenharmony_ci	&and	("esp",-16);			# align stack
7581cb0ef41Sopenharmony_ci	&mov	(&DWP(48,"esp"),$key_);
7591cb0ef41Sopenharmony_ci
7601cb0ef41Sopenharmony_ci	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
7611cb0ef41Sopenharmony_ci	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	# round count, read from offset 240 of the key schedule
7621cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
7631cb0ef41Sopenharmony_ci
7641cb0ef41Sopenharmony_ci	# compose byte-swap control mask for pshufb on stack
7651cb0ef41Sopenharmony_ci	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
7661cb0ef41Sopenharmony_ci	&mov	(&DWP(4,"esp"),0x08090a0b);
7671cb0ef41Sopenharmony_ci	&mov	(&DWP(8,"esp"),0x04050607);
7681cb0ef41Sopenharmony_ci	&mov	(&DWP(12,"esp"),0x00010203);
7691cb0ef41Sopenharmony_ci
7701cb0ef41Sopenharmony_ci	# compose counter increment vector on stack
7711cb0ef41Sopenharmony_ci	&mov	($rounds_,1);
7721cb0ef41Sopenharmony_ci	&xor	($key_,$key_);
7731cb0ef41Sopenharmony_ci	&mov	(&DWP(16,"esp"),$rounds_);
7741cb0ef41Sopenharmony_ci	&mov	(&DWP(20,"esp"),$key_);
7751cb0ef41Sopenharmony_ci	&mov	(&DWP(24,"esp"),$key_);
7761cb0ef41Sopenharmony_ci	&mov	(&DWP(28,"esp"),$key_);
7771cb0ef41Sopenharmony_ci
7781cb0ef41Sopenharmony_ci	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
7791cb0ef41Sopenharmony_ci	&movdqa	($inout0,$ivec);
7801cb0ef41Sopenharmony_ci
	# Keep copies of the schedule pointer and round count; the
	# single-block encryption below is assumed to clobber $key/$rounds.
7811cb0ef41Sopenharmony_ci	&mov	($key_,$key);
7821cb0ef41Sopenharmony_ci	&mov	($rounds_,$rounds);
7831cb0ef41Sopenharmony_ci
	# $ivec becomes the byte-reversed counter that paddq increments.
	# The first counter block is encrypted up front; the helper
	# presumably operates on $inout0 (its default operand is not visible
	# in this section), which is what ccm64_dec_outer consumes.
7841cb0ef41Sopenharmony_ci	&pshufb	($ivec,$inout3);
7851cb0ef41Sopenharmony_ci	if ($inline)
7861cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
7871cb0ef41Sopenharmony_ci	else
7881cb0ef41Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
	# "Twist" the round count for the two-lane loop: $key points
	# 32+16*rounds bytes past the schedule start and $rounds_ holds the
	# negative offset 16-16*rounds; $key_ stays at the schedule start.
	# The first input block is loaded and the counter bumped meanwhile.
7891cb0ef41Sopenharmony_ci	&shl	($rounds_,4);
7901cb0ef41Sopenharmony_ci	&mov	($rounds,16);
7911cb0ef41Sopenharmony_ci	&movups	($in0,&QWP(0,$inp));		# load inp
7921cb0ef41Sopenharmony_ci	&paddq	($ivec,&QWP(16,"esp"));
7931cb0ef41Sopenharmony_ci	&lea	($inp,&QWP(16,$inp));
7941cb0ef41Sopenharmony_ci	&sub	($rounds,$rounds_);
7951cb0ef41Sopenharmony_ci	&lea	($key,&DWP(32,$key_,$rounds_));
7961cb0ef41Sopenharmony_ci	&mov	($rounds_,$rounds);
	# presumably skips whatever padding the aligned label below inserts
7971cb0ef41Sopenharmony_ci	&jmp	(&label("ccm64_dec_outer"));
7981cb0ef41Sopenharmony_ci
# Entered with E(counter) in $inout0 and the matching input block in
# $in0.  Writes the plaintext, stages the next counter block in $inout0
# and leaves the loop once the block count is exhausted.
7991cb0ef41Sopenharmony_ci&set_label("ccm64_dec_outer",16);
8001cb0ef41Sopenharmony_ci	&xorps	($in0,$inout0);			# inp ^= E(ivec)
8011cb0ef41Sopenharmony_ci	&movdqa	($inout0,$ivec);
8021cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$in0);		# save output
8031cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16,$out));
8041cb0ef41Sopenharmony_ci	&pshufb	($inout0,$inout3);
8051cb0ef41Sopenharmony_ci
8061cb0ef41Sopenharmony_ci	&sub	($len,1);
8071cb0ef41Sopenharmony_ci	&jz	(&label("ccm64_dec_break"));
8081cb0ef41Sopenharmony_ci
	# Round 0 for both lanes: key[0] is xored into the next counter block
	# and into the plaintext just produced, which is then folded into
	# the MAC accumulator; key[1] and key[2] are preloaded.
8091cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(0,$key_));
8101cb0ef41Sopenharmony_ci	&mov		($rounds,$rounds_);
8111cb0ef41Sopenharmony_ci	&$movekey	($rndkey1,&QWP(16,$key_));
8121cb0ef41Sopenharmony_ci	&xorps		($in0,$rndkey0);
8131cb0ef41Sopenharmony_ci	&xorps		($inout0,$rndkey0);
8141cb0ef41Sopenharmony_ci	&xorps		($cmac,$in0);		# cmac^=out
8151cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(32,$key_));
8161cb0ef41Sopenharmony_ci
# Two rounds per iteration on both lanes.  The jnz tests the result of
# the add; the round-key load in between is assumed to leave EFLAGS
# untouched ($movekey is defined outside this section).
8171cb0ef41Sopenharmony_ci&set_label("ccm64_dec2_loop");
8181cb0ef41Sopenharmony_ci	&aesenc		($inout0,$rndkey1);
8191cb0ef41Sopenharmony_ci	&aesenc		($cmac,$rndkey1);
8201cb0ef41Sopenharmony_ci	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
8211cb0ef41Sopenharmony_ci	&add		($rounds,32);
8221cb0ef41Sopenharmony_ci	&aesenc		($inout0,$rndkey0);
8231cb0ef41Sopenharmony_ci	&aesenc		($cmac,$rndkey0);
8241cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
8251cb0ef41Sopenharmony_ci	&jnz		(&label("ccm64_dec2_loop"));
	# Fetch the next input block and bump the counter while the last two
	# rounds complete.
8261cb0ef41Sopenharmony_ci	&movups		($in0,&QWP(0,$inp));	# load inp
8271cb0ef41Sopenharmony_ci	&paddq		($ivec,&QWP(16,"esp"));
8281cb0ef41Sopenharmony_ci	&aesenc		($inout0,$rndkey1);
8291cb0ef41Sopenharmony_ci	&aesenc		($cmac,$rndkey1);
8301cb0ef41Sopenharmony_ci	&aesenclast	($inout0,$rndkey0);
8311cb0ef41Sopenharmony_ci	&aesenclast	($cmac,$rndkey0);
8321cb0ef41Sopenharmony_ci	&lea		($inp,&QWP(16,$inp));
8331cb0ef41Sopenharmony_ci	&jmp	(&label("ccm64_dec_outer"));
8341cb0ef41Sopenharmony_ci
# Only the MAC lane is left for the final block: reload the untwisted
# schedule pointer and round count and run one single-block encryption
# for $cmac, with $in0 (the last plaintext block) as the extra operand.
# The inline helper presumably xors $in0 into $cmac before encrypting.
# NOTE(review): the non-inline branch hands $cmac to &call as a second
# argument and does not mention $in0 at all; whether &call and
# _aesni_encrypt1 honour that operand is not visible here -- verify that
# this path produces the same MAC as the inline one.
8351cb0ef41Sopenharmony_ci&set_label("ccm64_dec_break",16);
8361cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key_));
8371cb0ef41Sopenharmony_ci	&mov	($key,$key_);
8381cb0ef41Sopenharmony_ci	if ($inline)
8391cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
8401cb0ef41Sopenharmony_ci	else
8411cb0ef41Sopenharmony_ci	{   &call	("_aesni_encrypt1",$cmac);	}
8421cb0ef41Sopenharmony_ci
	# Drop the frame, then write the updated MAC accumulator back through
	# the cmac pointer (argument 5).
8431cb0ef41Sopenharmony_ci	&mov	("esp",&DWP(48,"esp"));
8441cb0ef41Sopenharmony_ci	&mov	($out,&wparam(5));
8451cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$cmac);
8461cb0ef41Sopenharmony_ci
8471cb0ef41Sopenharmony_ci	&pxor	("xmm0","xmm0");		# clear register bank
8481cb0ef41Sopenharmony_ci	&pxor	("xmm1","xmm1");
8491cb0ef41Sopenharmony_ci	&pxor	("xmm2","xmm2");
8501cb0ef41Sopenharmony_ci	&pxor	("xmm3","xmm3");
8511cb0ef41Sopenharmony_ci	&pxor	("xmm4","xmm4");
8521cb0ef41Sopenharmony_ci	&pxor	("xmm5","xmm5");
8531cb0ef41Sopenharmony_ci	&pxor	("xmm6","xmm6");
8541cb0ef41Sopenharmony_ci	&pxor	("xmm7","xmm7");
8551cb0ef41Sopenharmony_ci&function_end("aesni_ccm64_decrypt_blocks");
8561cb0ef41Sopenharmony_ci}
8571cb0ef41Sopenharmony_ci
8581cb0ef41Sopenharmony_ci######################################################################
8591cb0ef41Sopenharmony_ci# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
8601cb0ef41Sopenharmony_ci#                         size_t blocks, const AES_KEY *key,
8611cb0ef41Sopenharmony_ci#                         const char *ivec);
8621cb0ef41Sopenharmony_ci#
8631cb0ef41Sopenharmony_ci# Handles only complete blocks, operates on 32-bit counter and
8641cb0ef41Sopenharmony_ci# does not update *ivec! (see crypto/modes/ctr128.c for details)
8651cb0ef41Sopenharmony_ci#
8661cb0ef41Sopenharmony_ci# stack layout:
8671cb0ef41Sopenharmony_ci#	0	pshufb mask
8681cb0ef41Sopenharmony_ci#	16	vector addend: 0,6,6,6 (dwords listed highest first)
8691cb0ef41Sopenharmony_ci# 	32	counter-less ivec
8701cb0ef41Sopenharmony_ci#	48	1st triplet of counter vector
8711cb0ef41Sopenharmony_ci#	64	2nd triplet of counter vector
8721cb0ef41Sopenharmony_ci#	80	saved %esp
8731cb0ef41Sopenharmony_ci
# aesni_ctr32_encrypt_blocks(in, out, blocks, key, ivec)
#
# Emits the CTR-mode bulk routine: six counter blocks per main-loop
# iteration, dedicated tails for one to five remaining blocks, and a
# shortcut for a single-block call that encrypts *ivec as is.  The
# 32-bit counter is pulled out of the last dword of the ivec, kept in
# host byte order as two triplets on the stack, and byte-swapped back
# whenever counter blocks are assembled.
#
# A block count of zero is not special-cased: the compare chain sends it
# to ctr32_one, i.e. it is processed like a single block.  Callers are
# presumably expected to pass at least one block.
8741cb0ef41Sopenharmony_ci&function_begin("aesni_ctr32_encrypt_blocks");
8751cb0ef41Sopenharmony_ci	&mov	($inp,&wparam(0));
8761cb0ef41Sopenharmony_ci	&mov	($out,&wparam(1));
8771cb0ef41Sopenharmony_ci	&mov	($len,&wparam(2));
8781cb0ef41Sopenharmony_ci	&mov	($key,&wparam(3));
	# $rounds_ temporarily carries the ivec pointer (argument 4).
8791cb0ef41Sopenharmony_ci	&mov	($rounds_,&wparam(4));
	# Remember the caller's %esp in $key_ and build an aligned frame.
8801cb0ef41Sopenharmony_ci	&mov	($key_,"esp");
8811cb0ef41Sopenharmony_ci	&sub	("esp",88);
8821cb0ef41Sopenharmony_ci	&and	("esp",-16);			# align stack
8831cb0ef41Sopenharmony_ci	&mov	(&DWP(80,"esp"),$key_);
8841cb0ef41Sopenharmony_ci
	# Exactly one block: skip all counter-vector setup.
8851cb0ef41Sopenharmony_ci	&cmp	($len,1);
8861cb0ef41Sopenharmony_ci	&je	(&label("ctr32_one_shortcut"));
8871cb0ef41Sopenharmony_ci
8881cb0ef41Sopenharmony_ci	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
8891cb0ef41Sopenharmony_ci
8901cb0ef41Sopenharmony_ci	# compose byte-swap control mask for pshufb on stack
8911cb0ef41Sopenharmony_ci	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
8921cb0ef41Sopenharmony_ci	&mov	(&DWP(4,"esp"),0x08090a0b);
8931cb0ef41Sopenharmony_ci	&mov	(&DWP(8,"esp"),0x04050607);
8941cb0ef41Sopenharmony_ci	&mov	(&DWP(12,"esp"),0x00010203);
8951cb0ef41Sopenharmony_ci
8961cb0ef41Sopenharmony_ci	# compose counter increment vector on stack
	# (6 in the three low dwords, 0 in the top one; $key_ is zeroed here
	# and reused below to wipe the counter)
8971cb0ef41Sopenharmony_ci	&mov	($rounds,6);
8981cb0ef41Sopenharmony_ci	&xor	($key_,$key_);
8991cb0ef41Sopenharmony_ci	&mov	(&DWP(16,"esp"),$rounds);
9001cb0ef41Sopenharmony_ci	&mov	(&DWP(20,"esp"),$rounds);
9011cb0ef41Sopenharmony_ci	&mov	(&DWP(24,"esp"),$rounds);
9021cb0ef41Sopenharmony_ci	&mov	(&DWP(28,"esp"),$key_);
9031cb0ef41Sopenharmony_ci
9041cb0ef41Sopenharmony_ci	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
9051cb0ef41Sopenharmony_ci	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
9061cb0ef41Sopenharmony_ci
9071cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));	# key->rounds
9081cb0ef41Sopenharmony_ci
9091cb0ef41Sopenharmony_ci	# compose 2 vectors of 3x32-bit counters
	# After the bswap $rounds_ holds the counter in host order.  $rndkey0
	# receives counter+0..+2 and $rndkey1 counter+3..+5 in dwords 0..2,
	# with $key_ serving as scratch for the second triplet; dword 3 of
	# both stays zero.
9101cb0ef41Sopenharmony_ci	&bswap	($rounds_);
9111cb0ef41Sopenharmony_ci	&pxor	($rndkey0,$rndkey0);
9121cb0ef41Sopenharmony_ci	&pxor	($rndkey1,$rndkey1);
9131cb0ef41Sopenharmony_ci	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
9141cb0ef41Sopenharmony_ci	&pinsrd	($rndkey0,$rounds_,0);
9151cb0ef41Sopenharmony_ci	&lea	($key_,&DWP(3,$rounds_));
9161cb0ef41Sopenharmony_ci	&pinsrd	($rndkey1,$key_,0);
9171cb0ef41Sopenharmony_ci	&inc	($rounds_);
9181cb0ef41Sopenharmony_ci	&pinsrd	($rndkey0,$rounds_,1);
9191cb0ef41Sopenharmony_ci	&inc	($key_);
9201cb0ef41Sopenharmony_ci	&pinsrd	($rndkey1,$key_,1);
9211cb0ef41Sopenharmony_ci	&inc	($rounds_);
9221cb0ef41Sopenharmony_ci	&pinsrd	($rndkey0,$rounds_,2);
9231cb0ef41Sopenharmony_ci	&inc	($key_);
9241cb0ef41Sopenharmony_ci	&pinsrd	($rndkey1,$key_,2);
9251cb0ef41Sopenharmony_ci	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
9261cb0ef41Sopenharmony_ci	&pshufb	($rndkey0,$inout0);		# byte swap
9271cb0ef41Sopenharmony_ci	&movdqu	($inout4,&QWP(0,$key));		# key[0]
9281cb0ef41Sopenharmony_ci	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
9291cb0ef41Sopenharmony_ci	&pshufb	($rndkey1,$inout0);		# byte swap
9301cb0ef41Sopenharmony_ci
	# The full 16-byte reversal moved the counters to dwords 3..1, with
	# dword 0 zero.  Each pshufd picks one counter for dword 3 and fills
	# the lower dwords from source dword 0, i.e. with zeros.
9311cb0ef41Sopenharmony_ci	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
9321cb0ef41Sopenharmony_ci	&pshufd	($inout1,$rndkey0,2<<6);
	# Fewer than six blocks: straight to the tail, with $inout5 still
	# holding the plain counter-less ivec.
9331cb0ef41Sopenharmony_ci	&cmp	($len,6);
9341cb0ef41Sopenharmony_ci	&jb	(&label("ctr32_tail"));
	# Main-loop setup: fold key[0] into the counter-less ivec once, and
	# "twist" the round count ($key points 32+16*rounds bytes past the
	# schedule start, $rounds_ is the negative offset 16-16*rounds).
9351cb0ef41Sopenharmony_ci	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
9361cb0ef41Sopenharmony_ci	&shl	($rounds,4);
9371cb0ef41Sopenharmony_ci	&mov	($rounds_,16);
9381cb0ef41Sopenharmony_ci	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
9391cb0ef41Sopenharmony_ci	&mov	($key_,$key);			# backup $key
9401cb0ef41Sopenharmony_ci	&sub	($rounds_,$rounds);		# backup twisted $rounds
9411cb0ef41Sopenharmony_ci	&lea	($key,&DWP(32,$key,$rounds));
9421cb0ef41Sopenharmony_ci	&sub	($len,6);
9431cb0ef41Sopenharmony_ci	&jmp	(&label("ctr32_loop6"));
9441cb0ef41Sopenharmony_ci
# Six blocks per iteration.  On entry $inout0/$inout1 already hold the
# first two counters in their top dword, and $rndkey0/$rndkey1 the
# byte-swapped triplets from which the remaining four are taken.
9451cb0ef41Sopenharmony_ci&set_label("ctr32_loop6",16);
9461cb0ef41Sopenharmony_ci	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
	# Round 0 is the pxor with (counter-less ivec ^ key[0]); round 1 with
	# key[1] is interleaved with it, and key[2] is preloaded before
	# entering the shared six-lane body.
9471cb0ef41Sopenharmony_ci	&pshufd	($inout2,$rndkey0,1<<6);
9481cb0ef41Sopenharmony_ci	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
9491cb0ef41Sopenharmony_ci	&pshufd	($inout3,$rndkey1,3<<6);
9501cb0ef41Sopenharmony_ci	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
9511cb0ef41Sopenharmony_ci	&pshufd	($inout4,$rndkey1,2<<6);
9521cb0ef41Sopenharmony_ci	&pxor		($inout1,$rndkey0);
9531cb0ef41Sopenharmony_ci	&pshufd	($inout5,$rndkey1,1<<6);
9541cb0ef41Sopenharmony_ci	&$movekey	($rndkey1,&QWP(16,$key_));
9551cb0ef41Sopenharmony_ci	&pxor		($inout2,$rndkey0);
9561cb0ef41Sopenharmony_ci	&pxor		($inout3,$rndkey0);
9571cb0ef41Sopenharmony_ci	&aesenc		($inout0,$rndkey1);
9581cb0ef41Sopenharmony_ci	&pxor		($inout4,$rndkey0);
9591cb0ef41Sopenharmony_ci	&pxor		($inout5,$rndkey0);
9601cb0ef41Sopenharmony_ci	&aesenc		($inout1,$rndkey1);
9611cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(32,$key_));
9621cb0ef41Sopenharmony_ci	&mov		($rounds,$rounds_);
9631cb0ef41Sopenharmony_ci	&aesenc		($inout2,$rndkey1);
9641cb0ef41Sopenharmony_ci	&aesenc		($inout3,$rndkey1);
9651cb0ef41Sopenharmony_ci	&aesenc		($inout4,$rndkey1);
9661cb0ef41Sopenharmony_ci	&aesenc		($inout5,$rndkey1);
9671cb0ef41Sopenharmony_ci
	# Enter the six-lane encrypt routine past its own prologue (the label
	# is defined outside this section).
9681cb0ef41Sopenharmony_ci	&call		(&label("_aesni_encrypt6_enter"));
9691cb0ef41Sopenharmony_ci
	# xor the keystream into the first three input blocks and store them,
	# while pulling the increment and the second triplet back in.
9701cb0ef41Sopenharmony_ci	&movups	($rndkey1,&QWP(0,$inp));
9711cb0ef41Sopenharmony_ci	&movups	($rndkey0,&QWP(0x10,$inp));
9721cb0ef41Sopenharmony_ci	&xorps	($inout0,$rndkey1);
9731cb0ef41Sopenharmony_ci	&movups	($rndkey1,&QWP(0x20,$inp));
9741cb0ef41Sopenharmony_ci	&xorps	($inout1,$rndkey0);
9751cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
9761cb0ef41Sopenharmony_ci	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
9771cb0ef41Sopenharmony_ci	&xorps	($inout2,$rndkey1);
9781cb0ef41Sopenharmony_ci	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
9791cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
9801cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
9811cb0ef41Sopenharmony_ci
9821cb0ef41Sopenharmony_ci	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
9831cb0ef41Sopenharmony_ci	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
9841cb0ef41Sopenharmony_ci	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
9851cb0ef41Sopenharmony_ci
	# Remaining three blocks; $inout1/$inout2 are free again and serve as
	# input staging.  The updated triplets are saved in host order, then
	# byte-swapped for the next iteration.
9861cb0ef41Sopenharmony_ci	&movups	($inout1,&QWP(0x30,$inp));
9871cb0ef41Sopenharmony_ci	&movups	($inout2,&QWP(0x40,$inp));
9881cb0ef41Sopenharmony_ci	&xorps	($inout3,$inout1);
9891cb0ef41Sopenharmony_ci	&movups	($inout1,&QWP(0x50,$inp));
9901cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(0x60,$inp));
9911cb0ef41Sopenharmony_ci	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
9921cb0ef41Sopenharmony_ci	&pshufb	($rndkey0,$inout0);		# byte swap
9931cb0ef41Sopenharmony_ci	&xorps	($inout4,$inout2);
9941cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
9951cb0ef41Sopenharmony_ci	&xorps	($inout5,$inout1);
9961cb0ef41Sopenharmony_ci	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
9971cb0ef41Sopenharmony_ci	&pshufb	($rndkey1,$inout0);		# byte swap
9981cb0ef41Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
9991cb0ef41Sopenharmony_ci	&pshufd	($inout0,$rndkey0,3<<6);
10001cb0ef41Sopenharmony_ci	&movups	(&QWP(0x50,$out),$inout5);
10011cb0ef41Sopenharmony_ci	&lea	($out,&DWP(0x60,$out));
10021cb0ef41Sopenharmony_ci
10031cb0ef41Sopenharmony_ci	&pshufd	($inout1,$rndkey0,2<<6);
	# $len was pre-decremented by 6, so a borrow here means fewer than
	# six blocks remain.
10041cb0ef41Sopenharmony_ci	&sub	($len,6);
10051cb0ef41Sopenharmony_ci	&jnc	(&label("ctr32_loop6"));
10061cb0ef41Sopenharmony_ci
	# Undo the pre-decrement; nothing left means done.  Otherwise rebuild
	# the state the tail expects: plain counter-less ivec in $inout5
	# (stack copy xor key[0]), untwisted $key and the real round count.
10071cb0ef41Sopenharmony_ci	&add	($len,6);
10081cb0ef41Sopenharmony_ci	&jz	(&label("ctr32_ret"));
10091cb0ef41Sopenharmony_ci	&movdqu	($inout5,&QWP(0,$key_));
10101cb0ef41Sopenharmony_ci	&mov	($key,$key_);
10111cb0ef41Sopenharmony_ci	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
10121cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
10131cb0ef41Sopenharmony_ci
# Tail for fewer than six blocks.  Counter blocks are completed one at a
# time by or-ing in the counter-less ivec, and the count is dispatched
# in between.  Each cmp feeds both the jb and the following je; the
# pshufd/por between them do not write EFLAGS.
10141cb0ef41Sopenharmony_ci&set_label("ctr32_tail");
10151cb0ef41Sopenharmony_ci	&por	($inout0,$inout5);
10161cb0ef41Sopenharmony_ci	&cmp	($len,2);
10171cb0ef41Sopenharmony_ci	&jb	(&label("ctr32_one"));
10181cb0ef41Sopenharmony_ci
10191cb0ef41Sopenharmony_ci	&pshufd	($inout2,$rndkey0,1<<6);
10201cb0ef41Sopenharmony_ci	&por	($inout1,$inout5);
10211cb0ef41Sopenharmony_ci	&je	(&label("ctr32_two"));
10221cb0ef41Sopenharmony_ci
10231cb0ef41Sopenharmony_ci	&pshufd	($inout3,$rndkey1,3<<6);
10241cb0ef41Sopenharmony_ci	&por	($inout2,$inout5);
10251cb0ef41Sopenharmony_ci	&cmp	($len,4);
10261cb0ef41Sopenharmony_ci	&jb	(&label("ctr32_three"));
10271cb0ef41Sopenharmony_ci
10281cb0ef41Sopenharmony_ci	&pshufd	($inout4,$rndkey1,2<<6);
10291cb0ef41Sopenharmony_ci	&por	($inout3,$inout5);
10301cb0ef41Sopenharmony_ci	&je	(&label("ctr32_four"));
10311cb0ef41Sopenharmony_ci
	# Five blocks: the six-lane routine is used anyway; the sixth lane
	# ($inout5) carries no counter and its result is never stored.
10321cb0ef41Sopenharmony_ci	&por	($inout4,$inout5);
10331cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt6");
10341cb0ef41Sopenharmony_ci	&movups	($rndkey1,&QWP(0,$inp));
10351cb0ef41Sopenharmony_ci	&movups	($rndkey0,&QWP(0x10,$inp));
10361cb0ef41Sopenharmony_ci	&xorps	($inout0,$rndkey1);
10371cb0ef41Sopenharmony_ci	&movups	($rndkey1,&QWP(0x20,$inp));
10381cb0ef41Sopenharmony_ci	&xorps	($inout1,$rndkey0);
10391cb0ef41Sopenharmony_ci	&movups	($rndkey0,&QWP(0x30,$inp));
10401cb0ef41Sopenharmony_ci	&xorps	($inout2,$rndkey1);
10411cb0ef41Sopenharmony_ci	&movups	($rndkey1,&QWP(0x40,$inp));
10421cb0ef41Sopenharmony_ci	&xorps	($inout3,$rndkey0);
10431cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
10441cb0ef41Sopenharmony_ci	&xorps	($inout4,$rndkey1);
10451cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
10461cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
10471cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
10481cb0ef41Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
10491cb0ef41Sopenharmony_ci	&jmp	(&label("ctr32_ret"));
10501cb0ef41Sopenharmony_ci
# Single-block call: $rounds_ still holds the ivec pointer, so the whole
# ivec (counter included) is loaded and encrypted as is.
10511cb0ef41Sopenharmony_ci&set_label("ctr32_one_shortcut",16);
10521cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
10531cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
10541cb0ef41Sopenharmony_ci
# One block, reached either from the shortcut above or from the tail
# with the counter block already assembled in $inout0.
10551cb0ef41Sopenharmony_ci&set_label("ctr32_one");
10561cb0ef41Sopenharmony_ci	if ($inline)
10571cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
10581cb0ef41Sopenharmony_ci	else
10591cb0ef41Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
10601cb0ef41Sopenharmony_ci	&movups	($in0,&QWP(0,$inp));
10611cb0ef41Sopenharmony_ci	&xorps	($in0,$inout0);
10621cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$in0);
10631cb0ef41Sopenharmony_ci	&jmp	(&label("ctr32_ret"));
10641cb0ef41Sopenharmony_ci
# Two blocks; the unused $inout3/$inout4 stage the input.
10651cb0ef41Sopenharmony_ci&set_label("ctr32_two",16);
10661cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt2");
10671cb0ef41Sopenharmony_ci	&movups	($inout3,&QWP(0,$inp));
10681cb0ef41Sopenharmony_ci	&movups	($inout4,&QWP(0x10,$inp));
10691cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);
10701cb0ef41Sopenharmony_ci	&xorps	($inout1,$inout4);
10711cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
10721cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
10731cb0ef41Sopenharmony_ci	&jmp	(&label("ctr32_ret"));
10741cb0ef41Sopenharmony_ci
# Three blocks; $inout3..$inout5 stage the input.
10751cb0ef41Sopenharmony_ci&set_label("ctr32_three",16);
10761cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt3");
10771cb0ef41Sopenharmony_ci	&movups	($inout3,&QWP(0,$inp));
10781cb0ef41Sopenharmony_ci	&movups	($inout4,&QWP(0x10,$inp));
10791cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);
10801cb0ef41Sopenharmony_ci	&movups	($inout5,&QWP(0x20,$inp));
10811cb0ef41Sopenharmony_ci	&xorps	($inout1,$inout4);
10821cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
10831cb0ef41Sopenharmony_ci	&xorps	($inout2,$inout5);
10841cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
10851cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
10861cb0ef41Sopenharmony_ci	&jmp	(&label("ctr32_ret"));
10871cb0ef41Sopenharmony_ci
# Four blocks; $inout4, $inout5 and both round-key registers stage the
# input.  Falls through into ctr32_ret.
10881cb0ef41Sopenharmony_ci&set_label("ctr32_four",16);
10891cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt4");
10901cb0ef41Sopenharmony_ci	&movups	($inout4,&QWP(0,$inp));
10911cb0ef41Sopenharmony_ci	&movups	($inout5,&QWP(0x10,$inp));
10921cb0ef41Sopenharmony_ci	&movups	($rndkey1,&QWP(0x20,$inp));
10931cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout4);
10941cb0ef41Sopenharmony_ci	&movups	($rndkey0,&QWP(0x30,$inp));
10951cb0ef41Sopenharmony_ci	&xorps	($inout1,$inout5);
10961cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
10971cb0ef41Sopenharmony_ci	&xorps	($inout2,$rndkey1);
10981cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
10991cb0ef41Sopenharmony_ci	&xorps	($inout3,$rndkey0);
11001cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
11011cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
11021cb0ef41Sopenharmony_ci
# Common exit: wipe the XMM registers and the three stack slots that held
# the counter-less ivec and the counter triplets (xmm0 is already zero
# when it is stored), then restore the caller's %esp.
11031cb0ef41Sopenharmony_ci&set_label("ctr32_ret");
11041cb0ef41Sopenharmony_ci	&pxor	("xmm0","xmm0");		# clear register bank
11051cb0ef41Sopenharmony_ci	&pxor	("xmm1","xmm1");
11061cb0ef41Sopenharmony_ci	&pxor	("xmm2","xmm2");
11071cb0ef41Sopenharmony_ci	&pxor	("xmm3","xmm3");
11081cb0ef41Sopenharmony_ci	&pxor	("xmm4","xmm4");
11091cb0ef41Sopenharmony_ci	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
11101cb0ef41Sopenharmony_ci	&pxor	("xmm5","xmm5");
11111cb0ef41Sopenharmony_ci	&movdqa	(&QWP(48,"esp"),"xmm0");
11121cb0ef41Sopenharmony_ci	&pxor	("xmm6","xmm6");
11131cb0ef41Sopenharmony_ci	&movdqa	(&QWP(64,"esp"),"xmm0");
11141cb0ef41Sopenharmony_ci	&pxor	("xmm7","xmm7");
11151cb0ef41Sopenharmony_ci	&mov	("esp",&DWP(80,"esp"));
11161cb0ef41Sopenharmony_ci&function_end("aesni_ctr32_encrypt_blocks");
11171cb0ef41Sopenharmony_ci
11181cb0ef41Sopenharmony_ci######################################################################
11191cb0ef41Sopenharmony_ci# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
11201cb0ef41Sopenharmony_ci#	const AES_KEY *key1, const AES_KEY *key2,
11211cb0ef41Sopenharmony_ci#	const unsigned char iv[16]);
11221cb0ef41Sopenharmony_ci#
11231cb0ef41Sopenharmony_ci{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
11241cb0ef41Sopenharmony_ci
# aesni_xts_encrypt: emits the XTS encryption routine.
#
# Arguments, as read below through wparam(): 0 = input pointer,
# 1 = output pointer, 2 = length in bytes, 3 = key1 (used for the data
# blocks), 4 = key2 (used only to encrypt the tweak), 5 = pointer to the
# 16-byte clear-text tweak.
#
# Outline of the generated code:
#   1. Encrypt the clear-text tweak with key2; the result is the initial
#      tweak.
#   2. Carve a 16-byte aligned scratch frame: slots 16*0..16*5 hold
#      per-block tweaks, 16*6 holds the {0x87,0,1,0} mask, 16*7+0 the
#      saved length and 16*7+4 the caller's %esp.
#   3. Process six blocks per iteration while at least 96 bytes of whole
#      blocks remain, then one to five blocks in the tail.
#   4. If the length is not a multiple of 16, swap bytes between the last
#      full output block and the trailing partial input ("steal") and
#      encrypt that block again with the next tweak.
#   5. Zero the xmm registers and the tweak slots, then restore %esp.
#
# Tweak stepping, repeated throughout: pcmpgtd against a zeroed register
# turns every dword with its sign bit set into all-ones, pshufd 0x13
# moves dword 3 into dword 0 and dword 1 into dword 2, pand with the mask
# keeps 0x87 in dword 0 and 1 in dword 2, paddq doubles both 64-bit
# halves of the tweak, and pxor merges carry and residue back in.
#
# $tweak/$twtmp/$twres/$twmask alias $rndkey1/$rndkey0/$inout0/$inout1
# (see the enclosing "my" list), so they are only valid between uses of
# those registers for cipher work.
11251cb0ef41Sopenharmony_ci&function_begin("aesni_xts_encrypt");
11261cb0ef41Sopenharmony_ci	&mov	($key,&wparam(4));		# key2
11271cb0ef41Sopenharmony_ci	&mov	($inp,&wparam(5));		# clear-text tweak
11281cb0ef41Sopenharmony_ci
11291cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));	# key2->rounds
11301cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));
	# NOTE(review): both helpers are defined outside this section;
	# presumably they encrypt $inout0 in place using $key/$rounds.
11311cb0ef41Sopenharmony_ci	if ($inline)
11321cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
11331cb0ef41Sopenharmony_ci	else
11341cb0ef41Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
11351cb0ef41Sopenharmony_ci
	# From here on $inp/$key refer to the data stream and key1.
11361cb0ef41Sopenharmony_ci	&mov	($inp,&wparam(0));
11371cb0ef41Sopenharmony_ci	&mov	($out,&wparam(1));
11381cb0ef41Sopenharmony_ci	&mov	($len,&wparam(2));
11391cb0ef41Sopenharmony_ci	&mov	($key,&wparam(3));		# key1
11401cb0ef41Sopenharmony_ci
	# $key_ temporarily carries the caller's %esp until it is stored
	# in the frame below; afterwards it is reused as the key backup.
11411cb0ef41Sopenharmony_ci	&mov	($key_,"esp");
11421cb0ef41Sopenharmony_ci	&sub	("esp",16*7+8);
11431cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));	# key1->rounds
11441cb0ef41Sopenharmony_ci	&and	("esp",-16);			# align stack
11451cb0ef41Sopenharmony_ci
11461cb0ef41Sopenharmony_ci	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
11471cb0ef41Sopenharmony_ci	&mov	(&DWP(16*6+4,"esp"),0);
11481cb0ef41Sopenharmony_ci	&mov	(&DWP(16*6+8,"esp"),1);
11491cb0ef41Sopenharmony_ci	&mov	(&DWP(16*6+12,"esp"),0);
11501cb0ef41Sopenharmony_ci	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
11511cb0ef41Sopenharmony_ci	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
11521cb0ef41Sopenharmony_ci
	# The encrypted tweak left in $inout0 becomes the first tweak.
11531cb0ef41Sopenharmony_ci	&movdqa	($tweak,$inout0);
11541cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
11551cb0ef41Sopenharmony_ci	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
11561cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
11571cb0ef41Sopenharmony_ci
	# Work on whole blocks only; the carry from the subtraction means
	# fewer than six blocks are available.
11581cb0ef41Sopenharmony_ci	&and	($len,-16);
11591cb0ef41Sopenharmony_ci	&mov	($key_,$key);			# backup $key
11601cb0ef41Sopenharmony_ci	&mov	($rounds_,$rounds);		# backup $rounds
11611cb0ef41Sopenharmony_ci	&sub	($len,16*6);
11621cb0ef41Sopenharmony_ci	&jc	(&label("xts_enc_short"));
11631cb0ef41Sopenharmony_ci
	# Six-block path: $rounds_ = 16-16*rounds and $key = key1+32+16*rounds.
	# NOTE(review): presumably the form _aesni_encrypt6_enter expects;
	# that routine is outside this section.
11641cb0ef41Sopenharmony_ci	&shl	($rounds,4);
11651cb0ef41Sopenharmony_ci	&mov	($rounds_,16);
11661cb0ef41Sopenharmony_ci	&sub	($rounds_,$rounds);
11671cb0ef41Sopenharmony_ci	&lea	($key,&DWP(32,$key,$rounds));
11681cb0ef41Sopenharmony_ci	&jmp	(&label("xts_enc_loop6"));
11691cb0ef41Sopenharmony_ci
11701cb0ef41Sopenharmony_ci&set_label("xts_enc_loop6",16);
	# Unrolled at generation time: store tweaks 0..3 to the frame,
	# stepping the tweak after each store.
11711cb0ef41Sopenharmony_ci	for ($i=0;$i<4;$i++) {
11721cb0ef41Sopenharmony_ci	    &pshufd	($twres,$twtmp,0x13);
11731cb0ef41Sopenharmony_ci	    &pxor	($twtmp,$twtmp);
11741cb0ef41Sopenharmony_ci	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
11751cb0ef41Sopenharmony_ci	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
11761cb0ef41Sopenharmony_ci	    &pand	($twres,$twmask);	# isolate carry and residue
11771cb0ef41Sopenharmony_ci	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
11781cb0ef41Sopenharmony_ci	    &pxor	($tweak,$twres);
11791cb0ef41Sopenharmony_ci	}
	# $i is 4 here: tweak 4 goes to slot 16*4 and $i becomes 5; the
	# sixth tweak is assembled in $inout5 rather than $tweak.
11801cb0ef41Sopenharmony_ci	&pshufd	($inout5,$twtmp,0x13);
11811cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
11821cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
11831cb0ef41Sopenharmony_ci	 &$movekey	($rndkey0,&QWP(0,$key_));
11841cb0ef41Sopenharmony_ci	&pand	($inout5,$twmask);		# isolate carry and residue
11851cb0ef41Sopenharmony_ci	 &movups	($inout0,&QWP(0,$inp));	# load input
11861cb0ef41Sopenharmony_ci	&pxor	($inout5,$tweak);
11871cb0ef41Sopenharmony_ci
11881cb0ef41Sopenharmony_ci	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
11891cb0ef41Sopenharmony_ci	&mov	($rounds,$rounds_);		# restore $rounds
11901cb0ef41Sopenharmony_ci	&movdqu	($inout1,&QWP(16*1,$inp));
11911cb0ef41Sopenharmony_ci	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
11921cb0ef41Sopenharmony_ci	&movdqu	($inout2,&QWP(16*2,$inp));
11931cb0ef41Sopenharmony_ci	 &pxor		($inout1,$rndkey0);
11941cb0ef41Sopenharmony_ci	&movdqu	($inout3,&QWP(16*3,$inp));
11951cb0ef41Sopenharmony_ci	 &pxor		($inout2,$rndkey0);
11961cb0ef41Sopenharmony_ci	&movdqu	($inout4,&QWP(16*4,$inp));
11971cb0ef41Sopenharmony_ci	 &pxor		($inout3,$rndkey0);
	# The sixth input block is parked in $rndkey1 (the register that
	# also serves as $tweak) because $inout5 still holds the tweak.
11981cb0ef41Sopenharmony_ci	&movdqu	($rndkey1,&QWP(16*5,$inp));
11991cb0ef41Sopenharmony_ci	 &pxor		($inout4,$rndkey0);
12001cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16*6,$inp));
12011cb0ef41Sopenharmony_ci	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
12021cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
12031cb0ef41Sopenharmony_ci	&pxor	($inout5,$rndkey1);
12041cb0ef41Sopenharmony_ci
12051cb0ef41Sopenharmony_ci	 &$movekey	($rndkey1,&QWP(16,$key_));
12061cb0ef41Sopenharmony_ci	&pxor	($inout1,&QWP(16*1,"esp"));
12071cb0ef41Sopenharmony_ci	&pxor	($inout2,&QWP(16*2,"esp"));
12081cb0ef41Sopenharmony_ci	 &aesenc	($inout0,$rndkey1);
12091cb0ef41Sopenharmony_ci	&pxor	($inout3,&QWP(16*3,"esp"));
12101cb0ef41Sopenharmony_ci	&pxor	($inout4,&QWP(16*4,"esp"));
12111cb0ef41Sopenharmony_ci	 &aesenc	($inout1,$rndkey1);
12121cb0ef41Sopenharmony_ci	&pxor		($inout5,$rndkey0);
12131cb0ef41Sopenharmony_ci	 &$movekey	($rndkey0,&QWP(32,$key_));
12141cb0ef41Sopenharmony_ci	 &aesenc	($inout2,$rndkey1);
12151cb0ef41Sopenharmony_ci	 &aesenc	($inout3,$rndkey1);
12161cb0ef41Sopenharmony_ci	 &aesenc	($inout4,$rndkey1);
12171cb0ef41Sopenharmony_ci	 &aesenc	($inout5,$rndkey1);
12181cb0ef41Sopenharmony_ci	&call		(&label("_aesni_encrypt6_enter"));
12191cb0ef41Sopenharmony_ci
	# Post-whitening and stores are interleaved with the first steps
	# of the next tweak update (the lines indented with spaces).
12201cb0ef41Sopenharmony_ci	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
12211cb0ef41Sopenharmony_ci       &pxor	($twtmp,$twtmp);
12221cb0ef41Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
12231cb0ef41Sopenharmony_ci       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
12241cb0ef41Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
12251cb0ef41Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
12261cb0ef41Sopenharmony_ci	&xorps	($inout2,&QWP(16*2,"esp"));
12271cb0ef41Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
12281cb0ef41Sopenharmony_ci	&xorps	($inout3,&QWP(16*3,"esp"));
12291cb0ef41Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
12301cb0ef41Sopenharmony_ci	&xorps	($inout4,&QWP(16*4,"esp"));
12311cb0ef41Sopenharmony_ci	&movups	(&QWP(16*3,$out),$inout3);
12321cb0ef41Sopenharmony_ci	&xorps	($inout5,$tweak);
12331cb0ef41Sopenharmony_ci	&movups	(&QWP(16*4,$out),$inout4);
12341cb0ef41Sopenharmony_ci       &pshufd	($twres,$twtmp,0x13);
12351cb0ef41Sopenharmony_ci	&movups	(&QWP(16*5,$out),$inout5);
12361cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16*6,$out));
	# $twmask shares a register with $inout1, so reload it from the frame.
12371cb0ef41Sopenharmony_ci       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
12381cb0ef41Sopenharmony_ci
12391cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
12401cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
12411cb0ef41Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
12421cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
12431cb0ef41Sopenharmony_ci	&pxor	($tweak,$twres);
12441cb0ef41Sopenharmony_ci
12451cb0ef41Sopenharmony_ci	&sub	($len,16*6);
12461cb0ef41Sopenharmony_ci	&jnc	(&label("xts_enc_loop6"));
12471cb0ef41Sopenharmony_ci
	# Leaving the six-block loop: undo the $key/$rounds adjustments
	# made before the loop so the tail sees plain key1 values.
12481cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
12491cb0ef41Sopenharmony_ci	&mov	($key,$key_);			# restore $key
12501cb0ef41Sopenharmony_ci	&mov	($rounds_,$rounds);
12511cb0ef41Sopenharmony_ci
12521cb0ef41Sopenharmony_ci&set_label("xts_enc_short");
	# $len is now the number of remaining whole-block bytes (0..80).
12531cb0ef41Sopenharmony_ci	&add	($len,16*6);
12541cb0ef41Sopenharmony_ci	&jz	(&label("xts_enc_done6x"));
12551cb0ef41Sopenharmony_ci
	# Tail dispatch. Each cmp result is consumed twice (jb, then a
	# later je); only SSE instructions sit in between and they leave
	# EFLAGS untouched.
12561cb0ef41Sopenharmony_ci	&movdqa	($inout3,$tweak);		# put aside previous tweak
12571cb0ef41Sopenharmony_ci	&cmp	($len,0x20);
12581cb0ef41Sopenharmony_ci	&jb	(&label("xts_enc_one"));
12591cb0ef41Sopenharmony_ci
12601cb0ef41Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
12611cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
12621cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
12631cb0ef41Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
12641cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
12651cb0ef41Sopenharmony_ci	&pxor	($tweak,$twres);
12661cb0ef41Sopenharmony_ci	&je	(&label("xts_enc_two"));
12671cb0ef41Sopenharmony_ci
12681cb0ef41Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
12691cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
12701cb0ef41Sopenharmony_ci	&movdqa	($inout4,$tweak);		# put aside previous tweak
12711cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
12721cb0ef41Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
12731cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
12741cb0ef41Sopenharmony_ci	&pxor	($tweak,$twres);
12751cb0ef41Sopenharmony_ci	&cmp	($len,0x40);
12761cb0ef41Sopenharmony_ci	&jb	(&label("xts_enc_three"));
12771cb0ef41Sopenharmony_ci
12781cb0ef41Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
12791cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
12801cb0ef41Sopenharmony_ci	&movdqa	($inout5,$tweak);		# put aside previous tweak
12811cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
12821cb0ef41Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
12831cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
12841cb0ef41Sopenharmony_ci	&pxor	($tweak,$twres);
12851cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*0,"esp"),$inout3);
12861cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*1,"esp"),$inout4);
12871cb0ef41Sopenharmony_ci	&je	(&label("xts_enc_four"));
12881cb0ef41Sopenharmony_ci
	# Five blocks remain: tweaks 0..3 live in the frame, the fifth is
	# built in $inout5 and saved to slot 16*4.
12891cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*2,"esp"),$inout5);
12901cb0ef41Sopenharmony_ci	&pshufd	($inout5,$twtmp,0x13);
12911cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*3,"esp"),$tweak);
12921cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
12931cb0ef41Sopenharmony_ci	&pand	($inout5,$twmask);		# isolate carry and residue
12941cb0ef41Sopenharmony_ci	&pxor	($inout5,$tweak);
12951cb0ef41Sopenharmony_ci
12961cb0ef41Sopenharmony_ci	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
12971cb0ef41Sopenharmony_ci	&movdqu	($inout1,&QWP(16*1,$inp));
12981cb0ef41Sopenharmony_ci	&movdqu	($inout2,&QWP(16*2,$inp));
12991cb0ef41Sopenharmony_ci	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
13001cb0ef41Sopenharmony_ci	&movdqu	($inout3,&QWP(16*3,$inp));
13011cb0ef41Sopenharmony_ci	&pxor	($inout1,&QWP(16*1,"esp"));
13021cb0ef41Sopenharmony_ci	&movdqu	($inout4,&QWP(16*4,$inp));
13031cb0ef41Sopenharmony_ci	&pxor	($inout2,&QWP(16*2,"esp"));
13041cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16*5,$inp));
13051cb0ef41Sopenharmony_ci	&pxor	($inout3,&QWP(16*3,"esp"));
13061cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
13071cb0ef41Sopenharmony_ci	&pxor	($inout4,$inout5);
13081cb0ef41Sopenharmony_ci
	# The six-lane routine is reused for five blocks; whatever it
	# leaves in $inout5 is never written to the output.
13091cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt6");
13101cb0ef41Sopenharmony_ci
13111cb0ef41Sopenharmony_ci	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
13121cb0ef41Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
13131cb0ef41Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
13141cb0ef41Sopenharmony_ci	&xorps	($inout2,&QWP(16*2,"esp"));
13151cb0ef41Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
13161cb0ef41Sopenharmony_ci	&xorps	($inout3,&QWP(16*3,"esp"));
13171cb0ef41Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
13181cb0ef41Sopenharmony_ci	&xorps	($inout4,$tweak);
13191cb0ef41Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
13201cb0ef41Sopenharmony_ci	&movups	(&QWP(16*3,$out),$inout3);
13211cb0ef41Sopenharmony_ci	&movups	(&QWP(16*4,$out),$inout4);
13221cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16*5,$out));
13231cb0ef41Sopenharmony_ci	&jmp	(&label("xts_enc_done"));
13241cb0ef41Sopenharmony_ci
	# One block remains; its tweak was put aside in $inout3.
13251cb0ef41Sopenharmony_ci&set_label("xts_enc_one",16);
13261cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
13271cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16*1,$inp));
13281cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
13291cb0ef41Sopenharmony_ci	if ($inline)
13301cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
13311cb0ef41Sopenharmony_ci	else
13321cb0ef41Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
13331cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
13341cb0ef41Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
13351cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16*1,$out));
13361cb0ef41Sopenharmony_ci
13371cb0ef41Sopenharmony_ci	&movdqa	($tweak,$inout3);		# last tweak
13381cb0ef41Sopenharmony_ci	&jmp	(&label("xts_enc_done"));
13391cb0ef41Sopenharmony_ci
	# Two blocks remain; tweaks are held in $inout3 and $inout4.
13401cb0ef41Sopenharmony_ci&set_label("xts_enc_two",16);
13411cb0ef41Sopenharmony_ci	&movaps	($inout4,$tweak);		# put aside last tweak
13421cb0ef41Sopenharmony_ci
13431cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
13441cb0ef41Sopenharmony_ci	&movups	($inout1,&QWP(16*1,$inp));
13451cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16*2,$inp));
13461cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
13471cb0ef41Sopenharmony_ci	&xorps	($inout1,$inout4);
13481cb0ef41Sopenharmony_ci
13491cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt2");
13501cb0ef41Sopenharmony_ci
13511cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
13521cb0ef41Sopenharmony_ci	&xorps	($inout1,$inout4);
13531cb0ef41Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
13541cb0ef41Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
13551cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16*2,$out));
13561cb0ef41Sopenharmony_ci
13571cb0ef41Sopenharmony_ci	&movdqa	($tweak,$inout4);		# last tweak
13581cb0ef41Sopenharmony_ci	&jmp	(&label("xts_enc_done"));
13591cb0ef41Sopenharmony_ci
	# Three blocks remain; tweaks are held in $inout3..$inout5.
13601cb0ef41Sopenharmony_ci&set_label("xts_enc_three",16);
13611cb0ef41Sopenharmony_ci	&movaps	($inout5,$tweak);		# put aside last tweak
13621cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
13631cb0ef41Sopenharmony_ci	&movups	($inout1,&QWP(16*1,$inp));
13641cb0ef41Sopenharmony_ci	&movups	($inout2,&QWP(16*2,$inp));
13651cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16*3,$inp));
13661cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
13671cb0ef41Sopenharmony_ci	&xorps	($inout1,$inout4);
13681cb0ef41Sopenharmony_ci	&xorps	($inout2,$inout5);
13691cb0ef41Sopenharmony_ci
13701cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt3");
13711cb0ef41Sopenharmony_ci
13721cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
13731cb0ef41Sopenharmony_ci	&xorps	($inout1,$inout4);
13741cb0ef41Sopenharmony_ci	&xorps	($inout2,$inout5);
13751cb0ef41Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
13761cb0ef41Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
13771cb0ef41Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
13781cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16*3,$out));
13791cb0ef41Sopenharmony_ci
13801cb0ef41Sopenharmony_ci	&movdqa	($tweak,$inout5);		# last tweak
13811cb0ef41Sopenharmony_ci	&jmp	(&label("xts_enc_done"));
13821cb0ef41Sopenharmony_ci
	# Four blocks remain; tweaks 0 and 1 were stored in the frame
	# before the branch, tweaks 2 and 3 sit in $inout5 and $inout4.
13831cb0ef41Sopenharmony_ci&set_label("xts_enc_four",16);
13841cb0ef41Sopenharmony_ci	&movaps	($inout4,$tweak);		# put aside last tweak
13851cb0ef41Sopenharmony_ci
13861cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
13871cb0ef41Sopenharmony_ci	&movups	($inout1,&QWP(16*1,$inp));
13881cb0ef41Sopenharmony_ci	&movups	($inout2,&QWP(16*2,$inp));
13891cb0ef41Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
13901cb0ef41Sopenharmony_ci	&movups	($inout3,&QWP(16*3,$inp));
13911cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16*4,$inp));
13921cb0ef41Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
13931cb0ef41Sopenharmony_ci	&xorps	($inout2,$inout5);
13941cb0ef41Sopenharmony_ci	&xorps	($inout3,$inout4);
13951cb0ef41Sopenharmony_ci
13961cb0ef41Sopenharmony_ci	&call	("_aesni_encrypt4");
13971cb0ef41Sopenharmony_ci
13981cb0ef41Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
13991cb0ef41Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
14001cb0ef41Sopenharmony_ci	&xorps	($inout2,$inout5);
14011cb0ef41Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
14021cb0ef41Sopenharmony_ci	&xorps	($inout3,$inout4);
14031cb0ef41Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
14041cb0ef41Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
14051cb0ef41Sopenharmony_ci	&movups	(&QWP(16*3,$out),$inout3);
14061cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16*4,$out));
14071cb0ef41Sopenharmony_ci
14081cb0ef41Sopenharmony_ci	&movdqa	($tweak,$inout4);		# last tweak
14091cb0ef41Sopenharmony_ci	&jmp	(&label("xts_enc_done"));
14101cb0ef41Sopenharmony_ci
	# Reached when no whole blocks remain for the tail; $tweak was
	# already stepped, so it can be used for stealing as it is.
14111cb0ef41Sopenharmony_ci&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
14121cb0ef41Sopenharmony_ci	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
14131cb0ef41Sopenharmony_ci	&and	($len,15);
14141cb0ef41Sopenharmony_ci	&jz	(&label("xts_enc_ret"));
14151cb0ef41Sopenharmony_ci	&movdqa	($inout3,$tweak);
14161cb0ef41Sopenharmony_ci	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
14171cb0ef41Sopenharmony_ci	&jmp	(&label("xts_enc_steal"));
14181cb0ef41Sopenharmony_ci
	# Reached from the 1..5 block tails with $tweak holding the last
	# tweak used; step it once more into $inout3 if a partial block
	# is left over.
14191cb0ef41Sopenharmony_ci&set_label("xts_enc_done",16);
14201cb0ef41Sopenharmony_ci	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
14211cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
14221cb0ef41Sopenharmony_ci	&and	($len,15);
14231cb0ef41Sopenharmony_ci	&jz	(&label("xts_enc_ret"));
14241cb0ef41Sopenharmony_ci
14251cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
14261cb0ef41Sopenharmony_ci	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
14271cb0ef41Sopenharmony_ci	&pshufd	($inout3,$twtmp,0x13);
14281cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
14291cb0ef41Sopenharmony_ci	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
14301cb0ef41Sopenharmony_ci	&pxor	($inout3,$tweak);
14311cb0ef41Sopenharmony_ci
	# Byte-wise swap, $len%16 times: the next input byte replaces a
	# byte of the last full output block (at $out-16), and the byte
	# it displaces is written to the partial output position.
	# $rounds and $key are used as byte scratch here and are restored
	# from their backups afterwards.
14321cb0ef41Sopenharmony_ci&set_label("xts_enc_steal");
14331cb0ef41Sopenharmony_ci	&movz	($rounds,&BP(0,$inp));
14341cb0ef41Sopenharmony_ci	&movz	($key,&BP(-16,$out));
14351cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(1,$inp));
14361cb0ef41Sopenharmony_ci	&mov	(&BP(-16,$out),&LB($rounds));
14371cb0ef41Sopenharmony_ci	&mov	(&BP(0,$out),&LB($key));
14381cb0ef41Sopenharmony_ci	&lea	($out,&DWP(1,$out));
14391cb0ef41Sopenharmony_ci	&sub	($len,1);
14401cb0ef41Sopenharmony_ci	&jnz	(&label("xts_enc_steal"));
14411cb0ef41Sopenharmony_ci
14421cb0ef41Sopenharmony_ci	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
14431cb0ef41Sopenharmony_ci	&mov	($key,$key_);			# restore $key
14441cb0ef41Sopenharmony_ci	&mov	($rounds,$rounds_);		# restore $rounds
14451cb0ef41Sopenharmony_ci
	# Encrypt the patched block in place with the tweak in $inout3.
14461cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(-16,$out));	# load input
14471cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
14481cb0ef41Sopenharmony_ci	if ($inline)
14491cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
14501cb0ef41Sopenharmony_ci	else
14511cb0ef41Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
14521cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
14531cb0ef41Sopenharmony_ci	&movups	(&QWP(-16,$out),$inout0);	# write output
14541cb0ef41Sopenharmony_ci
	# Common exit: zero xmm0-xmm7 and the six tweak slots (16*0..16*5),
	# then reload the caller's %esp saved at 16*7+4.
14551cb0ef41Sopenharmony_ci&set_label("xts_enc_ret");
14561cb0ef41Sopenharmony_ci	&pxor	("xmm0","xmm0");		# clear register bank
14571cb0ef41Sopenharmony_ci	&pxor	("xmm1","xmm1");
14581cb0ef41Sopenharmony_ci	&pxor	("xmm2","xmm2");
14591cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
14601cb0ef41Sopenharmony_ci	&pxor	("xmm3","xmm3");
14611cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*1,"esp"),"xmm0");
14621cb0ef41Sopenharmony_ci	&pxor	("xmm4","xmm4");
14631cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*2,"esp"),"xmm0");
14641cb0ef41Sopenharmony_ci	&pxor	("xmm5","xmm5");
14651cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*3,"esp"),"xmm0");
14661cb0ef41Sopenharmony_ci	&pxor	("xmm6","xmm6");
14671cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*4,"esp"),"xmm0");
14681cb0ef41Sopenharmony_ci	&pxor	("xmm7","xmm7");
14691cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*5,"esp"),"xmm0");
14701cb0ef41Sopenharmony_ci	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
14711cb0ef41Sopenharmony_ci&function_end("aesni_xts_encrypt");
14721cb0ef41Sopenharmony_ci
14731cb0ef41Sopenharmony_ci&function_begin("aesni_xts_decrypt");
14741cb0ef41Sopenharmony_ci	&mov	($key,&wparam(4));		# key2
14751cb0ef41Sopenharmony_ci	&mov	($inp,&wparam(5));		# clear-text tweak
14761cb0ef41Sopenharmony_ci
14771cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));	# key2->rounds
14781cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));
14791cb0ef41Sopenharmony_ci	if ($inline)
14801cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
14811cb0ef41Sopenharmony_ci	else
14821cb0ef41Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
14831cb0ef41Sopenharmony_ci
14841cb0ef41Sopenharmony_ci	&mov	($inp,&wparam(0));
14851cb0ef41Sopenharmony_ci	&mov	($out,&wparam(1));
14861cb0ef41Sopenharmony_ci	&mov	($len,&wparam(2));
14871cb0ef41Sopenharmony_ci	&mov	($key,&wparam(3));		# key1
14881cb0ef41Sopenharmony_ci
14891cb0ef41Sopenharmony_ci	&mov	($key_,"esp");
14901cb0ef41Sopenharmony_ci	&sub	("esp",16*7+8);
14911cb0ef41Sopenharmony_ci	&and	("esp",-16);			# align stack
14921cb0ef41Sopenharmony_ci
14931cb0ef41Sopenharmony_ci	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
14941cb0ef41Sopenharmony_ci	&test	($len,15);
14951cb0ef41Sopenharmony_ci	&setnz	(&LB($rounds_));
14961cb0ef41Sopenharmony_ci	&shl	($rounds_,4);
14971cb0ef41Sopenharmony_ci	&sub	($len,$rounds_);
14981cb0ef41Sopenharmony_ci
14991cb0ef41Sopenharmony_ci	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
15001cb0ef41Sopenharmony_ci	&mov	(&DWP(16*6+4,"esp"),0);
15011cb0ef41Sopenharmony_ci	&mov	(&DWP(16*6+8,"esp"),1);
15021cb0ef41Sopenharmony_ci	&mov	(&DWP(16*6+12,"esp"),0);
15031cb0ef41Sopenharmony_ci	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
15041cb0ef41Sopenharmony_ci	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
15051cb0ef41Sopenharmony_ci
15061cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));	# key1->rounds
15071cb0ef41Sopenharmony_ci	&mov	($key_,$key);			# backup $key
15081cb0ef41Sopenharmony_ci	&mov	($rounds_,$rounds);		# backup $rounds
15091cb0ef41Sopenharmony_ci
15101cb0ef41Sopenharmony_ci	&movdqa	($tweak,$inout0);
15111cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
15121cb0ef41Sopenharmony_ci	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
15131cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
15141cb0ef41Sopenharmony_ci
15151cb0ef41Sopenharmony_ci	&and	($len,-16);
15161cb0ef41Sopenharmony_ci	&sub	($len,16*6);
15171cb0ef41Sopenharmony_ci	&jc	(&label("xts_dec_short"));
15181cb0ef41Sopenharmony_ci
15191cb0ef41Sopenharmony_ci	&shl	($rounds,4);
15201cb0ef41Sopenharmony_ci	&mov	($rounds_,16);
15211cb0ef41Sopenharmony_ci	&sub	($rounds_,$rounds);
15221cb0ef41Sopenharmony_ci	&lea	($key,&DWP(32,$key,$rounds));
15231cb0ef41Sopenharmony_ci	&jmp	(&label("xts_dec_loop6"));
15241cb0ef41Sopenharmony_ci
15251cb0ef41Sopenharmony_ci&set_label("xts_dec_loop6",16);
15261cb0ef41Sopenharmony_ci	for ($i=0;$i<4;$i++) {
15271cb0ef41Sopenharmony_ci	    &pshufd	($twres,$twtmp,0x13);
15281cb0ef41Sopenharmony_ci	    &pxor	($twtmp,$twtmp);
15291cb0ef41Sopenharmony_ci	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
15301cb0ef41Sopenharmony_ci	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
15311cb0ef41Sopenharmony_ci	    &pand	($twres,$twmask);	# isolate carry and residue
15321cb0ef41Sopenharmony_ci	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
15331cb0ef41Sopenharmony_ci	    &pxor	($tweak,$twres);
15341cb0ef41Sopenharmony_ci	}
15351cb0ef41Sopenharmony_ci	&pshufd	($inout5,$twtmp,0x13);
15361cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
15371cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
15381cb0ef41Sopenharmony_ci	 &$movekey	($rndkey0,&QWP(0,$key_));
15391cb0ef41Sopenharmony_ci	&pand	($inout5,$twmask);		# isolate carry and residue
15401cb0ef41Sopenharmony_ci	 &movups	($inout0,&QWP(0,$inp));	# load input
15411cb0ef41Sopenharmony_ci	&pxor	($inout5,$tweak);
15421cb0ef41Sopenharmony_ci
15431cb0ef41Sopenharmony_ci	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
15441cb0ef41Sopenharmony_ci	&mov	($rounds,$rounds_);
15451cb0ef41Sopenharmony_ci	&movdqu	($inout1,&QWP(16*1,$inp));
15461cb0ef41Sopenharmony_ci	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
15471cb0ef41Sopenharmony_ci	&movdqu	($inout2,&QWP(16*2,$inp));
15481cb0ef41Sopenharmony_ci	 &pxor		($inout1,$rndkey0);
15491cb0ef41Sopenharmony_ci	&movdqu	($inout3,&QWP(16*3,$inp));
15501cb0ef41Sopenharmony_ci	 &pxor		($inout2,$rndkey0);
15511cb0ef41Sopenharmony_ci	&movdqu	($inout4,&QWP(16*4,$inp));
15521cb0ef41Sopenharmony_ci	 &pxor		($inout3,$rndkey0);
15531cb0ef41Sopenharmony_ci	&movdqu	($rndkey1,&QWP(16*5,$inp));
15541cb0ef41Sopenharmony_ci	 &pxor		($inout4,$rndkey0);
15551cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16*6,$inp));
15561cb0ef41Sopenharmony_ci	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
15571cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
15581cb0ef41Sopenharmony_ci	&pxor	($inout5,$rndkey1);
15591cb0ef41Sopenharmony_ci
15601cb0ef41Sopenharmony_ci	 &$movekey	($rndkey1,&QWP(16,$key_));
15611cb0ef41Sopenharmony_ci	&pxor	($inout1,&QWP(16*1,"esp"));
15621cb0ef41Sopenharmony_ci	&pxor	($inout2,&QWP(16*2,"esp"));
15631cb0ef41Sopenharmony_ci	 &aesdec	($inout0,$rndkey1);
15641cb0ef41Sopenharmony_ci	&pxor	($inout3,&QWP(16*3,"esp"));
15651cb0ef41Sopenharmony_ci	&pxor	($inout4,&QWP(16*4,"esp"));
15661cb0ef41Sopenharmony_ci	 &aesdec	($inout1,$rndkey1);
15671cb0ef41Sopenharmony_ci	&pxor		($inout5,$rndkey0);
15681cb0ef41Sopenharmony_ci	 &$movekey	($rndkey0,&QWP(32,$key_));
15691cb0ef41Sopenharmony_ci	 &aesdec	($inout2,$rndkey1);
15701cb0ef41Sopenharmony_ci	 &aesdec	($inout3,$rndkey1);
15711cb0ef41Sopenharmony_ci	 &aesdec	($inout4,$rndkey1);
15721cb0ef41Sopenharmony_ci	 &aesdec	($inout5,$rndkey1);
15731cb0ef41Sopenharmony_ci	&call		(&label("_aesni_decrypt6_enter"));
15741cb0ef41Sopenharmony_ci
15751cb0ef41Sopenharmony_ci	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
15761cb0ef41Sopenharmony_ci       &pxor	($twtmp,$twtmp);
15771cb0ef41Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
15781cb0ef41Sopenharmony_ci       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
15791cb0ef41Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
15801cb0ef41Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
15811cb0ef41Sopenharmony_ci	&xorps	($inout2,&QWP(16*2,"esp"));
15821cb0ef41Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
15831cb0ef41Sopenharmony_ci	&xorps	($inout3,&QWP(16*3,"esp"));
15841cb0ef41Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
15851cb0ef41Sopenharmony_ci	&xorps	($inout4,&QWP(16*4,"esp"));
15861cb0ef41Sopenharmony_ci	&movups	(&QWP(16*3,$out),$inout3);
15871cb0ef41Sopenharmony_ci	&xorps	($inout5,$tweak);
15881cb0ef41Sopenharmony_ci	&movups	(&QWP(16*4,$out),$inout4);
15891cb0ef41Sopenharmony_ci       &pshufd	($twres,$twtmp,0x13);
15901cb0ef41Sopenharmony_ci	&movups	(&QWP(16*5,$out),$inout5);
15911cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16*6,$out));
15921cb0ef41Sopenharmony_ci       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
15931cb0ef41Sopenharmony_ci
15941cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
15951cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
15961cb0ef41Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
15971cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
15981cb0ef41Sopenharmony_ci	&pxor	($tweak,$twres);
15991cb0ef41Sopenharmony_ci
16001cb0ef41Sopenharmony_ci	&sub	($len,16*6);
16011cb0ef41Sopenharmony_ci	&jnc	(&label("xts_dec_loop6"));
16021cb0ef41Sopenharmony_ci
16031cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
16041cb0ef41Sopenharmony_ci	&mov	($key,$key_);			# restore $key
16051cb0ef41Sopenharmony_ci	&mov	($rounds_,$rounds);
16061cb0ef41Sopenharmony_ci
16071cb0ef41Sopenharmony_ci&set_label("xts_dec_short");
16081cb0ef41Sopenharmony_ci	&add	($len,16*6);
16091cb0ef41Sopenharmony_ci	&jz	(&label("xts_dec_done6x"));
16101cb0ef41Sopenharmony_ci
16111cb0ef41Sopenharmony_ci	&movdqa	($inout3,$tweak);		# put aside previous tweak
16121cb0ef41Sopenharmony_ci	&cmp	($len,0x20);
16131cb0ef41Sopenharmony_ci	&jb	(&label("xts_dec_one"));
16141cb0ef41Sopenharmony_ci
16151cb0ef41Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
16161cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
16171cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
16181cb0ef41Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
16191cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
16201cb0ef41Sopenharmony_ci	&pxor	($tweak,$twres);
16211cb0ef41Sopenharmony_ci	&je	(&label("xts_dec_two"));
16221cb0ef41Sopenharmony_ci
16231cb0ef41Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
16241cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
16251cb0ef41Sopenharmony_ci	&movdqa	($inout4,$tweak);		# put aside previous tweak
16261cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
16271cb0ef41Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
16281cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
16291cb0ef41Sopenharmony_ci	&pxor	($tweak,$twres);
16301cb0ef41Sopenharmony_ci	&cmp	($len,0x40);
16311cb0ef41Sopenharmony_ci	&jb	(&label("xts_dec_three"));
16321cb0ef41Sopenharmony_ci
16331cb0ef41Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
16341cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
16351cb0ef41Sopenharmony_ci	&movdqa	($inout5,$tweak);		# put aside previous tweak
16361cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
16371cb0ef41Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
16381cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
16391cb0ef41Sopenharmony_ci	&pxor	($tweak,$twres);
16401cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*0,"esp"),$inout3);
16411cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*1,"esp"),$inout4);
16421cb0ef41Sopenharmony_ci	&je	(&label("xts_dec_four"));
16431cb0ef41Sopenharmony_ci
16441cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*2,"esp"),$inout5);
16451cb0ef41Sopenharmony_ci	&pshufd	($inout5,$twtmp,0x13);
16461cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*3,"esp"),$tweak);
16471cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
16481cb0ef41Sopenharmony_ci	&pand	($inout5,$twmask);		# isolate carry and residue
16491cb0ef41Sopenharmony_ci	&pxor	($inout5,$tweak);
16501cb0ef41Sopenharmony_ci
16511cb0ef41Sopenharmony_ci	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
16521cb0ef41Sopenharmony_ci	&movdqu	($inout1,&QWP(16*1,$inp));
16531cb0ef41Sopenharmony_ci	&movdqu	($inout2,&QWP(16*2,$inp));
16541cb0ef41Sopenharmony_ci	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
16551cb0ef41Sopenharmony_ci	&movdqu	($inout3,&QWP(16*3,$inp));
16561cb0ef41Sopenharmony_ci	&pxor	($inout1,&QWP(16*1,"esp"));
16571cb0ef41Sopenharmony_ci	&movdqu	($inout4,&QWP(16*4,$inp));
16581cb0ef41Sopenharmony_ci	&pxor	($inout2,&QWP(16*2,"esp"));
16591cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16*5,$inp));
16601cb0ef41Sopenharmony_ci	&pxor	($inout3,&QWP(16*3,"esp"));
16611cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
16621cb0ef41Sopenharmony_ci	&pxor	($inout4,$inout5);
16631cb0ef41Sopenharmony_ci
16641cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt6");
16651cb0ef41Sopenharmony_ci
16661cb0ef41Sopenharmony_ci	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
16671cb0ef41Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
16681cb0ef41Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
16691cb0ef41Sopenharmony_ci	&xorps	($inout2,&QWP(16*2,"esp"));
16701cb0ef41Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
16711cb0ef41Sopenharmony_ci	&xorps	($inout3,&QWP(16*3,"esp"));
16721cb0ef41Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
16731cb0ef41Sopenharmony_ci	&xorps	($inout4,$tweak);
16741cb0ef41Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
16751cb0ef41Sopenharmony_ci	&movups	(&QWP(16*3,$out),$inout3);
16761cb0ef41Sopenharmony_ci	&movups	(&QWP(16*4,$out),$inout4);
16771cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16*5,$out));
16781cb0ef41Sopenharmony_ci	&jmp	(&label("xts_dec_done"));
16791cb0ef41Sopenharmony_ci
16801cb0ef41Sopenharmony_ci&set_label("xts_dec_one",16);
16811cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
16821cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16*1,$inp));
16831cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
16841cb0ef41Sopenharmony_ci	if ($inline)
16851cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
16861cb0ef41Sopenharmony_ci	else
16871cb0ef41Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
16881cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
16891cb0ef41Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
16901cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16*1,$out));
16911cb0ef41Sopenharmony_ci
16921cb0ef41Sopenharmony_ci	&movdqa	($tweak,$inout3);		# last tweak
16931cb0ef41Sopenharmony_ci	&jmp	(&label("xts_dec_done"));
16941cb0ef41Sopenharmony_ci
16951cb0ef41Sopenharmony_ci&set_label("xts_dec_two",16);
16961cb0ef41Sopenharmony_ci	&movaps	($inout4,$tweak);		# put aside last tweak
16971cb0ef41Sopenharmony_ci
16981cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
16991cb0ef41Sopenharmony_ci	&movups	($inout1,&QWP(16*1,$inp));
17001cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16*2,$inp));
17011cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
17021cb0ef41Sopenharmony_ci	&xorps	($inout1,$inout4);
17031cb0ef41Sopenharmony_ci
17041cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt2");
17051cb0ef41Sopenharmony_ci
17061cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
17071cb0ef41Sopenharmony_ci	&xorps	($inout1,$inout4);
17081cb0ef41Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
17091cb0ef41Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
17101cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16*2,$out));
17111cb0ef41Sopenharmony_ci
17121cb0ef41Sopenharmony_ci	&movdqa	($tweak,$inout4);		# last tweak
17131cb0ef41Sopenharmony_ci	&jmp	(&label("xts_dec_done"));
17141cb0ef41Sopenharmony_ci
17151cb0ef41Sopenharmony_ci&set_label("xts_dec_three",16);
17161cb0ef41Sopenharmony_ci	&movaps	($inout5,$tweak);		# put aside last tweak
17171cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
17181cb0ef41Sopenharmony_ci	&movups	($inout1,&QWP(16*1,$inp));
17191cb0ef41Sopenharmony_ci	&movups	($inout2,&QWP(16*2,$inp));
17201cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16*3,$inp));
17211cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
17221cb0ef41Sopenharmony_ci	&xorps	($inout1,$inout4);
17231cb0ef41Sopenharmony_ci	&xorps	($inout2,$inout5);
17241cb0ef41Sopenharmony_ci
17251cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt3");
17261cb0ef41Sopenharmony_ci
17271cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
17281cb0ef41Sopenharmony_ci	&xorps	($inout1,$inout4);
17291cb0ef41Sopenharmony_ci	&xorps	($inout2,$inout5);
17301cb0ef41Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
17311cb0ef41Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
17321cb0ef41Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
17331cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16*3,$out));
17341cb0ef41Sopenharmony_ci
17351cb0ef41Sopenharmony_ci	&movdqa	($tweak,$inout5);		# last tweak
17361cb0ef41Sopenharmony_ci	&jmp	(&label("xts_dec_done"));
17371cb0ef41Sopenharmony_ci
17381cb0ef41Sopenharmony_ci&set_label("xts_dec_four",16);
17391cb0ef41Sopenharmony_ci	&movaps	($inout4,$tweak);		# put aside last tweak
17401cb0ef41Sopenharmony_ci
17411cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(16*0,$inp));	# load input
17421cb0ef41Sopenharmony_ci	&movups	($inout1,&QWP(16*1,$inp));
17431cb0ef41Sopenharmony_ci	&movups	($inout2,&QWP(16*2,$inp));
17441cb0ef41Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
17451cb0ef41Sopenharmony_ci	&movups	($inout3,&QWP(16*3,$inp));
17461cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16*4,$inp));
17471cb0ef41Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
17481cb0ef41Sopenharmony_ci	&xorps	($inout2,$inout5);
17491cb0ef41Sopenharmony_ci	&xorps	($inout3,$inout4);
17501cb0ef41Sopenharmony_ci
17511cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt4");
17521cb0ef41Sopenharmony_ci
17531cb0ef41Sopenharmony_ci	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
17541cb0ef41Sopenharmony_ci	&xorps	($inout1,&QWP(16*1,"esp"));
17551cb0ef41Sopenharmony_ci	&xorps	($inout2,$inout5);
17561cb0ef41Sopenharmony_ci	&movups	(&QWP(16*0,$out),$inout0);	# write output
17571cb0ef41Sopenharmony_ci	&xorps	($inout3,$inout4);
17581cb0ef41Sopenharmony_ci	&movups	(&QWP(16*1,$out),$inout1);
17591cb0ef41Sopenharmony_ci	&movups	(&QWP(16*2,$out),$inout2);
17601cb0ef41Sopenharmony_ci	&movups	(&QWP(16*3,$out),$inout3);
17611cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16*4,$out));
17621cb0ef41Sopenharmony_ci
17631cb0ef41Sopenharmony_ci	&movdqa	($tweak,$inout4);		# last tweak
17641cb0ef41Sopenharmony_ci	&jmp	(&label("xts_dec_done"));
17651cb0ef41Sopenharmony_ci
17661cb0ef41Sopenharmony_ci&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
17671cb0ef41Sopenharmony_ci	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
17681cb0ef41Sopenharmony_ci	&and	($len,15);
17691cb0ef41Sopenharmony_ci	&jz	(&label("xts_dec_ret"));
17701cb0ef41Sopenharmony_ci	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
17711cb0ef41Sopenharmony_ci	&jmp	(&label("xts_dec_only_one_more"));
17721cb0ef41Sopenharmony_ci
17731cb0ef41Sopenharmony_ci&set_label("xts_dec_done",16);
17741cb0ef41Sopenharmony_ci	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
17751cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
17761cb0ef41Sopenharmony_ci	&and	($len,15);
17771cb0ef41Sopenharmony_ci	&jz	(&label("xts_dec_ret"));
17781cb0ef41Sopenharmony_ci
17791cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
17801cb0ef41Sopenharmony_ci	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
17811cb0ef41Sopenharmony_ci	&pshufd	($twres,$twtmp,0x13);
17821cb0ef41Sopenharmony_ci	&pxor	($twtmp,$twtmp);
17831cb0ef41Sopenharmony_ci	&movdqa	($twmask,&QWP(16*6,"esp"));
17841cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
17851cb0ef41Sopenharmony_ci	&pand	($twres,$twmask);		# isolate carry and residue
17861cb0ef41Sopenharmony_ci	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
17871cb0ef41Sopenharmony_ci	&pxor	($tweak,$twres);
17881cb0ef41Sopenharmony_ci
17891cb0ef41Sopenharmony_ci&set_label("xts_dec_only_one_more");
17901cb0ef41Sopenharmony_ci	&pshufd	($inout3,$twtmp,0x13);
17911cb0ef41Sopenharmony_ci	&movdqa	($inout4,$tweak);		# put aside previous tweak
17921cb0ef41Sopenharmony_ci	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
17931cb0ef41Sopenharmony_ci	&pand	($inout3,$twmask);		# isolate carry and residue
17941cb0ef41Sopenharmony_ci	&pxor	($inout3,$tweak);
17951cb0ef41Sopenharmony_ci
17961cb0ef41Sopenharmony_ci	&mov	($key,$key_);			# restore $key
17971cb0ef41Sopenharmony_ci	&mov	($rounds,$rounds_);		# restore $rounds
17981cb0ef41Sopenharmony_ci
17991cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));		# load input
18001cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# input^=tweak
18011cb0ef41Sopenharmony_ci	if ($inline)
18021cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
18031cb0ef41Sopenharmony_ci	else
18041cb0ef41Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
18051cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout3);		# output^=tweak
18061cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);		# write output
18071cb0ef41Sopenharmony_ci
18081cb0ef41Sopenharmony_ci&set_label("xts_dec_steal");
18091cb0ef41Sopenharmony_ci	&movz	($rounds,&BP(16,$inp));
18101cb0ef41Sopenharmony_ci	&movz	($key,&BP(0,$out));
18111cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(1,$inp));
18121cb0ef41Sopenharmony_ci	&mov	(&BP(0,$out),&LB($rounds));
18131cb0ef41Sopenharmony_ci	&mov	(&BP(16,$out),&LB($key));
18141cb0ef41Sopenharmony_ci	&lea	($out,&DWP(1,$out));
18151cb0ef41Sopenharmony_ci	&sub	($len,1);
18161cb0ef41Sopenharmony_ci	&jnz	(&label("xts_dec_steal"));
18171cb0ef41Sopenharmony_ci
18181cb0ef41Sopenharmony_ci	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
18191cb0ef41Sopenharmony_ci	&mov	($key,$key_);			# restore $key
18201cb0ef41Sopenharmony_ci	&mov	($rounds,$rounds_);		# restore $rounds
18211cb0ef41Sopenharmony_ci
18221cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(0,$out));		# load input
18231cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout4);		# input^=tweak
18241cb0ef41Sopenharmony_ci	if ($inline)
18251cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
18261cb0ef41Sopenharmony_ci	else
18271cb0ef41Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
18281cb0ef41Sopenharmony_ci	&xorps	($inout0,$inout4);		# output^=tweak
18291cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);		# write output
18301cb0ef41Sopenharmony_ci
18311cb0ef41Sopenharmony_ci&set_label("xts_dec_ret");
18321cb0ef41Sopenharmony_ci	&pxor	("xmm0","xmm0");		# clear register bank
18331cb0ef41Sopenharmony_ci	&pxor	("xmm1","xmm1");
18341cb0ef41Sopenharmony_ci	&pxor	("xmm2","xmm2");
18351cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
18361cb0ef41Sopenharmony_ci	&pxor	("xmm3","xmm3");
18371cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*1,"esp"),"xmm0");
18381cb0ef41Sopenharmony_ci	&pxor	("xmm4","xmm4");
18391cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*2,"esp"),"xmm0");
18401cb0ef41Sopenharmony_ci	&pxor	("xmm5","xmm5");
18411cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*3,"esp"),"xmm0");
18421cb0ef41Sopenharmony_ci	&pxor	("xmm6","xmm6");
18431cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*4,"esp"),"xmm0");
18441cb0ef41Sopenharmony_ci	&pxor	("xmm7","xmm7");
18451cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*5,"esp"),"xmm0");
18461cb0ef41Sopenharmony_ci	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
18471cb0ef41Sopenharmony_ci&function_end("aesni_xts_decrypt");
18481cb0ef41Sopenharmony_ci}
18491cb0ef41Sopenharmony_ci
18501cb0ef41Sopenharmony_ci######################################################################
18511cb0ef41Sopenharmony_ci# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
18521cb0ef41Sopenharmony_ci#	const AES_KEY *key, unsigned int start_block_num,
18531cb0ef41Sopenharmony_ci#	unsigned char offset_i[16], const unsigned char L_[][16],
18541cb0ef41Sopenharmony_ci#	unsigned char checksum[16]);
18551cb0ef41Sopenharmony_ci#
18561cb0ef41Sopenharmony_ci{
18571cb0ef41Sopenharmony_ci# offsets within stack frame
18581cb0ef41Sopenharmony_cimy $checksum = 16*6;
18591cb0ef41Sopenharmony_cimy ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
18601cb0ef41Sopenharmony_ci
18611cb0ef41Sopenharmony_ci# reassigned registers
18621cb0ef41Sopenharmony_cimy ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
18631cb0ef41Sopenharmony_ci# $l_, $block, $inp, $key are permanently allocated in registers;
18641cb0ef41Sopenharmony_ci# the remaining non-volatile values are offloaded to the stack, where
18651cb0ef41Sopenharmony_ci# they stay invariant once written.
18661cb0ef41Sopenharmony_ci
18671cb0ef41Sopenharmony_ci&function_begin("aesni_ocb_encrypt");
18681cb0ef41Sopenharmony_ci	&mov	($rounds,&wparam(5));		# &offset_i
18691cb0ef41Sopenharmony_ci	&mov	($rounds_,&wparam(7));		# &checksum
18701cb0ef41Sopenharmony_ci
18711cb0ef41Sopenharmony_ci	&mov	($inp,&wparam(0));
18721cb0ef41Sopenharmony_ci	&mov	($out,&wparam(1));
18731cb0ef41Sopenharmony_ci	&mov	($len,&wparam(2));
18741cb0ef41Sopenharmony_ci	&mov	($key,&wparam(3));
18751cb0ef41Sopenharmony_ci	&movdqu	($rndkey0,&QWP(0,$rounds));	# load offset_i
18761cb0ef41Sopenharmony_ci	&mov	($block,&wparam(4));		# start_block_num
18771cb0ef41Sopenharmony_ci	&movdqu	($rndkey1,&QWP(0,$rounds_));	# load checksum
18781cb0ef41Sopenharmony_ci	&mov	($l_,&wparam(6));		# L_
18791cb0ef41Sopenharmony_ci
18801cb0ef41Sopenharmony_ci	&mov	($rounds,"esp");
18811cb0ef41Sopenharmony_ci	&sub	("esp",$esp_off+4);		# alloca
18821cb0ef41Sopenharmony_ci	&and	("esp",-16);			# align stack
18831cb0ef41Sopenharmony_ci
18841cb0ef41Sopenharmony_ci	&sub	($out,$inp);
18851cb0ef41Sopenharmony_ci	&shl	($len,4);
18861cb0ef41Sopenharmony_ci	&lea	($len,&DWP(-16*6,$inp,$len));	# end of input - 16*6
18871cb0ef41Sopenharmony_ci	&mov	(&DWP($out_off,"esp"),$out);
18881cb0ef41Sopenharmony_ci	&mov	(&DWP($end_off,"esp"),$len);
18891cb0ef41Sopenharmony_ci	&mov	(&DWP($esp_off,"esp"),$rounds);
18901cb0ef41Sopenharmony_ci
18911cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
18921cb0ef41Sopenharmony_ci
18931cb0ef41Sopenharmony_ci	&test	($block,1);
18941cb0ef41Sopenharmony_ci	&jnz	(&label("odd"));
18951cb0ef41Sopenharmony_ci
18961cb0ef41Sopenharmony_ci	&bsf		($i3,$block);
18971cb0ef41Sopenharmony_ci	&add		($block,1);
18981cb0ef41Sopenharmony_ci	&shl		($i3,4);
18991cb0ef41Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i3));
19001cb0ef41Sopenharmony_ci	&mov		($i3,$key);			# put aside key
19011cb0ef41Sopenharmony_ci
19021cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
19031cb0ef41Sopenharmony_ci	&lea		($inp,&DWP(16,$inp));
19041cb0ef41Sopenharmony_ci
19051cb0ef41Sopenharmony_ci	&pxor		($inout5,$rndkey0);		# ^ last offset_i
19061cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
19071cb0ef41Sopenharmony_ci	&pxor		($inout0,$inout5);		# ^ offset_i
19081cb0ef41Sopenharmony_ci
19091cb0ef41Sopenharmony_ci	&movdqa		($inout4,$rndkey1);
19101cb0ef41Sopenharmony_ci	if ($inline)
19111cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
19121cb0ef41Sopenharmony_ci	else
19131cb0ef41Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
19141cb0ef41Sopenharmony_ci
19151cb0ef41Sopenharmony_ci	&xorps		($inout0,$inout5);		# ^ offset_i
19161cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
19171cb0ef41Sopenharmony_ci	&movdqa		($rndkey1,$inout4);		# pass the checksum
19181cb0ef41Sopenharmony_ci
19191cb0ef41Sopenharmony_ci	&movups		(&QWP(-16,$out,$inp),$inout0);	# store output
19201cb0ef41Sopenharmony_ci
19211cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP(240,$i3));
19221cb0ef41Sopenharmony_ci	&mov		($key,$i3);			# restore key
19231cb0ef41Sopenharmony_ci	&mov		($len,&DWP($end_off,"esp"));
19241cb0ef41Sopenharmony_ci
19251cb0ef41Sopenharmony_ci&set_label("odd");
19261cb0ef41Sopenharmony_ci	&shl		($rounds,4);
19271cb0ef41Sopenharmony_ci	&mov		($out,16);
19281cb0ef41Sopenharmony_ci	&sub		($out,$rounds);			# twisted rounds
19291cb0ef41Sopenharmony_ci	&mov		(&DWP($key_off,"esp"),$key);
19301cb0ef41Sopenharmony_ci	&lea		($key,&DWP(32,$key,$rounds));	# end of key schedule
19311cb0ef41Sopenharmony_ci	&mov		(&DWP($rounds_off,"esp"),$out);
19321cb0ef41Sopenharmony_ci
19331cb0ef41Sopenharmony_ci	&cmp		($inp,$len);
19341cb0ef41Sopenharmony_ci	&ja		(&label("short"));
19351cb0ef41Sopenharmony_ci	&jmp		(&label("grandloop"));
19361cb0ef41Sopenharmony_ci
19371cb0ef41Sopenharmony_ci&set_label("grandloop",32);
19381cb0ef41Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
19391cb0ef41Sopenharmony_ci	&lea		($i3,&DWP(3,$block));
19401cb0ef41Sopenharmony_ci	&lea		($i5,&DWP(5,$block));
19411cb0ef41Sopenharmony_ci	&add		($block,6);
19421cb0ef41Sopenharmony_ci	&bsf		($i1,$i1);
19431cb0ef41Sopenharmony_ci	&bsf		($i3,$i3);
19441cb0ef41Sopenharmony_ci	&bsf		($i5,$i5);
19451cb0ef41Sopenharmony_ci	&shl		($i1,4);
19461cb0ef41Sopenharmony_ci	&shl		($i3,4);
19471cb0ef41Sopenharmony_ci	&shl		($i5,4);
19481cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(0,$l_));
19491cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(0,$l_,$i1));
19501cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP($rounds_off,"esp"));
19511cb0ef41Sopenharmony_ci	&movdqa		($inout2,$inout0);
19521cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_,$i3));
19531cb0ef41Sopenharmony_ci	&movdqa		($inout4,$inout0);
19541cb0ef41Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i5));
19551cb0ef41Sopenharmony_ci
19561cb0ef41Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ last offset_i
19571cb0ef41Sopenharmony_ci	&pxor		($inout1,$inout0);
19581cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$inout0);
19591cb0ef41Sopenharmony_ci	&pxor		($inout2,$inout1);
19601cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$inout1);
19611cb0ef41Sopenharmony_ci	&pxor		($inout3,$inout2);
19621cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*2,"esp"),$inout2);
19631cb0ef41Sopenharmony_ci	&pxor		($inout4,$inout3);
19641cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$inout3);
19651cb0ef41Sopenharmony_ci	&pxor		($inout5,$inout4);
19661cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*4,"esp"),$inout4);
19671cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*5,"esp"),$inout5);
19681cb0ef41Sopenharmony_ci
19691cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
19701cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
19711cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
19721cb0ef41Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
19731cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(16*3,$inp));
19741cb0ef41Sopenharmony_ci	&movdqu		($inout4,&QWP(16*4,$inp));
19751cb0ef41Sopenharmony_ci	&movdqu		($inout5,&QWP(16*5,$inp));
19761cb0ef41Sopenharmony_ci	&lea		($inp,&DWP(16*6,$inp));
19771cb0ef41Sopenharmony_ci
19781cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
19791cb0ef41Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
19801cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout1);
19811cb0ef41Sopenharmony_ci	&pxor		($inout1,$rndkey0);
19821cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout2);
19831cb0ef41Sopenharmony_ci	&pxor		($inout2,$rndkey0);
19841cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout3);
19851cb0ef41Sopenharmony_ci	&pxor		($inout3,$rndkey0);
19861cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout4);
19871cb0ef41Sopenharmony_ci	&pxor		($inout4,$rndkey0);
19881cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout5);
19891cb0ef41Sopenharmony_ci	&pxor		($inout5,$rndkey0);
19901cb0ef41Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
19911cb0ef41Sopenharmony_ci
19921cb0ef41Sopenharmony_ci	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
19931cb0ef41Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
19941cb0ef41Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
19951cb0ef41Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
19961cb0ef41Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
19971cb0ef41Sopenharmony_ci	&pxor		($inout4,&QWP(16*4,"esp"));
19981cb0ef41Sopenharmony_ci	&pxor		($inout5,&QWP(16*5,"esp"));
19991cb0ef41Sopenharmony_ci
20001cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
20011cb0ef41Sopenharmony_ci	&aesenc		($inout0,$rndkey1);
20021cb0ef41Sopenharmony_ci	&aesenc		($inout1,$rndkey1);
20031cb0ef41Sopenharmony_ci	&aesenc		($inout2,$rndkey1);
20041cb0ef41Sopenharmony_ci	&aesenc		($inout3,$rndkey1);
20051cb0ef41Sopenharmony_ci	&aesenc		($inout4,$rndkey1);
20061cb0ef41Sopenharmony_ci	&aesenc		($inout5,$rndkey1);
20071cb0ef41Sopenharmony_ci
20081cb0ef41Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
20091cb0ef41Sopenharmony_ci	&mov		($len,&DWP($end_off,"esp"));
20101cb0ef41Sopenharmony_ci	&call		("_aesni_encrypt6_enter");
20111cb0ef41Sopenharmony_ci
20121cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,&QWP(16*5,"esp"));	# pass last offset_i
20131cb0ef41Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
20141cb0ef41Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
20151cb0ef41Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
20161cb0ef41Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
20171cb0ef41Sopenharmony_ci	&pxor		($inout4,&QWP(16*4,"esp"));
20181cb0ef41Sopenharmony_ci	&pxor		($inout5,$rndkey0);
20191cb0ef41Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
20201cb0ef41Sopenharmony_ci
20211cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16*6,$out,$inp),$inout0);# store output
20221cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16*5,$out,$inp),$inout1);
20231cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16*4,$out,$inp),$inout2);
20241cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16*3,$out,$inp),$inout3);
20251cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16*2,$out,$inp),$inout4);
20261cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16*1,$out,$inp),$inout5);
20271cb0ef41Sopenharmony_ci	&cmp		($inp,$len);			# done yet?
20281cb0ef41Sopenharmony_ci	&jbe		(&label("grandloop"));
20291cb0ef41Sopenharmony_ci
20301cb0ef41Sopenharmony_ci&set_label("short");
20311cb0ef41Sopenharmony_ci	&add		($len,16*6);
20321cb0ef41Sopenharmony_ci	&sub		($len,$inp);
20331cb0ef41Sopenharmony_ci	&jz		(&label("done"));
20341cb0ef41Sopenharmony_ci
20351cb0ef41Sopenharmony_ci	&cmp		($len,16*2);
20361cb0ef41Sopenharmony_ci	&jb		(&label("one"));
20371cb0ef41Sopenharmony_ci	&je		(&label("two"));
20381cb0ef41Sopenharmony_ci
20391cb0ef41Sopenharmony_ci	&cmp		($len,16*4);
20401cb0ef41Sopenharmony_ci	&jb		(&label("three"));
20411cb0ef41Sopenharmony_ci	&je		(&label("four"));
20421cb0ef41Sopenharmony_ci
20431cb0ef41Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
20441cb0ef41Sopenharmony_ci	&lea		($i3,&DWP(3,$block));
20451cb0ef41Sopenharmony_ci	&bsf		($i1,$i1);
20461cb0ef41Sopenharmony_ci	&bsf		($i3,$i3);
20471cb0ef41Sopenharmony_ci	&shl		($i1,4);
20481cb0ef41Sopenharmony_ci	&shl		($i3,4);
20491cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(0,$l_));
20501cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(0,$l_,$i1));
20511cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP($rounds_off,"esp"));
20521cb0ef41Sopenharmony_ci	&movdqa		($inout2,$inout0);
20531cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_,$i3));
20541cb0ef41Sopenharmony_ci	&movdqa		($inout4,$inout0);
20551cb0ef41Sopenharmony_ci
20561cb0ef41Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ last offset_i
20571cb0ef41Sopenharmony_ci	&pxor		($inout1,$inout0);
20581cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$inout0);
20591cb0ef41Sopenharmony_ci	&pxor		($inout2,$inout1);
20601cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$inout1);
20611cb0ef41Sopenharmony_ci	&pxor		($inout3,$inout2);
20621cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*2,"esp"),$inout2);
20631cb0ef41Sopenharmony_ci	&pxor		($inout4,$inout3);
20641cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$inout3);
20651cb0ef41Sopenharmony_ci	&pxor		($inout5,$inout4);
20661cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*4,"esp"),$inout4);
20671cb0ef41Sopenharmony_ci
20681cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
20691cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
20701cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
20711cb0ef41Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
20721cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(16*3,$inp));
20731cb0ef41Sopenharmony_ci	&movdqu		($inout4,&QWP(16*4,$inp));
20741cb0ef41Sopenharmony_ci	&pxor		($inout5,$inout5);
20751cb0ef41Sopenharmony_ci
20761cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
20771cb0ef41Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
20781cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout1);
20791cb0ef41Sopenharmony_ci	&pxor		($inout1,$rndkey0);
20801cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout2);
20811cb0ef41Sopenharmony_ci	&pxor		($inout2,$rndkey0);
20821cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout3);
20831cb0ef41Sopenharmony_ci	&pxor		($inout3,$rndkey0);
20841cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout4);
20851cb0ef41Sopenharmony_ci	&pxor		($inout4,$rndkey0);
20861cb0ef41Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
20871cb0ef41Sopenharmony_ci
20881cb0ef41Sopenharmony_ci	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
20891cb0ef41Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
20901cb0ef41Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
20911cb0ef41Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
20921cb0ef41Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
20931cb0ef41Sopenharmony_ci	&pxor		($inout4,&QWP(16*4,"esp"));
20941cb0ef41Sopenharmony_ci
20951cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
20961cb0ef41Sopenharmony_ci	&aesenc		($inout0,$rndkey1);
20971cb0ef41Sopenharmony_ci	&aesenc		($inout1,$rndkey1);
20981cb0ef41Sopenharmony_ci	&aesenc		($inout2,$rndkey1);
20991cb0ef41Sopenharmony_ci	&aesenc		($inout3,$rndkey1);
21001cb0ef41Sopenharmony_ci	&aesenc		($inout4,$rndkey1);
21011cb0ef41Sopenharmony_ci	&aesenc		($inout5,$rndkey1);
21021cb0ef41Sopenharmony_ci
21031cb0ef41Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
21041cb0ef41Sopenharmony_ci	&call		("_aesni_encrypt6_enter");
21051cb0ef41Sopenharmony_ci
21061cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,&QWP(16*4,"esp"));	# pass last offset_i
21071cb0ef41Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
21081cb0ef41Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
21091cb0ef41Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
21101cb0ef41Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
21111cb0ef41Sopenharmony_ci	&pxor		($inout4,$rndkey0);
21121cb0ef41Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
21131cb0ef41Sopenharmony_ci
21141cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*0,$out,$inp),$inout0);	# store output
21151cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*1,$out,$inp),$inout1);
21161cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*2,$out,$inp),$inout2);
21171cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*3,$out,$inp),$inout3);
21181cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*4,$out,$inp),$inout4);
21191cb0ef41Sopenharmony_ci
21201cb0ef41Sopenharmony_ci	&jmp		(&label("done"));
21211cb0ef41Sopenharmony_ci
21221cb0ef41Sopenharmony_ci&set_label("one",16);
21231cb0ef41Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_));
21241cb0ef41Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
21251cb0ef41Sopenharmony_ci
21261cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
21271cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));
21281cb0ef41Sopenharmony_ci
21291cb0ef41Sopenharmony_ci	&pxor		($inout5,$rndkey0);		# ^ last offset_i
21301cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
21311cb0ef41Sopenharmony_ci	&pxor		($inout0,$inout5);		# ^ offset_i
21321cb0ef41Sopenharmony_ci
21331cb0ef41Sopenharmony_ci	&movdqa		($inout4,$rndkey1);
21341cb0ef41Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
21351cb0ef41Sopenharmony_ci	if ($inline)
21361cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("enc");	}
21371cb0ef41Sopenharmony_ci	else
21381cb0ef41Sopenharmony_ci	{   &call	("_aesni_encrypt1");	}
21391cb0ef41Sopenharmony_ci
21401cb0ef41Sopenharmony_ci	&xorps		($inout0,$inout5);		# ^ offset_i
21411cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
21421cb0ef41Sopenharmony_ci	&movdqa		($rndkey1,$inout4);		# pass the checksum
21431cb0ef41Sopenharmony_ci	&movups		(&QWP(0,$out,$inp),$inout0);
21441cb0ef41Sopenharmony_ci
21451cb0ef41Sopenharmony_ci	&jmp		(&label("done"));
21461cb0ef41Sopenharmony_ci
21471cb0ef41Sopenharmony_ci&set_label("two",16);				# tail: exactly two blocks left
21481cb0ef41Sopenharmony_ci	&lea		($i1,&DWP(1,$block));		# $i1 = $block+1
21491cb0ef41Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
21501cb0ef41Sopenharmony_ci	&bsf		($i1,$i1);			# index of lowest set bit
21511cb0ef41Sopenharmony_ci	&shl		($i1,4);			# *16 = byte offset into L_
21521cb0ef41Sopenharmony_ci	&movdqu		($inout4,&QWP(0,$l_));		# L_[0]
21531cb0ef41Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i1));	# L_ entry selected by $i1
21541cb0ef41Sopenharmony_ci
21551cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
21561cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
21571cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));	# rounds, read from key+240
21581cb0ef41Sopenharmony_ci
21591cb0ef41Sopenharmony_ci	&pxor		($inout4,$rndkey0);		# ^ last offset_i
21601cb0ef41Sopenharmony_ci	&pxor		($inout5,$inout4);		# offset for 2nd block
21611cb0ef41Sopenharmony_ci
21621cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
21631cb0ef41Sopenharmony_ci	&pxor		($inout0,$inout4);		# ^ offset_i
21641cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout1);
21651cb0ef41Sopenharmony_ci	&pxor		($inout1,$inout5);
21661cb0ef41Sopenharmony_ci
21671cb0ef41Sopenharmony_ci	&movdqa		($inout3,$rndkey1);		# put aside the checksum
21681cb0ef41Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));	# reload saved out-inp delta
21691cb0ef41Sopenharmony_ci	&call		("_aesni_encrypt2");
21701cb0ef41Sopenharmony_ci
21711cb0ef41Sopenharmony_ci	&xorps		($inout0,$inout4);		# ^ offset_i
21721cb0ef41Sopenharmony_ci	&xorps		($inout1,$inout5);
21731cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
21741cb0ef41Sopenharmony_ci	&movdqa		($rndkey1,$inout3);		# pass the checksum
21751cb0ef41Sopenharmony_ci	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
21761cb0ef41Sopenharmony_ci	&movups		(&QWP(16*1,$out,$inp),$inout1);
21771cb0ef41Sopenharmony_ci
21781cb0ef41Sopenharmony_ci	&jmp		(&label("done"));
21791cb0ef41Sopenharmony_ci
21801cb0ef41Sopenharmony_ci&set_label("three",16);
21811cb0ef41Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
21821cb0ef41Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
21831cb0ef41Sopenharmony_ci	&bsf		($i1,$i1);
21841cb0ef41Sopenharmony_ci	&shl		($i1,4);
21851cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_));
21861cb0ef41Sopenharmony_ci	&movdqu		($inout4,&QWP(0,$l_,$i1));
21871cb0ef41Sopenharmony_ci	&movdqa		($inout5,$inout3);
21881cb0ef41Sopenharmony_ci
21891cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
21901cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
21911cb0ef41Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
21921cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));
21931cb0ef41Sopenharmony_ci
21941cb0ef41Sopenharmony_ci	&pxor		($inout3,$rndkey0);		# ^ last offset_i
21951cb0ef41Sopenharmony_ci	&pxor		($inout4,$inout3);
21961cb0ef41Sopenharmony_ci	&pxor		($inout5,$inout4);
21971cb0ef41Sopenharmony_ci
21981cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
21991cb0ef41Sopenharmony_ci	&pxor		($inout0,$inout3);		# ^ offset_i
22001cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout1);
22011cb0ef41Sopenharmony_ci	&pxor		($inout1,$inout4);
22021cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout2);
22031cb0ef41Sopenharmony_ci	&pxor		($inout2,$inout5);
22041cb0ef41Sopenharmony_ci
22051cb0ef41Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
22061cb0ef41Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
22071cb0ef41Sopenharmony_ci	&call		("_aesni_encrypt3");
22081cb0ef41Sopenharmony_ci
22091cb0ef41Sopenharmony_ci	&xorps		($inout0,$inout3);		# ^ offset_i
22101cb0ef41Sopenharmony_ci	&xorps		($inout1,$inout4);
22111cb0ef41Sopenharmony_ci	&xorps		($inout2,$inout5);
22121cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
22131cb0ef41Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
22141cb0ef41Sopenharmony_ci	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
22151cb0ef41Sopenharmony_ci	&movups		(&QWP(16*1,$out,$inp),$inout1);
22161cb0ef41Sopenharmony_ci	&movups		(&QWP(16*2,$out,$inp),$inout2);
22171cb0ef41Sopenharmony_ci
22181cb0ef41Sopenharmony_ci	&jmp		(&label("done"));
22191cb0ef41Sopenharmony_ci
22201cb0ef41Sopenharmony_ci&set_label("four",16);				# tail: exactly four blocks left
22211cb0ef41Sopenharmony_ci	&lea		($i1,&DWP(1,$block));		# $i1 = $block+1
22221cb0ef41Sopenharmony_ci	&lea		($i3,&DWP(3,$block));		# $i3 = $block+3
22231cb0ef41Sopenharmony_ci	&bsf		($i1,$i1);			# index of lowest set bit
22241cb0ef41Sopenharmony_ci	&bsf		($i3,$i3);
22251cb0ef41Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
22261cb0ef41Sopenharmony_ci	&shl		($i1,4);			# *16 = byte offset into L_
22271cb0ef41Sopenharmony_ci	&shl		($i3,4);
22281cb0ef41Sopenharmony_ci	&movdqu		($inout2,&QWP(0,$l_));		# L_[0]
22291cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_,$i1));	# L_ entry selected by $i1
22301cb0ef41Sopenharmony_ci	&movdqa		($inout4,$inout2);		# second copy of L_[0]
22311cb0ef41Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i3));	# L_ entry selected by $i3
22321cb0ef41Sopenharmony_ci
22331cb0ef41Sopenharmony_ci	&pxor		($inout2,$rndkey0);		# ^ last offset_i
22341cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
22351cb0ef41Sopenharmony_ci	&pxor		($inout3,$inout2);		# offset for 2nd block
22361cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
22371cb0ef41Sopenharmony_ci	&pxor		($inout4,$inout3);		# offset for 3rd block
22381cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$inout2);	# spill 1st offset
22391cb0ef41Sopenharmony_ci	&pxor		($inout5,$inout4);		# offset for 4th block
22401cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$inout3);	# spill 2nd offset
22411cb0ef41Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));	# registers reused for input
22421cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(16*3,$inp));
22431cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));	# rounds, read from key+240
22441cb0ef41Sopenharmony_ci
22451cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
22461cb0ef41Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
22471cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout1);
22481cb0ef41Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
22491cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout2);
22501cb0ef41Sopenharmony_ci	&pxor		($inout2,$inout4);
22511cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout3);
22521cb0ef41Sopenharmony_ci	&pxor		($inout3,$inout5);
22531cb0ef41Sopenharmony_ci
22541cb0ef41Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);	# save checksum on stack
22551cb0ef41Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));	# reload saved out-inp delta
22561cb0ef41Sopenharmony_ci	&call		("_aesni_encrypt4");
22571cb0ef41Sopenharmony_ci
22581cb0ef41Sopenharmony_ci	&xorps		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
22591cb0ef41Sopenharmony_ci	&xorps		($inout1,&QWP(16*1,"esp"));
22601cb0ef41Sopenharmony_ci	&xorps		($inout2,$inout4);
22611cb0ef41Sopenharmony_ci	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
22621cb0ef41Sopenharmony_ci	&xorps		($inout3,$inout5);
22631cb0ef41Sopenharmony_ci	&movups		(&QWP(16*1,$out,$inp),$inout1);
22641cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
22651cb0ef41Sopenharmony_ci	&movups		(&QWP(16*2,$out,$inp),$inout2);
22661cb0ef41Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
22671cb0ef41Sopenharmony_ci	&movups		(&QWP(16*3,$out,$inp),$inout3);
22681cb0ef41Sopenharmony_ci
22691cb0ef41Sopenharmony_ci&set_label("done");
22701cb0ef41Sopenharmony_ci	&mov	($key,&DWP($esp_off,"esp"));
22711cb0ef41Sopenharmony_ci	&pxor	($inout0,$inout0);		# clear register bank
22721cb0ef41Sopenharmony_ci	&pxor	($inout1,$inout1);
22731cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*0,"esp"),$inout0);	# clear stack
22741cb0ef41Sopenharmony_ci	&pxor	($inout2,$inout2);
22751cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*1,"esp"),$inout0);
22761cb0ef41Sopenharmony_ci	&pxor	($inout3,$inout3);
22771cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*2,"esp"),$inout0);
22781cb0ef41Sopenharmony_ci	&pxor	($inout4,$inout4);
22791cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*3,"esp"),$inout0);
22801cb0ef41Sopenharmony_ci	&pxor	($inout5,$inout5);
22811cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*4,"esp"),$inout0);
22821cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*5,"esp"),$inout0);
22831cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*6,"esp"),$inout0);
22841cb0ef41Sopenharmony_ci
22851cb0ef41Sopenharmony_ci	&lea	("esp",&DWP(0,$key));
22861cb0ef41Sopenharmony_ci	&mov	($rounds,&wparam(5));		# &offset_i
22871cb0ef41Sopenharmony_ci	&mov	($rounds_,&wparam(7));		# &checksum
22881cb0ef41Sopenharmony_ci	&movdqu	(&QWP(0,$rounds),$rndkey0);
22891cb0ef41Sopenharmony_ci	&pxor	($rndkey0,$rndkey0);
22901cb0ef41Sopenharmony_ci	&movdqu	(&QWP(0,$rounds_),$rndkey1);
22911cb0ef41Sopenharmony_ci	&pxor	($rndkey1,$rndkey1);
22921cb0ef41Sopenharmony_ci&function_end("aesni_ocb_encrypt");
22931cb0ef41Sopenharmony_ci
# aesni_ocb_decrypt: OCB-mode bulk decryption of whole 16-byte blocks.
# Stack arguments, in the order they are read via &wparam():
#   0 inp, 1 out, 2 len (shifted left by 4 below, so presumably a count
#   of 16-byte blocks - confirm against the C prototype), 3 key schedule,
#   4 start_block_num, 5 &offset_i (in/out), 6 L_ table of 16-byte
#   entries, 7 &checksum (in/out).
# For every block: out = decrypt(in ^ offset_i) ^ offset_i, and the
# checksum is XOR-ed with the recovered output block. Between sections
# the running offset travels in $rndkey0 and the checksum in $rndkey1;
# both are written back through the pointer arguments at "done".
22941cb0ef41Sopenharmony_ci&function_begin("aesni_ocb_decrypt");
22951cb0ef41Sopenharmony_ci	&mov	($rounds,&wparam(5));		# &offset_i
22961cb0ef41Sopenharmony_ci	&mov	($rounds_,&wparam(7));		# &checksum
22971cb0ef41Sopenharmony_ci
22981cb0ef41Sopenharmony_ci	&mov	($inp,&wparam(0));
22991cb0ef41Sopenharmony_ci	&mov	($out,&wparam(1));
23001cb0ef41Sopenharmony_ci	&mov	($len,&wparam(2));
23011cb0ef41Sopenharmony_ci	&mov	($key,&wparam(3));
23021cb0ef41Sopenharmony_ci	&movdqu	($rndkey0,&QWP(0,$rounds));	# load offset_i
23031cb0ef41Sopenharmony_ci	&mov	($block,&wparam(4));		# start_block_num
23041cb0ef41Sopenharmony_ci	&movdqu	($rndkey1,&QWP(0,$rounds_));	# load checksum
23051cb0ef41Sopenharmony_ci	&mov	($l_,&wparam(6));		# L_
23061cb0ef41Sopenharmony_ci
	# Build a 16-byte aligned scratch frame; the caller's %esp is kept
	# in $rounds until it is stored at $esp_off a few lines below.
23071cb0ef41Sopenharmony_ci	&mov	($rounds,"esp");
23081cb0ef41Sopenharmony_ci	&sub	("esp",$esp_off+4);		# alloca
23091cb0ef41Sopenharmony_ci	&and	("esp",-16);			# align stack
23101cb0ef41Sopenharmony_ci
	# $out becomes the distance out-inp, so every store below is
	# addressed as ($out,$inp) and only $inp has to be advanced.
23111cb0ef41Sopenharmony_ci	&sub	($out,$inp);
23121cb0ef41Sopenharmony_ci	&shl	($len,4);
23131cb0ef41Sopenharmony_ci	&lea	($len,&DWP(-16*6,$inp,$len));	# end of input - 16*6
23141cb0ef41Sopenharmony_ci	&mov	(&DWP($out_off,"esp"),$out);
23151cb0ef41Sopenharmony_ci	&mov	(&DWP($end_off,"esp"),$len);
23161cb0ef41Sopenharmony_ci	&mov	(&DWP($esp_off,"esp"),$rounds);
23171cb0ef41Sopenharmony_ci
23181cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
23191cb0ef41Sopenharmony_ci
	# If start_block_num is even, process one block on its own first so
	# that $block is odd on entry to the multi-block code.
23201cb0ef41Sopenharmony_ci	&test	($block,1);
23211cb0ef41Sopenharmony_ci	&jnz	(&label("odd"));
23221cb0ef41Sopenharmony_ci
	# L_ index = number of trailing zero bits of the block number,
	# scaled by 16 bytes per table entry.
23231cb0ef41Sopenharmony_ci	&bsf		($i3,$block);
23241cb0ef41Sopenharmony_ci	&add		($block,1);
23251cb0ef41Sopenharmony_ci	&shl		($i3,4);
23261cb0ef41Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i3));
23271cb0ef41Sopenharmony_ci	&mov		($i3,$key);			# put aside key
23281cb0ef41Sopenharmony_ci
23291cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
23301cb0ef41Sopenharmony_ci	&lea		($inp,&DWP(16,$inp));
23311cb0ef41Sopenharmony_ci
23321cb0ef41Sopenharmony_ci	&pxor		($inout5,$rndkey0);		# ^ last offset_i
23331cb0ef41Sopenharmony_ci	&pxor		($inout0,$inout5);		# ^ offset_i
23341cb0ef41Sopenharmony_ci
	# Park the checksum in $inout4 across the single-block decrypt
	# (presumably because that code uses $rndkey0/$rndkey1 for round
	# keys - the helper is defined outside this view).
23351cb0ef41Sopenharmony_ci	&movdqa		($inout4,$rndkey1);
23361cb0ef41Sopenharmony_ci	if ($inline)
23371cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
23381cb0ef41Sopenharmony_ci	else
23391cb0ef41Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
23401cb0ef41Sopenharmony_ci
23411cb0ef41Sopenharmony_ci	&xorps		($inout0,$inout5);		# ^ offset_i
23421cb0ef41Sopenharmony_ci	&movaps		($rndkey1,$inout4);		# pass the checksum
23431cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
23441cb0ef41Sopenharmony_ci	&xorps		($rndkey1,$inout0);		# checksum
	# $inp was already advanced by 16, hence the -16 displacement.
23451cb0ef41Sopenharmony_ci	&movups		(&QWP(-16,$out,$inp),$inout0);	# store output
23461cb0ef41Sopenharmony_ci
	# Re-establish $rounds, $key and $len for the code at "odd".
23471cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP(240,$i3));
23481cb0ef41Sopenharmony_ci	&mov		($key,$i3);			# restore key
23491cb0ef41Sopenharmony_ci	&mov		($len,&DWP($end_off,"esp"));
23501cb0ef41Sopenharmony_ci
	# Set up "twisted" key addressing for the 6-lane code: $key is moved
	# to key+32+16*rounds and 16-16*rounds is saved at $rounds_off, so
	# that &QWP(-48,$key,$rounds) with that saved value in $rounds
	# addresses the start of the schedule. The plain key pointer is
	# kept at $key_off for the 1..4 block tails.
23511cb0ef41Sopenharmony_ci&set_label("odd");
23521cb0ef41Sopenharmony_ci	&shl		($rounds,4);
23531cb0ef41Sopenharmony_ci	&mov		($out,16);
23541cb0ef41Sopenharmony_ci	&sub		($out,$rounds);			# twisted rounds
23551cb0ef41Sopenharmony_ci	&mov		(&DWP($key_off,"esp"),$key);
23561cb0ef41Sopenharmony_ci	&lea		($key,&DWP(32,$key,$rounds));	# end of key schedule
23571cb0ef41Sopenharmony_ci	&mov		(&DWP($rounds_off,"esp"),$out);
23581cb0ef41Sopenharmony_ci
	# Fewer than six blocks left? Then skip the main loop.
23591cb0ef41Sopenharmony_ci	&cmp		($inp,$len);
23601cb0ef41Sopenharmony_ci	&ja		(&label("short"));
23611cb0ef41Sopenharmony_ci	&jmp		(&label("grandloop"));
23621cb0ef41Sopenharmony_ci
	# Main loop: six blocks per iteration. $block is odd here (adding 6
	# preserves parity), so blocks $block, $block+2 and $block+4 use
	# L_[0], while the other three use the entry selected by bsf.
	# NOTE(review): $rounds, $len and $out are reloaded from the frame
	# further down, presumably because $i1/$i3/$i5 share registers with
	# them (the aliases are defined outside this view) - confirm.
23631cb0ef41Sopenharmony_ci&set_label("grandloop",32);
23641cb0ef41Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
23651cb0ef41Sopenharmony_ci	&lea		($i3,&DWP(3,$block));
23661cb0ef41Sopenharmony_ci	&lea		($i5,&DWP(5,$block));
23671cb0ef41Sopenharmony_ci	&add		($block,6);
23681cb0ef41Sopenharmony_ci	&bsf		($i1,$i1);
23691cb0ef41Sopenharmony_ci	&bsf		($i3,$i3);
23701cb0ef41Sopenharmony_ci	&bsf		($i5,$i5);
23711cb0ef41Sopenharmony_ci	&shl		($i1,4);
23721cb0ef41Sopenharmony_ci	&shl		($i3,4);
23731cb0ef41Sopenharmony_ci	&shl		($i5,4);
23741cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(0,$l_));
23751cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(0,$l_,$i1));
23761cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP($rounds_off,"esp"));
23771cb0ef41Sopenharmony_ci	&movdqa		($inout2,$inout0);
23781cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_,$i3));
23791cb0ef41Sopenharmony_ci	&movdqa		($inout4,$inout0);
23801cb0ef41Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i5));
23811cb0ef41Sopenharmony_ci
	# Chain the six offsets (each one is the previous offset XOR its L_
	# entry) and spill them to frame slots 0..5.
23821cb0ef41Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ last offset_i
23831cb0ef41Sopenharmony_ci	&pxor		($inout1,$inout0);
23841cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$inout0);
23851cb0ef41Sopenharmony_ci	&pxor		($inout2,$inout1);
23861cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$inout1);
23871cb0ef41Sopenharmony_ci	&pxor		($inout3,$inout2);
23881cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*2,"esp"),$inout2);
23891cb0ef41Sopenharmony_ci	&pxor		($inout4,$inout3);
23901cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$inout3);
23911cb0ef41Sopenharmony_ci	&pxor		($inout5,$inout4);
23921cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*4,"esp"),$inout4);
23931cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*5,"esp"),$inout5);
23941cb0ef41Sopenharmony_ci
23951cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
23961cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
23971cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
23981cb0ef41Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
23991cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(16*3,$inp));
24001cb0ef41Sopenharmony_ci	&movdqu		($inout4,&QWP(16*4,$inp));
24011cb0ef41Sopenharmony_ci	&movdqu		($inout5,&QWP(16*5,$inp));
24021cb0ef41Sopenharmony_ci	&lea		($inp,&DWP(16*6,$inp));
24031cb0ef41Sopenharmony_ci
	# $rndkey1 is about to be reused for a round key, so the checksum
	# is saved in the frame first.
24041cb0ef41Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
24051cb0ef41Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
24061cb0ef41Sopenharmony_ci	&pxor		($inout1,$rndkey0);
24071cb0ef41Sopenharmony_ci	&pxor		($inout2,$rndkey0);
24081cb0ef41Sopenharmony_ci	&pxor		($inout3,$rndkey0);
24091cb0ef41Sopenharmony_ci	&pxor		($inout4,$rndkey0);
24101cb0ef41Sopenharmony_ci	&pxor		($inout5,$rndkey0);
24111cb0ef41Sopenharmony_ci
24121cb0ef41Sopenharmony_ci	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
24131cb0ef41Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
24141cb0ef41Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
24151cb0ef41Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
24161cb0ef41Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
24171cb0ef41Sopenharmony_ci	&pxor		($inout4,&QWP(16*4,"esp"));
24181cb0ef41Sopenharmony_ci	&pxor		($inout5,&QWP(16*5,"esp"));
24191cb0ef41Sopenharmony_ci
	# The first aesdec round is issued here; the remaining rounds are
	# left to _aesni_decrypt6_enter (defined outside this view).
24201cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
24211cb0ef41Sopenharmony_ci	&aesdec		($inout0,$rndkey1);
24221cb0ef41Sopenharmony_ci	&aesdec		($inout1,$rndkey1);
24231cb0ef41Sopenharmony_ci	&aesdec		($inout2,$rndkey1);
24241cb0ef41Sopenharmony_ci	&aesdec		($inout3,$rndkey1);
24251cb0ef41Sopenharmony_ci	&aesdec		($inout4,$rndkey1);
24261cb0ef41Sopenharmony_ci	&aesdec		($inout5,$rndkey1);
24271cb0ef41Sopenharmony_ci
24281cb0ef41Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
24291cb0ef41Sopenharmony_ci	&mov		($len,&DWP($end_off,"esp"));
24301cb0ef41Sopenharmony_ci	&call		("_aesni_decrypt6_enter");
24311cb0ef41Sopenharmony_ci
24321cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,&QWP(16*5,"esp"));	# pass last offset_i
24331cb0ef41Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
24341cb0ef41Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));
24351cb0ef41Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
24361cb0ef41Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
24371cb0ef41Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
24381cb0ef41Sopenharmony_ci	&pxor		($inout4,&QWP(16*4,"esp"));
24391cb0ef41Sopenharmony_ci	&pxor		($inout5,$rndkey0);
24401cb0ef41Sopenharmony_ci
	# Fold the six output blocks into the checksum and store them;
	# displacements are negative because $inp already points past them.
24411cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
24421cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16*6,$out,$inp),$inout0);# store output
24431cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout1);
24441cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16*5,$out,$inp),$inout1);
24451cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout2);
24461cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16*4,$out,$inp),$inout2);
24471cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout3);
24481cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16*3,$out,$inp),$inout3);
24491cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout4);
24501cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16*2,$out,$inp),$inout4);
24511cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout5);
24521cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16*1,$out,$inp),$inout5);
24531cb0ef41Sopenharmony_ci	&cmp		($inp,$len);			# done yet?
24541cb0ef41Sopenharmony_ci	&jbe		(&label("grandloop"));
24551cb0ef41Sopenharmony_ci
	# Tail: $len holds "end - 16*6"; adding 16*6 back and subtracting
	# $inp gives the number of bytes still to process. Dispatch on it:
	# 0 -> done, 16 -> one, 32 -> two, 48 -> three, 64 -> four,
	# anything larger falls through to the five-block code below.
24561cb0ef41Sopenharmony_ci&set_label("short");
24571cb0ef41Sopenharmony_ci	&add		($len,16*6);
24581cb0ef41Sopenharmony_ci	&sub		($len,$inp);
24591cb0ef41Sopenharmony_ci	&jz		(&label("done"));
24601cb0ef41Sopenharmony_ci
24611cb0ef41Sopenharmony_ci	&cmp		($len,16*2);
24621cb0ef41Sopenharmony_ci	&jb		(&label("one"));
24631cb0ef41Sopenharmony_ci	&je		(&label("two"));
24641cb0ef41Sopenharmony_ci
24651cb0ef41Sopenharmony_ci	&cmp		($len,16*4);
24661cb0ef41Sopenharmony_ci	&jb		(&label("three"));
24671cb0ef41Sopenharmony_ci	&je		(&label("four"));
24681cb0ef41Sopenharmony_ci
	# Five blocks: same scheme as the main loop, run through the 6-lane
	# routine with the sixth lane ($inout5) zeroed. $key still holds the
	# "twisted" end-of-schedule pointer set up at "odd".
24691cb0ef41Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
24701cb0ef41Sopenharmony_ci	&lea		($i3,&DWP(3,$block));
24711cb0ef41Sopenharmony_ci	&bsf		($i1,$i1);
24721cb0ef41Sopenharmony_ci	&bsf		($i3,$i3);
24731cb0ef41Sopenharmony_ci	&shl		($i1,4);
24741cb0ef41Sopenharmony_ci	&shl		($i3,4);
24751cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(0,$l_));
24761cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(0,$l_,$i1));
24771cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP($rounds_off,"esp"));
24781cb0ef41Sopenharmony_ci	&movdqa		($inout2,$inout0);
24791cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_,$i3));
24801cb0ef41Sopenharmony_ci	&movdqa		($inout4,$inout0);
24811cb0ef41Sopenharmony_ci
24821cb0ef41Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ last offset_i
24831cb0ef41Sopenharmony_ci	&pxor		($inout1,$inout0);
24841cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$inout0);
24851cb0ef41Sopenharmony_ci	&pxor		($inout2,$inout1);
24861cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$inout1);
24871cb0ef41Sopenharmony_ci	&pxor		($inout3,$inout2);
24881cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*2,"esp"),$inout2);
24891cb0ef41Sopenharmony_ci	&pxor		($inout4,$inout3);
24901cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*3,"esp"),$inout3);
	# NOTE(review): the result of the next pxor is never used -
	# $inout5 is zeroed a few lines below before anything reads it.
24911cb0ef41Sopenharmony_ci	&pxor		($inout5,$inout4);
24921cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*4,"esp"),$inout4);
24931cb0ef41Sopenharmony_ci
24941cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
24951cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
24961cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
24971cb0ef41Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
24981cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(16*3,$inp));
24991cb0ef41Sopenharmony_ci	&movdqu		($inout4,&QWP(16*4,$inp));
25001cb0ef41Sopenharmony_ci	&pxor		($inout5,$inout5);
25011cb0ef41Sopenharmony_ci
25021cb0ef41Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
25031cb0ef41Sopenharmony_ci	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
25041cb0ef41Sopenharmony_ci	&pxor		($inout1,$rndkey0);
25051cb0ef41Sopenharmony_ci	&pxor		($inout2,$rndkey0);
25061cb0ef41Sopenharmony_ci	&pxor		($inout3,$rndkey0);
25071cb0ef41Sopenharmony_ci	&pxor		($inout4,$rndkey0);
25081cb0ef41Sopenharmony_ci
25091cb0ef41Sopenharmony_ci	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
25101cb0ef41Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
25111cb0ef41Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
25121cb0ef41Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
25131cb0ef41Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
25141cb0ef41Sopenharmony_ci	&pxor		($inout4,&QWP(16*4,"esp"));
25151cb0ef41Sopenharmony_ci
25161cb0ef41Sopenharmony_ci	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
25171cb0ef41Sopenharmony_ci	&aesdec		($inout0,$rndkey1);
25181cb0ef41Sopenharmony_ci	&aesdec		($inout1,$rndkey1);
25191cb0ef41Sopenharmony_ci	&aesdec		($inout2,$rndkey1);
25201cb0ef41Sopenharmony_ci	&aesdec		($inout3,$rndkey1);
25211cb0ef41Sopenharmony_ci	&aesdec		($inout4,$rndkey1);
25221cb0ef41Sopenharmony_ci	&aesdec		($inout5,$rndkey1);
25231cb0ef41Sopenharmony_ci
25241cb0ef41Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
25251cb0ef41Sopenharmony_ci	&call		("_aesni_decrypt6_enter");
25261cb0ef41Sopenharmony_ci
25271cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,&QWP(16*4,"esp"));	# pass last offset_i
25281cb0ef41Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
25291cb0ef41Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));
25301cb0ef41Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
25311cb0ef41Sopenharmony_ci	&pxor		($inout2,&QWP(16*2,"esp"));
25321cb0ef41Sopenharmony_ci	&pxor		($inout3,&QWP(16*3,"esp"));
25331cb0ef41Sopenharmony_ci	&pxor		($inout4,$rndkey0);
25341cb0ef41Sopenharmony_ci
	# $inp was not advanced in the tail code, so displacements are
	# non-negative from here on.
25351cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
25361cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*0,$out,$inp),$inout0);	# store output
25371cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout1);
25381cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*1,$out,$inp),$inout1);
25391cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout2);
25401cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*2,$out,$inp),$inout2);
25411cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout3);
25421cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*3,$out,$inp),$inout3);
25431cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout4);
25441cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16*4,$out,$inp),$inout4);
25451cb0ef41Sopenharmony_ci
25461cb0ef41Sopenharmony_ci	&jmp		(&label("done"));
25471cb0ef41Sopenharmony_ci
	# One block left. The 1..4 block tails reload the plain key pointer
	# from $key_off and the round count from 240($key) before calling
	# the fixed-width helpers. The offset lives in $inout5 and the
	# checksum is parked in $inout4 across the decrypt.
25481cb0ef41Sopenharmony_ci&set_label("one",16);
25491cb0ef41Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_));
25501cb0ef41Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
25511cb0ef41Sopenharmony_ci
25521cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
25531cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));
25541cb0ef41Sopenharmony_ci
25551cb0ef41Sopenharmony_ci	&pxor		($inout5,$rndkey0);		# ^ last offset_i
25561cb0ef41Sopenharmony_ci	&pxor		($inout0,$inout5);		# ^ offset_i
25571cb0ef41Sopenharmony_ci
25581cb0ef41Sopenharmony_ci	&movdqa		($inout4,$rndkey1);
25591cb0ef41Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
25601cb0ef41Sopenharmony_ci	if ($inline)
25611cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
25621cb0ef41Sopenharmony_ci	else
25631cb0ef41Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
25641cb0ef41Sopenharmony_ci
25651cb0ef41Sopenharmony_ci	&xorps		($inout0,$inout5);		# ^ offset_i
25661cb0ef41Sopenharmony_ci	&movaps		($rndkey1,$inout4);		# pass the checksum
25671cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
25681cb0ef41Sopenharmony_ci	&xorps		($rndkey1,$inout0);		# checksum
25691cb0ef41Sopenharmony_ci	&movups		(&QWP(0,$out,$inp),$inout0);
25701cb0ef41Sopenharmony_ci
25711cb0ef41Sopenharmony_ci	&jmp		(&label("done"));
25721cb0ef41Sopenharmony_ci
	# Two blocks left: offsets are kept in $inout4/$inout5 and the
	# checksum in $inout3 while _aesni_decrypt2 runs.
25731cb0ef41Sopenharmony_ci&set_label("two",16);
25741cb0ef41Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
25751cb0ef41Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
25761cb0ef41Sopenharmony_ci	&bsf		($i1,$i1);
25771cb0ef41Sopenharmony_ci	&shl		($i1,4);
25781cb0ef41Sopenharmony_ci	&movdqu		($inout4,&QWP(0,$l_));
25791cb0ef41Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i1));
25801cb0ef41Sopenharmony_ci
25811cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
25821cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
25831cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));
25841cb0ef41Sopenharmony_ci
25851cb0ef41Sopenharmony_ci	&movdqa		($inout3,$rndkey1);
25861cb0ef41Sopenharmony_ci	&pxor		($inout4,$rndkey0);		# ^ last offset_i
25871cb0ef41Sopenharmony_ci	&pxor		($inout5,$inout4);
25881cb0ef41Sopenharmony_ci
25891cb0ef41Sopenharmony_ci	&pxor		($inout0,$inout4);		# ^ offset_i
25901cb0ef41Sopenharmony_ci	&pxor		($inout1,$inout5);
25911cb0ef41Sopenharmony_ci
25921cb0ef41Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
25931cb0ef41Sopenharmony_ci	&call		("_aesni_decrypt2");
25941cb0ef41Sopenharmony_ci
25951cb0ef41Sopenharmony_ci	&xorps		($inout0,$inout4);		# ^ offset_i
25961cb0ef41Sopenharmony_ci	&xorps		($inout1,$inout5);
25971cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
25981cb0ef41Sopenharmony_ci	&xorps		($inout3,$inout0);		# checksum
25991cb0ef41Sopenharmony_ci	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
26001cb0ef41Sopenharmony_ci	&xorps		($inout3,$inout1);
26011cb0ef41Sopenharmony_ci	&movups		(&QWP(16*1,$out,$inp),$inout1);
26021cb0ef41Sopenharmony_ci	&movaps		($rndkey1,$inout3);		# pass the checksum
26031cb0ef41Sopenharmony_ci
26041cb0ef41Sopenharmony_ci	&jmp		(&label("done"));
26051cb0ef41Sopenharmony_ci
	# Three blocks left: offsets are kept in $inout3..$inout5, the
	# checksum is saved in the frame while _aesni_decrypt3 runs.
26061cb0ef41Sopenharmony_ci&set_label("three",16);
26071cb0ef41Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
26081cb0ef41Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
26091cb0ef41Sopenharmony_ci	&bsf		($i1,$i1);
26101cb0ef41Sopenharmony_ci	&shl		($i1,4);
26111cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_));
26121cb0ef41Sopenharmony_ci	&movdqu		($inout4,&QWP(0,$l_,$i1));
26131cb0ef41Sopenharmony_ci	&movdqa		($inout5,$inout3);
26141cb0ef41Sopenharmony_ci
26151cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
26161cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
26171cb0ef41Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
26181cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));
26191cb0ef41Sopenharmony_ci
26201cb0ef41Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
26211cb0ef41Sopenharmony_ci	&pxor		($inout3,$rndkey0);		# ^ last offset_i
26221cb0ef41Sopenharmony_ci	&pxor		($inout4,$inout3);
26231cb0ef41Sopenharmony_ci	&pxor		($inout5,$inout4);
26241cb0ef41Sopenharmony_ci
26251cb0ef41Sopenharmony_ci	&pxor		($inout0,$inout3);		# ^ offset_i
26261cb0ef41Sopenharmony_ci	&pxor		($inout1,$inout4);
26271cb0ef41Sopenharmony_ci	&pxor		($inout2,$inout5);
26281cb0ef41Sopenharmony_ci
26291cb0ef41Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
26301cb0ef41Sopenharmony_ci	&call		("_aesni_decrypt3");
26311cb0ef41Sopenharmony_ci
26321cb0ef41Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
26331cb0ef41Sopenharmony_ci	&xorps		($inout0,$inout3);		# ^ offset_i
26341cb0ef41Sopenharmony_ci	&xorps		($inout1,$inout4);
26351cb0ef41Sopenharmony_ci	&xorps		($inout2,$inout5);
26361cb0ef41Sopenharmony_ci	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
26371cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
26381cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
26391cb0ef41Sopenharmony_ci	&movups		(&QWP(16*1,$out,$inp),$inout1);
26401cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout1);
26411cb0ef41Sopenharmony_ci	&movups		(&QWP(16*2,$out,$inp),$inout2);
26421cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout2);
26431cb0ef41Sopenharmony_ci
26441cb0ef41Sopenharmony_ci	&jmp		(&label("done"));
26451cb0ef41Sopenharmony_ci
	# Four blocks left: the first two offsets are spilled to frame
	# slots 0 and 1 (their registers are then reused for input blocks
	# 2 and 3), the last two stay in $inout4/$inout5; the checksum is
	# saved in the frame while _aesni_decrypt4 runs. Falls through to
	# "done".
26461cb0ef41Sopenharmony_ci&set_label("four",16);
26471cb0ef41Sopenharmony_ci	&lea		($i1,&DWP(1,$block));
26481cb0ef41Sopenharmony_ci	&lea		($i3,&DWP(3,$block));
26491cb0ef41Sopenharmony_ci	&bsf		($i1,$i1);
26501cb0ef41Sopenharmony_ci	&bsf		($i3,$i3);
26511cb0ef41Sopenharmony_ci	&mov		($key,&DWP($key_off,"esp"));	# restore key
26521cb0ef41Sopenharmony_ci	&shl		($i1,4);
26531cb0ef41Sopenharmony_ci	&shl		($i3,4);
26541cb0ef41Sopenharmony_ci	&movdqu		($inout2,&QWP(0,$l_));
26551cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(0,$l_,$i1));
26561cb0ef41Sopenharmony_ci	&movdqa		($inout4,$inout2);
26571cb0ef41Sopenharmony_ci	&movdqu		($inout5,&QWP(0,$l_,$i3));
26581cb0ef41Sopenharmony_ci
26591cb0ef41Sopenharmony_ci	&pxor		($inout2,$rndkey0);		# ^ last offset_i
26601cb0ef41Sopenharmony_ci	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
26611cb0ef41Sopenharmony_ci	&pxor		($inout3,$inout2);
26621cb0ef41Sopenharmony_ci	&movdqu		($inout1,&QWP(16*1,$inp));
26631cb0ef41Sopenharmony_ci	&pxor		($inout4,$inout3);
26641cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*0,"esp"),$inout2);
26651cb0ef41Sopenharmony_ci	&pxor		($inout5,$inout4);
26661cb0ef41Sopenharmony_ci	&movdqa		(&QWP(16*1,"esp"),$inout3);
26671cb0ef41Sopenharmony_ci	&movdqu		($inout2,&QWP(16*2,$inp));
26681cb0ef41Sopenharmony_ci	&movdqu		($inout3,&QWP(16*3,$inp));
26691cb0ef41Sopenharmony_ci	&mov		($rounds,&DWP(240,$key));
26701cb0ef41Sopenharmony_ci
26711cb0ef41Sopenharmony_ci	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
26721cb0ef41Sopenharmony_ci	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
26731cb0ef41Sopenharmony_ci	&pxor		($inout1,&QWP(16*1,"esp"));
26741cb0ef41Sopenharmony_ci	&pxor		($inout2,$inout4);
26751cb0ef41Sopenharmony_ci	&pxor		($inout3,$inout5);
26761cb0ef41Sopenharmony_ci
26771cb0ef41Sopenharmony_ci	&mov		($out,&DWP($out_off,"esp"));
26781cb0ef41Sopenharmony_ci	&call		("_aesni_decrypt4");
26791cb0ef41Sopenharmony_ci
26801cb0ef41Sopenharmony_ci	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
26811cb0ef41Sopenharmony_ci	&xorps		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
26821cb0ef41Sopenharmony_ci	&xorps		($inout1,&QWP(16*1,"esp"));
26831cb0ef41Sopenharmony_ci	&xorps		($inout2,$inout4);
26841cb0ef41Sopenharmony_ci	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
26851cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout0);		# checksum
26861cb0ef41Sopenharmony_ci	&xorps		($inout3,$inout5);
26871cb0ef41Sopenharmony_ci	&movups		(&QWP(16*1,$out,$inp),$inout1);
26881cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout1);
26891cb0ef41Sopenharmony_ci	&movdqa		($rndkey0,$inout5);		# pass last offset_i
26901cb0ef41Sopenharmony_ci	&movups		(&QWP(16*2,$out,$inp),$inout2);
26911cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout2);
26921cb0ef41Sopenharmony_ci	&movups		(&QWP(16*3,$out,$inp),$inout3);
26931cb0ef41Sopenharmony_ci	&pxor		($rndkey1,$inout3);
26941cb0ef41Sopenharmony_ci
	# Common exit: fetch the caller's %esp saved at $esp_off, zero the
	# six $inout registers and seven 16-byte frame slots, then restore
	# %esp.
26951cb0ef41Sopenharmony_ci&set_label("done");
26961cb0ef41Sopenharmony_ci	&mov	($key,&DWP($esp_off,"esp"));
26971cb0ef41Sopenharmony_ci	&pxor	($inout0,$inout0);		# clear register bank
26981cb0ef41Sopenharmony_ci	&pxor	($inout1,$inout1);
26991cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*0,"esp"),$inout0);	# clear stack
27001cb0ef41Sopenharmony_ci	&pxor	($inout2,$inout2);
27011cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*1,"esp"),$inout0);
27021cb0ef41Sopenharmony_ci	&pxor	($inout3,$inout3);
27031cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*2,"esp"),$inout0);
27041cb0ef41Sopenharmony_ci	&pxor	($inout4,$inout4);
27051cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*3,"esp"),$inout0);
27061cb0ef41Sopenharmony_ci	&pxor	($inout5,$inout5);
27071cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*4,"esp"),$inout0);
27081cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*5,"esp"),$inout0);
27091cb0ef41Sopenharmony_ci	&movdqa	(&QWP(16*6,"esp"),$inout0);
27101cb0ef41Sopenharmony_ci
	# With %esp restored, re-read the two pointer arguments and write
	# back the final offset and checksum, then scrub those registers.
27111cb0ef41Sopenharmony_ci	&lea	("esp",&DWP(0,$key));
27121cb0ef41Sopenharmony_ci	&mov	($rounds,&wparam(5));		# &offset_i
27131cb0ef41Sopenharmony_ci	&mov	($rounds_,&wparam(7));		# &checksum
27141cb0ef41Sopenharmony_ci	&movdqu	(&QWP(0,$rounds),$rndkey0);
27151cb0ef41Sopenharmony_ci	&pxor	($rndkey0,$rndkey0);
27161cb0ef41Sopenharmony_ci	&movdqu	(&QWP(0,$rounds_),$rndkey1);
27171cb0ef41Sopenharmony_ci	&pxor	($rndkey1,$rndkey1);
27181cb0ef41Sopenharmony_ci&function_end("aesni_ocb_decrypt");
27191cb0ef41Sopenharmony_ci}
27201cb0ef41Sopenharmony_ci}
27211cb0ef41Sopenharmony_ci
27221cb0ef41Sopenharmony_ci######################################################################
27231cb0ef41Sopenharmony_ci# void $PREFIX_cbc_encrypt (const void *inp, void *out,
27241cb0ef41Sopenharmony_ci#                           size_t length, const AES_KEY *key,
27251cb0ef41Sopenharmony_ci#                           unsigned char *ivp,const int enc);
27261cb0ef41Sopenharmony_ci&function_begin("${PREFIX}_cbc_encrypt");
27271cb0ef41Sopenharmony_ci	&mov	($inp,&wparam(0));
27281cb0ef41Sopenharmony_ci	&mov	($rounds_,"esp");
27291cb0ef41Sopenharmony_ci	&mov	($out,&wparam(1));
27301cb0ef41Sopenharmony_ci	&sub	($rounds_,24);
27311cb0ef41Sopenharmony_ci	&mov	($len,&wparam(2));
27321cb0ef41Sopenharmony_ci	&and	($rounds_,-16);
27331cb0ef41Sopenharmony_ci	&mov	($key,&wparam(3));
27341cb0ef41Sopenharmony_ci	&mov	($key_,&wparam(4));
27351cb0ef41Sopenharmony_ci	&test	($len,$len);
27361cb0ef41Sopenharmony_ci	&jz	(&label("cbc_abort"));
27371cb0ef41Sopenharmony_ci
27381cb0ef41Sopenharmony_ci	&cmp	(&wparam(5),0);
27391cb0ef41Sopenharmony_ci	&xchg	($rounds_,"esp");		# alloca
27401cb0ef41Sopenharmony_ci	&movups	($ivec,&QWP(0,$key_));		# load IV
27411cb0ef41Sopenharmony_ci	&mov	($rounds,&DWP(240,$key));
27421cb0ef41Sopenharmony_ci	&mov	($key_,$key);			# backup $key
27431cb0ef41Sopenharmony_ci	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
27441cb0ef41Sopenharmony_ci	&mov	($rounds_,$rounds);		# backup $rounds
27451cb0ef41Sopenharmony_ci	&je	(&label("cbc_decrypt"));
27461cb0ef41Sopenharmony_ci
27471cb0ef41Sopenharmony_ci	&movaps	($inout0,$ivec);
27481cb0ef41Sopenharmony_ci	&cmp	($len,16);
27491cb0ef41Sopenharmony_ci	&jb	(&label("cbc_enc_tail"));
27501cb0ef41Sopenharmony_ci	&sub	($len,16);
27511cb0ef41Sopenharmony_ci	&jmp	(&label("cbc_enc_loop"));
27521cb0ef41Sopenharmony_ci
27531cb0ef41Sopenharmony_ci&set_label("cbc_enc_loop",16);
27541cb0ef41Sopenharmony_ci	&movups	($ivec,&QWP(0,$inp));		# input actually
27551cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(16,$inp));
27561cb0ef41Sopenharmony_ci	if ($inline)
27571cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
27581cb0ef41Sopenharmony_ci	else
27591cb0ef41Sopenharmony_ci	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
27601cb0ef41Sopenharmony_ci	&mov	($rounds,$rounds_);	# restore $rounds
27611cb0ef41Sopenharmony_ci	&mov	($key,$key_);		# restore $key
27621cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);	# store output
27631cb0ef41Sopenharmony_ci	&lea	($out,&DWP(16,$out));
27641cb0ef41Sopenharmony_ci	&sub	($len,16);
27651cb0ef41Sopenharmony_ci	&jnc	(&label("cbc_enc_loop"));
27661cb0ef41Sopenharmony_ci	&add	($len,16);
27671cb0ef41Sopenharmony_ci	&jnz	(&label("cbc_enc_tail"));
27681cb0ef41Sopenharmony_ci	&movaps	($ivec,$inout0);
27691cb0ef41Sopenharmony_ci	&pxor	($inout0,$inout0);
27701cb0ef41Sopenharmony_ci	&jmp	(&label("cbc_ret"));
27711cb0ef41Sopenharmony_ci
27721cb0ef41Sopenharmony_ci&set_label("cbc_enc_tail");
27731cb0ef41Sopenharmony_ci	&mov	("ecx",$len);		# zaps $rounds
27741cb0ef41Sopenharmony_ci	&data_word(0xA4F3F689);		# rep movsb
27751cb0ef41Sopenharmony_ci	&mov	("ecx",16);		# zero tail
27761cb0ef41Sopenharmony_ci	&sub	("ecx",$len);
27771cb0ef41Sopenharmony_ci	&xor	("eax","eax");		# zaps $len
27781cb0ef41Sopenharmony_ci	&data_word(0xAAF3F689);		# rep stosb
27791cb0ef41Sopenharmony_ci	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
27801cb0ef41Sopenharmony_ci	&mov	($rounds,$rounds_);	# restore $rounds
27811cb0ef41Sopenharmony_ci	&mov	($inp,$out);		# $inp and $out are the same
27821cb0ef41Sopenharmony_ci	&mov	($key,$key_);		# restore $key
27831cb0ef41Sopenharmony_ci	&jmp	(&label("cbc_enc_loop"));
27841cb0ef41Sopenharmony_ci######################################################################
27851cb0ef41Sopenharmony_ci&set_label("cbc_decrypt",16);
27861cb0ef41Sopenharmony_ci	&cmp	($len,0x50);
27871cb0ef41Sopenharmony_ci	&jbe	(&label("cbc_dec_tail"));
27881cb0ef41Sopenharmony_ci	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
27891cb0ef41Sopenharmony_ci	&sub	($len,0x50);
27901cb0ef41Sopenharmony_ci	&jmp	(&label("cbc_dec_loop6_enter"));
27911cb0ef41Sopenharmony_ci
27921cb0ef41Sopenharmony_ci&set_label("cbc_dec_loop6",16);
27931cb0ef41Sopenharmony_ci	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
27941cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout5);
27951cb0ef41Sopenharmony_ci	&lea	($out,&DWP(0x10,$out));
27961cb0ef41Sopenharmony_ci&set_label("cbc_dec_loop6_enter");
27971cb0ef41Sopenharmony_ci	&movdqu	($inout0,&QWP(0,$inp));
27981cb0ef41Sopenharmony_ci	&movdqu	($inout1,&QWP(0x10,$inp));
27991cb0ef41Sopenharmony_ci	&movdqu	($inout2,&QWP(0x20,$inp));
28001cb0ef41Sopenharmony_ci	&movdqu	($inout3,&QWP(0x30,$inp));
28011cb0ef41Sopenharmony_ci	&movdqu	($inout4,&QWP(0x40,$inp));
28021cb0ef41Sopenharmony_ci	&movdqu	($inout5,&QWP(0x50,$inp));
28031cb0ef41Sopenharmony_ci
28041cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt6");
28051cb0ef41Sopenharmony_ci
28061cb0ef41Sopenharmony_ci	&movups	($rndkey1,&QWP(0,$inp));
28071cb0ef41Sopenharmony_ci	&movups	($rndkey0,&QWP(0x10,$inp));
28081cb0ef41Sopenharmony_ci	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
28091cb0ef41Sopenharmony_ci	&xorps	($inout1,$rndkey1);
28101cb0ef41Sopenharmony_ci	&movups	($rndkey1,&QWP(0x20,$inp));
28111cb0ef41Sopenharmony_ci	&xorps	($inout2,$rndkey0);
28121cb0ef41Sopenharmony_ci	&movups	($rndkey0,&QWP(0x30,$inp));
28131cb0ef41Sopenharmony_ci	&xorps	($inout3,$rndkey1);
28141cb0ef41Sopenharmony_ci	&movups	($rndkey1,&QWP(0x40,$inp));
28151cb0ef41Sopenharmony_ci	&xorps	($inout4,$rndkey0);
28161cb0ef41Sopenharmony_ci	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
28171cb0ef41Sopenharmony_ci	&xorps	($inout5,$rndkey1);
28181cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
28191cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
28201cb0ef41Sopenharmony_ci	&lea	($inp,&DWP(0x60,$inp));
28211cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
28221cb0ef41Sopenharmony_ci	&mov	($rounds,$rounds_);		# restore $rounds
28231cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
28241cb0ef41Sopenharmony_ci	&mov	($key,$key_);			# restore $key
28251cb0ef41Sopenharmony_ci	&movups	(&QWP(0x40,$out),$inout4);
28261cb0ef41Sopenharmony_ci	&lea	($out,&DWP(0x50,$out));
28271cb0ef41Sopenharmony_ci	&sub	($len,0x60);
28281cb0ef41Sopenharmony_ci	&ja	(&label("cbc_dec_loop6"));
28291cb0ef41Sopenharmony_ci
28301cb0ef41Sopenharmony_ci	&movaps	($inout0,$inout5);
28311cb0ef41Sopenharmony_ci	&movaps	($ivec,$rndkey0);
28321cb0ef41Sopenharmony_ci	&add	($len,0x50);
28331cb0ef41Sopenharmony_ci	&jle	(&label("cbc_dec_clear_tail_collected"));
28341cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
28351cb0ef41Sopenharmony_ci	&lea	($out,&DWP(0x10,$out));
28361cb0ef41Sopenharmony_ci&set_label("cbc_dec_tail");
28371cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));
28381cb0ef41Sopenharmony_ci	&movaps	($in0,$inout0);
28391cb0ef41Sopenharmony_ci	&cmp	($len,0x10);
28401cb0ef41Sopenharmony_ci	&jbe	(&label("cbc_dec_one"));
28411cb0ef41Sopenharmony_ci
28421cb0ef41Sopenharmony_ci	&movups	($inout1,&QWP(0x10,$inp));
28431cb0ef41Sopenharmony_ci	&movaps	($in1,$inout1);
28441cb0ef41Sopenharmony_ci	&cmp	($len,0x20);
28451cb0ef41Sopenharmony_ci	&jbe	(&label("cbc_dec_two"));
28461cb0ef41Sopenharmony_ci
28471cb0ef41Sopenharmony_ci	&movups	($inout2,&QWP(0x20,$inp));
28481cb0ef41Sopenharmony_ci	&cmp	($len,0x30);
28491cb0ef41Sopenharmony_ci	&jbe	(&label("cbc_dec_three"));
28501cb0ef41Sopenharmony_ci
28511cb0ef41Sopenharmony_ci	&movups	($inout3,&QWP(0x30,$inp));
28521cb0ef41Sopenharmony_ci	&cmp	($len,0x40);
28531cb0ef41Sopenharmony_ci	&jbe	(&label("cbc_dec_four"));
28541cb0ef41Sopenharmony_ci
28551cb0ef41Sopenharmony_ci	&movups	($inout4,&QWP(0x40,$inp));
28561cb0ef41Sopenharmony_ci	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
28571cb0ef41Sopenharmony_ci	&movups	($inout0,&QWP(0,$inp));
28581cb0ef41Sopenharmony_ci	&xorps	($inout5,$inout5);
28591cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt6");
28601cb0ef41Sopenharmony_ci	&movups	($rndkey1,&QWP(0,$inp));
28611cb0ef41Sopenharmony_ci	&movups	($rndkey0,&QWP(0x10,$inp));
28621cb0ef41Sopenharmony_ci	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
28631cb0ef41Sopenharmony_ci	&xorps	($inout1,$rndkey1);
28641cb0ef41Sopenharmony_ci	&movups	($rndkey1,&QWP(0x20,$inp));
28651cb0ef41Sopenharmony_ci	&xorps	($inout2,$rndkey0);
28661cb0ef41Sopenharmony_ci	&movups	($rndkey0,&QWP(0x30,$inp));
28671cb0ef41Sopenharmony_ci	&xorps	($inout3,$rndkey1);
28681cb0ef41Sopenharmony_ci	&movups	($ivec,&QWP(0x40,$inp));	# IV
28691cb0ef41Sopenharmony_ci	&xorps	($inout4,$rndkey0);
28701cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
28711cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
28721cb0ef41Sopenharmony_ci	&pxor	($inout1,$inout1);
28731cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
28741cb0ef41Sopenharmony_ci	&pxor	($inout2,$inout2);
28751cb0ef41Sopenharmony_ci	&movups	(&QWP(0x30,$out),$inout3);
28761cb0ef41Sopenharmony_ci	&pxor	($inout3,$inout3);
28771cb0ef41Sopenharmony_ci	&lea	($out,&DWP(0x40,$out));
28781cb0ef41Sopenharmony_ci	&movaps	($inout0,$inout4);
28791cb0ef41Sopenharmony_ci	&pxor	($inout4,$inout4);
28801cb0ef41Sopenharmony_ci	&sub	($len,0x50);
28811cb0ef41Sopenharmony_ci	&jmp	(&label("cbc_dec_tail_collected"));
28821cb0ef41Sopenharmony_ci
28831cb0ef41Sopenharmony_ci&set_label("cbc_dec_one",16);
28841cb0ef41Sopenharmony_ci	if ($inline)
28851cb0ef41Sopenharmony_ci	{   &aesni_inline_generate1("dec");	}
28861cb0ef41Sopenharmony_ci	else
28871cb0ef41Sopenharmony_ci	{   &call	("_aesni_decrypt1");	}
28881cb0ef41Sopenharmony_ci	&xorps	($inout0,$ivec);
28891cb0ef41Sopenharmony_ci	&movaps	($ivec,$in0);
28901cb0ef41Sopenharmony_ci	&sub	($len,0x10);
28911cb0ef41Sopenharmony_ci	&jmp	(&label("cbc_dec_tail_collected"));
28921cb0ef41Sopenharmony_ci
28931cb0ef41Sopenharmony_ci&set_label("cbc_dec_two",16);
28941cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt2");
28951cb0ef41Sopenharmony_ci	&xorps	($inout0,$ivec);
28961cb0ef41Sopenharmony_ci	&xorps	($inout1,$in0);
28971cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
28981cb0ef41Sopenharmony_ci	&movaps	($inout0,$inout1);
28991cb0ef41Sopenharmony_ci	&pxor	($inout1,$inout1);
29001cb0ef41Sopenharmony_ci	&lea	($out,&DWP(0x10,$out));
29011cb0ef41Sopenharmony_ci	&movaps	($ivec,$in1);
29021cb0ef41Sopenharmony_ci	&sub	($len,0x20);
29031cb0ef41Sopenharmony_ci	&jmp	(&label("cbc_dec_tail_collected"));
29041cb0ef41Sopenharmony_ci
29051cb0ef41Sopenharmony_ci&set_label("cbc_dec_three",16);
29061cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt3");
29071cb0ef41Sopenharmony_ci	&xorps	($inout0,$ivec);
29081cb0ef41Sopenharmony_ci	&xorps	($inout1,$in0);
29091cb0ef41Sopenharmony_ci	&xorps	($inout2,$in1);
29101cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
29111cb0ef41Sopenharmony_ci	&movaps	($inout0,$inout2);
29121cb0ef41Sopenharmony_ci	&pxor	($inout2,$inout2);
29131cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
29141cb0ef41Sopenharmony_ci	&pxor	($inout1,$inout1);
29151cb0ef41Sopenharmony_ci	&lea	($out,&DWP(0x20,$out));
29161cb0ef41Sopenharmony_ci	&movups	($ivec,&QWP(0x20,$inp));
29171cb0ef41Sopenharmony_ci	&sub	($len,0x30);
29181cb0ef41Sopenharmony_ci	&jmp	(&label("cbc_dec_tail_collected"));
29191cb0ef41Sopenharmony_ci
29201cb0ef41Sopenharmony_ci&set_label("cbc_dec_four",16);
29211cb0ef41Sopenharmony_ci	&call	("_aesni_decrypt4");
29221cb0ef41Sopenharmony_ci	&movups	($rndkey1,&QWP(0x10,$inp));
29231cb0ef41Sopenharmony_ci	&movups	($rndkey0,&QWP(0x20,$inp));
29241cb0ef41Sopenharmony_ci	&xorps	($inout0,$ivec);
29251cb0ef41Sopenharmony_ci	&movups	($ivec,&QWP(0x30,$inp));
29261cb0ef41Sopenharmony_ci	&xorps	($inout1,$in0);
29271cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
29281cb0ef41Sopenharmony_ci	&xorps	($inout2,$rndkey1);
29291cb0ef41Sopenharmony_ci	&movups	(&QWP(0x10,$out),$inout1);
29301cb0ef41Sopenharmony_ci	&pxor	($inout1,$inout1);
29311cb0ef41Sopenharmony_ci	&xorps	($inout3,$rndkey0);
29321cb0ef41Sopenharmony_ci	&movups	(&QWP(0x20,$out),$inout2);
29331cb0ef41Sopenharmony_ci	&pxor	($inout2,$inout2);
29341cb0ef41Sopenharmony_ci	&lea	($out,&DWP(0x30,$out));
29351cb0ef41Sopenharmony_ci	&movaps	($inout0,$inout3);
29361cb0ef41Sopenharmony_ci	&pxor	($inout3,$inout3);
29371cb0ef41Sopenharmony_ci	&sub	($len,0x40);
29381cb0ef41Sopenharmony_ci	&jmp	(&label("cbc_dec_tail_collected"));
29391cb0ef41Sopenharmony_ci
29401cb0ef41Sopenharmony_ci&set_label("cbc_dec_clear_tail_collected",16);
29411cb0ef41Sopenharmony_ci	&pxor	($inout1,$inout1);
29421cb0ef41Sopenharmony_ci	&pxor	($inout2,$inout2);
29431cb0ef41Sopenharmony_ci	&pxor	($inout3,$inout3);
29441cb0ef41Sopenharmony_ci	&pxor	($inout4,$inout4);
29451cb0ef41Sopenharmony_ci&set_label("cbc_dec_tail_collected");
29461cb0ef41Sopenharmony_ci	&and	($len,15);
29471cb0ef41Sopenharmony_ci	&jnz	(&label("cbc_dec_tail_partial"));
29481cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$out),$inout0);
29491cb0ef41Sopenharmony_ci	&pxor	($rndkey0,$rndkey0);
29501cb0ef41Sopenharmony_ci	&jmp	(&label("cbc_ret"));
29511cb0ef41Sopenharmony_ci
29521cb0ef41Sopenharmony_ci&set_label("cbc_dec_tail_partial",16);
29531cb0ef41Sopenharmony_ci	&movaps	(&QWP(0,"esp"),$inout0);
29541cb0ef41Sopenharmony_ci	&pxor	($rndkey0,$rndkey0);
29551cb0ef41Sopenharmony_ci	&mov	("ecx",16);
29561cb0ef41Sopenharmony_ci	&mov	($inp,"esp");
29571cb0ef41Sopenharmony_ci	&sub	("ecx",$len);
29581cb0ef41Sopenharmony_ci	&data_word(0xA4F3F689);		# rep movsb
29591cb0ef41Sopenharmony_ci	&movdqa	(&QWP(0,"esp"),$inout0);
29601cb0ef41Sopenharmony_ci
29611cb0ef41Sopenharmony_ci&set_label("cbc_ret");
29621cb0ef41Sopenharmony_ci	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
29631cb0ef41Sopenharmony_ci	&mov	($key_,&wparam(4));
29641cb0ef41Sopenharmony_ci	&pxor	($inout0,$inout0);
29651cb0ef41Sopenharmony_ci	&pxor	($rndkey1,$rndkey1);
29661cb0ef41Sopenharmony_ci	&movups	(&QWP(0,$key_),$ivec);	# output IV
29671cb0ef41Sopenharmony_ci	&pxor	($ivec,$ivec);
29681cb0ef41Sopenharmony_ci&set_label("cbc_abort");
29691cb0ef41Sopenharmony_ci&function_end("${PREFIX}_cbc_encrypt");
29701cb0ef41Sopenharmony_ci
29711cb0ef41Sopenharmony_ci######################################################################
29721cb0ef41Sopenharmony_ci# Mechanical port from aesni-x86_64.pl.
29731cb0ef41Sopenharmony_ci#
29741cb0ef41Sopenharmony_ci# _aesni_set_encrypt_key is private interface,
29751cb0ef41Sopenharmony_ci# input:
29761cb0ef41Sopenharmony_ci#	"eax"	const unsigned char *userKey
29771cb0ef41Sopenharmony_ci#	$rounds	int bits
29781cb0ef41Sopenharmony_ci#	$key	AES_KEY *key
29791cb0ef41Sopenharmony_ci# output:
29801cb0ef41Sopenharmony_ci#	"eax"	return code
29811cb0ef41Sopenharmony_ci#	$rounds	rounds-1 (9, 11 or 13), also stored in the key schedule
29821cb0ef41Sopenharmony_ci
# _aesni_set_encrypt_key -- expand *userKey into the AES encryption key
# schedule at *key.  Private helper with a register-based interface:
#   in:  "eax" = userKey, $rounds = key size in bits, $key = AES_KEY pointer
#   out: "eax" = 0 on success, -1 if either pointer is NULL, -2 if the key
#        size is not 128, 192 or 256; on success $rounds = 9, 11 or 13 and
#        the same value is stored at byte offset 240 of *key.
# Each key size has two code paths: one built on aeskeygenassist and an
# "_alt" one built on pshufb+aesenclast, selected from OPENSSL_ia32cap_P.
# xmm0-xmm5 are wiped before a successful return; ebp/ebx are preserved.
29831cb0ef41Sopenharmony_ci&function_begin_B("_aesni_set_encrypt_key");
29841cb0ef41Sopenharmony_ci	&push	("ebp");
29851cb0ef41Sopenharmony_ci	&push	("ebx");
# Reject NULL userKey ("eax") or NULL key schedule pointer ($key).
29861cb0ef41Sopenharmony_ci	&test	("eax","eax");
29871cb0ef41Sopenharmony_ci	&jz	(&label("bad_pointer"));
29881cb0ef41Sopenharmony_ci	&test	($key,$key);
29891cb0ef41Sopenharmony_ci	&jz	(&label("bad_pointer"));
29901cb0ef41Sopenharmony_ci
# Position-independent address of key_const: the call pushes the address
# of "pic", which is popped into ebx and adjusted by key_const-pic.
29911cb0ef41Sopenharmony_ci	&call	(&label("pic"));
29921cb0ef41Sopenharmony_ci&set_label("pic");
29931cb0ef41Sopenharmony_ci	&blindpop("ebx");
29941cb0ef41Sopenharmony_ci	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
29951cb0ef41Sopenharmony_ci
# picmeup is a perlasm framework helper not visible here; presumably it
# leaves the address of OPENSSL_ia32cap_P in ebp -- TODO confirm.
29961cb0ef41Sopenharmony_ci	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
29971cb0ef41Sopenharmony_ci	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
29981cb0ef41Sopenharmony_ci	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
29991cb0ef41Sopenharmony_ci	&mov	("ebp",&DWP(4,"ebp"));
# $key is biased by 16 so that round key 0 is written at -16($key).
30001cb0ef41Sopenharmony_ci	&lea	($key,&DWP(16,$key));
30011cb0ef41Sopenharmony_ci	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
# Dispatch on key size in bits; anything but 128/192/256 is an error.
30021cb0ef41Sopenharmony_ci	&cmp	($rounds,256);
30031cb0ef41Sopenharmony_ci	&je	(&label("14rounds"));
30041cb0ef41Sopenharmony_ci	&cmp	($rounds,192);
30051cb0ef41Sopenharmony_ci	&je	(&label("12rounds"));
30061cb0ef41Sopenharmony_ci	&cmp	($rounds,128);
30071cb0ef41Sopenharmony_ci	&jne	(&label("bad_keybits"));
30081cb0ef41Sopenharmony_ci
# ---- 128-bit key ----
# The "_alt" path is taken only when, of the two masked capability bits,
# bit 28 is set and bit 11 is clear.
30091cb0ef41Sopenharmony_ci&set_label("10rounds",16);
30101cb0ef41Sopenharmony_ci	&cmp		("ebp",1<<28);
30111cb0ef41Sopenharmony_ci	&je		(&label("10rounds_alt"));
30121cb0ef41Sopenharmony_ci
# Each key_128 call stores the current round key, advances $key by 16 and
# derives the next round key in xmm0.  The first call enters at
# key_128_cold because round 0 has just been stored explicitly.
30131cb0ef41Sopenharmony_ci	&mov		($rounds,9);
30141cb0ef41Sopenharmony_ci	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
30151cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
30161cb0ef41Sopenharmony_ci	&call		(&label("key_128_cold"));
30171cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
30181cb0ef41Sopenharmony_ci	&call		(&label("key_128"));
30191cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
30201cb0ef41Sopenharmony_ci	&call		(&label("key_128"));
30211cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
30221cb0ef41Sopenharmony_ci	&call		(&label("key_128"));
30231cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
30241cb0ef41Sopenharmony_ci	&call		(&label("key_128"));
30251cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
30261cb0ef41Sopenharmony_ci	&call		(&label("key_128"));
30271cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
30281cb0ef41Sopenharmony_ci	&call		(&label("key_128"));
30291cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
30301cb0ef41Sopenharmony_ci	&call		(&label("key_128"));
30311cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
30321cb0ef41Sopenharmony_ci	&call		(&label("key_128"));
30331cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
30341cb0ef41Sopenharmony_ci	&call		(&label("key_128"));
# $key is now schedule+160: store the last round key there and the round
# count 80 bytes further on, i.e. at byte offset 240 of the schedule.
30351cb0ef41Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
30361cb0ef41Sopenharmony_ci	&mov		(&DWP(80,$key),$rounds);
30371cb0ef41Sopenharmony_ci
30381cb0ef41Sopenharmony_ci	&jmp	(&label("good_key"));
30391cb0ef41Sopenharmony_ci
# Local subroutine: store xmm0, advance $key, then compute the next key.
# With the low dword of xmm4 kept at 0, the two shufps/xorps pairs turn
# xmm0 = (a0,a1,a2,a3) into (a0, a0^a1, a0^a1^a2, a0^a1^a2^a3); the top
# dword of the aeskeygenassist result is then broadcast and XORed in.
30401cb0ef41Sopenharmony_ci&set_label("key_128",16);
30411cb0ef41Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
30421cb0ef41Sopenharmony_ci	&lea		($key,&DWP(16,$key));
30431cb0ef41Sopenharmony_ci&set_label("key_128_cold");
30441cb0ef41Sopenharmony_ci	&shufps		("xmm4","xmm0",0b00010000);
30451cb0ef41Sopenharmony_ci	&xorps		("xmm0","xmm4");
30461cb0ef41Sopenharmony_ci	&shufps		("xmm4","xmm0",0b10001100);
30471cb0ef41Sopenharmony_ci	&xorps		("xmm0","xmm4");
30481cb0ef41Sopenharmony_ci	&shufps		("xmm1","xmm1",0b11111111);	# critical path
30491cb0ef41Sopenharmony_ci	&xorps		("xmm0","xmm1");
30501cb0ef41Sopenharmony_ci	&ret();
30511cb0ef41Sopenharmony_ci
# 128-bit "_alt" path: xmm5 = byte-shuffle mask at key_const+0x00,
# xmm4 = per-round constant starting at key_const+0x20 and doubled with
# pslld every round, xmm2 = copy of the previous round key.
30521cb0ef41Sopenharmony_ci&set_label("10rounds_alt",16);
30531cb0ef41Sopenharmony_ci	&movdqa		("xmm5",&QWP(0x00,"ebx"));
30541cb0ef41Sopenharmony_ci	&mov		($rounds,8);
30551cb0ef41Sopenharmony_ci	&movdqa		("xmm4",&QWP(0x20,"ebx"));
30561cb0ef41Sopenharmony_ci	&movdqa		("xmm2","xmm0");
30571cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16,$key),"xmm0");
30581cb0ef41Sopenharmony_ci
# Eight iterations produce round keys 1..8.
30591cb0ef41Sopenharmony_ci&set_label("loop_key128");
30601cb0ef41Sopenharmony_ci	&pshufb		("xmm0","xmm5");
30611cb0ef41Sopenharmony_ci	&aesenclast	("xmm0","xmm4");
30621cb0ef41Sopenharmony_ci	&pslld		("xmm4",1);
30631cb0ef41Sopenharmony_ci	&lea		($key,&DWP(16,$key));
30641cb0ef41Sopenharmony_ci
# xmm2 = x ^ (x<<32) ^ (x<<64) ^ (x<<96) for the previous round key x.
30651cb0ef41Sopenharmony_ci	&movdqa		("xmm3","xmm2");
30661cb0ef41Sopenharmony_ci	&pslldq		("xmm2",4);
30671cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm2");
30681cb0ef41Sopenharmony_ci	&pslldq		("xmm2",4);
30691cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm2");
30701cb0ef41Sopenharmony_ci	&pslldq		("xmm2",4);
30711cb0ef41Sopenharmony_ci	&pxor		("xmm2","xmm3");
30721cb0ef41Sopenharmony_ci
30731cb0ef41Sopenharmony_ci	&pxor		("xmm0","xmm2");
30741cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16,$key),"xmm0");
30751cb0ef41Sopenharmony_ci	&movdqa		("xmm2","xmm0");
30761cb0ef41Sopenharmony_ci
30771cb0ef41Sopenharmony_ci	&dec		($rounds);
30781cb0ef41Sopenharmony_ci	&jnz		(&label("loop_key128"));
30791cb0ef41Sopenharmony_ci
# The last two rounds are unrolled: the constant is reloaded from
# key_const+0x30 (0x1b) and doubled once more for the final round.
30801cb0ef41Sopenharmony_ci	&movdqa		("xmm4",&QWP(0x30,"ebx"));
30811cb0ef41Sopenharmony_ci
30821cb0ef41Sopenharmony_ci	&pshufb		("xmm0","xmm5");
30831cb0ef41Sopenharmony_ci	&aesenclast	("xmm0","xmm4");
30841cb0ef41Sopenharmony_ci	&pslld		("xmm4",1);
30851cb0ef41Sopenharmony_ci
30861cb0ef41Sopenharmony_ci	&movdqa		("xmm3","xmm2");
30871cb0ef41Sopenharmony_ci	&pslldq		("xmm2",4);
30881cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm2");
30891cb0ef41Sopenharmony_ci	&pslldq		("xmm2",4);
30901cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm2");
30911cb0ef41Sopenharmony_ci	&pslldq		("xmm2",4);
30921cb0ef41Sopenharmony_ci	&pxor		("xmm2","xmm3");
30931cb0ef41Sopenharmony_ci
30941cb0ef41Sopenharmony_ci	&pxor		("xmm0","xmm2");
30951cb0ef41Sopenharmony_ci	&movdqu		(&QWP(0,$key),"xmm0");
30961cb0ef41Sopenharmony_ci
30971cb0ef41Sopenharmony_ci	&movdqa		("xmm2","xmm0");
30981cb0ef41Sopenharmony_ci	&pshufb		("xmm0","xmm5");
30991cb0ef41Sopenharmony_ci	&aesenclast	("xmm0","xmm4");
31001cb0ef41Sopenharmony_ci
31011cb0ef41Sopenharmony_ci	&movdqa		("xmm3","xmm2");
31021cb0ef41Sopenharmony_ci	&pslldq		("xmm2",4);
31031cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm2");
31041cb0ef41Sopenharmony_ci	&pslldq		("xmm2",4);
31051cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm2");
31061cb0ef41Sopenharmony_ci	&pslldq		("xmm2",4);
31071cb0ef41Sopenharmony_ci	&pxor		("xmm2","xmm3");
31081cb0ef41Sopenharmony_ci
31091cb0ef41Sopenharmony_ci	&pxor		("xmm0","xmm2");
31101cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16,$key),"xmm0");
31111cb0ef41Sopenharmony_ci
# $key is schedule+144 here, so 96($key) is again byte offset 240.
31121cb0ef41Sopenharmony_ci	&mov		($rounds,9);
31131cb0ef41Sopenharmony_ci	&mov		(&DWP(96,$key),$rounds);
31141cb0ef41Sopenharmony_ci
31151cb0ef41Sopenharmony_ci	&jmp	(&label("good_key"));
31161cb0ef41Sopenharmony_ci
# ---- 192-bit key ----
31171cb0ef41Sopenharmony_ci&set_label("12rounds",16);
31181cb0ef41Sopenharmony_ci	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
31191cb0ef41Sopenharmony_ci	&cmp		("ebp",1<<28);
31201cb0ef41Sopenharmony_ci	&je		(&label("12rounds_alt"));
31211cb0ef41Sopenharmony_ci
# key_192a stores one 16-byte round key per call, key_192b stores two;
# both then fall into the shared update code at key_192b_warm.
31221cb0ef41Sopenharmony_ci	&mov		($rounds,11);
31231cb0ef41Sopenharmony_ci	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
31241cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
31251cb0ef41Sopenharmony_ci	&call		(&label("key_192a_cold"));
31261cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
31271cb0ef41Sopenharmony_ci	&call		(&label("key_192b"));
31281cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
31291cb0ef41Sopenharmony_ci	&call		(&label("key_192a"));
31301cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
31311cb0ef41Sopenharmony_ci	&call		(&label("key_192b"));
31321cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
31331cb0ef41Sopenharmony_ci	&call		(&label("key_192a"));
31341cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
31351cb0ef41Sopenharmony_ci	&call		(&label("key_192b"));
31361cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
31371cb0ef41Sopenharmony_ci	&call		(&label("key_192a"));
31381cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
31391cb0ef41Sopenharmony_ci	&call		(&label("key_192b"));
# $key is schedule+192 here: last round key at 0($key), round count at
# 48($key), i.e. byte offset 240.
31401cb0ef41Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
31411cb0ef41Sopenharmony_ci	&mov		(&DWP(48,$key),$rounds);
31421cb0ef41Sopenharmony_ci
31431cb0ef41Sopenharmony_ci	&jmp	(&label("good_key"));
31441cb0ef41Sopenharmony_ci
# Local subroutine: store xmm0 as one round key and advance $key by 16.
# key_192a_cold skips the store; both save xmm2 in xmm5 for key_192b.
31451cb0ef41Sopenharmony_ci&set_label("key_192a",16);
31461cb0ef41Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
31471cb0ef41Sopenharmony_ci	&lea		($key,&DWP(16,$key));
31481cb0ef41Sopenharmony_ci&set_label("key_192a_cold",16);
31491cb0ef41Sopenharmony_ci	&movaps		("xmm5","xmm2");
# Shared update: xmm0 receives the running-XOR treatment (as in key_128)
# plus dword 1 of the aeskeygenassist result; xmm2 is updated from its
# own shifted copy and the new top dword of xmm0.
31501cb0ef41Sopenharmony_ci&set_label("key_192b_warm");
31511cb0ef41Sopenharmony_ci	&shufps		("xmm4","xmm0",0b00010000);
31521cb0ef41Sopenharmony_ci	&movdqa		("xmm3","xmm2");
31531cb0ef41Sopenharmony_ci	&xorps		("xmm0","xmm4");
31541cb0ef41Sopenharmony_ci	&shufps		("xmm4","xmm0",0b10001100);
31551cb0ef41Sopenharmony_ci	&pslldq		("xmm3",4);
31561cb0ef41Sopenharmony_ci	&xorps		("xmm0","xmm4");
31571cb0ef41Sopenharmony_ci	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
31581cb0ef41Sopenharmony_ci	&pxor		("xmm2","xmm3");
31591cb0ef41Sopenharmony_ci	&pxor		("xmm0","xmm1");
31601cb0ef41Sopenharmony_ci	&pshufd		("xmm3","xmm0",0b11111111);
31611cb0ef41Sopenharmony_ci	&pxor		("xmm2","xmm3");
31621cb0ef41Sopenharmony_ci	&ret();
31631cb0ef41Sopenharmony_ci
# Local subroutine: assemble two 16-byte round keys from xmm5, xmm0 and
# xmm2, store them, advance $key by 32, then continue at key_192b_warm.
31641cb0ef41Sopenharmony_ci&set_label("key_192b",16);
31651cb0ef41Sopenharmony_ci	&movaps		("xmm3","xmm0");
31661cb0ef41Sopenharmony_ci	&shufps		("xmm5","xmm0",0b01000100);
31671cb0ef41Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm5");
31681cb0ef41Sopenharmony_ci	&shufps		("xmm3","xmm2",0b01001110);
31691cb0ef41Sopenharmony_ci	&$movekey	(&QWP(16,$key),"xmm3");
31701cb0ef41Sopenharmony_ci	&lea		($key,&DWP(32,$key));
31711cb0ef41Sopenharmony_ci	&jmp		(&label("key_192b_warm"));
31721cb0ef41Sopenharmony_ci
# 192-bit "_alt" path: uses the shuffle mask at key_const+0x10 and emits
# 24 bytes of schedule per iteration (8 iterations).
31731cb0ef41Sopenharmony_ci&set_label("12rounds_alt",16);
31741cb0ef41Sopenharmony_ci	&movdqa		("xmm5",&QWP(0x10,"ebx"));
31751cb0ef41Sopenharmony_ci	&movdqa		("xmm4",&QWP(0x20,"ebx"));
31761cb0ef41Sopenharmony_ci	&mov		($rounds,8);
31771cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16,$key),"xmm0");
31781cb0ef41Sopenharmony_ci
31791cb0ef41Sopenharmony_ci&set_label("loop_key192");
# Store the 8-byte tail (xmm2) first; xmm1 keeps an unmodified copy.
31801cb0ef41Sopenharmony_ci	&movq		(&QWP(0,$key),"xmm2");
31811cb0ef41Sopenharmony_ci	&movdqa		("xmm1","xmm2");
31821cb0ef41Sopenharmony_ci	&pshufb		("xmm2","xmm5");
31831cb0ef41Sopenharmony_ci	&aesenclast	("xmm2","xmm4");
31841cb0ef41Sopenharmony_ci	&pslld		("xmm4",1);
31851cb0ef41Sopenharmony_ci	&lea		($key,&DWP(24,$key));
31861cb0ef41Sopenharmony_ci
# xmm0 = x ^ (x<<32) ^ (x<<64) ^ (x<<96) for the previous 16-byte part x.
31871cb0ef41Sopenharmony_ci	&movdqa		("xmm3","xmm0");
31881cb0ef41Sopenharmony_ci	&pslldq		("xmm0",4);
31891cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm0");
31901cb0ef41Sopenharmony_ci	&pslldq		("xmm0",4);
31911cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm0");
31921cb0ef41Sopenharmony_ci	&pslldq		("xmm0",4);
31931cb0ef41Sopenharmony_ci	&pxor		("xmm0","xmm3");
31941cb0ef41Sopenharmony_ci
31951cb0ef41Sopenharmony_ci	&pshufd		("xmm3","xmm0",0xff);
31961cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm1");
31971cb0ef41Sopenharmony_ci	&pslldq		("xmm1",4);
31981cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm1");
31991cb0ef41Sopenharmony_ci
32001cb0ef41Sopenharmony_ci	&pxor		("xmm0","xmm2");
32011cb0ef41Sopenharmony_ci	&pxor		("xmm2","xmm3");
32021cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16,$key),"xmm0");
32031cb0ef41Sopenharmony_ci
32041cb0ef41Sopenharmony_ci	&dec		($rounds);
32051cb0ef41Sopenharmony_ci	&jnz		(&label("loop_key192"));
32061cb0ef41Sopenharmony_ci
# $key is schedule+208 here, so 32($key) is byte offset 240.
32071cb0ef41Sopenharmony_ci	&mov	($rounds,11);
32081cb0ef41Sopenharmony_ci	&mov	(&DWP(32,$key),$rounds);
32091cb0ef41Sopenharmony_ci
32101cb0ef41Sopenharmony_ci	&jmp	(&label("good_key"));
32111cb0ef41Sopenharmony_ci
# ---- 256-bit key ----
# $key is advanced a further 16 bytes (schedule+32) because the user key
# itself supplies round keys 0 and 1.
32121cb0ef41Sopenharmony_ci&set_label("14rounds",16);
32131cb0ef41Sopenharmony_ci	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
32141cb0ef41Sopenharmony_ci	&lea		($key,&DWP(16,$key));
32151cb0ef41Sopenharmony_ci	&cmp		("ebp",1<<28);
32161cb0ef41Sopenharmony_ci	&je		(&label("14rounds_alt"));
32171cb0ef41Sopenharmony_ci
# key_256a derives the next even round key in xmm0, key_256b the next odd
# round key in xmm2; each stores the other register before updating.
32181cb0ef41Sopenharmony_ci	&mov		($rounds,13);
32191cb0ef41Sopenharmony_ci	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
32201cb0ef41Sopenharmony_ci	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
32211cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
32221cb0ef41Sopenharmony_ci	&call		(&label("key_256a_cold"));
32231cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
32241cb0ef41Sopenharmony_ci	&call		(&label("key_256b"));
32251cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
32261cb0ef41Sopenharmony_ci	&call		(&label("key_256a"));
32271cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
32281cb0ef41Sopenharmony_ci	&call		(&label("key_256b"));
32291cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
32301cb0ef41Sopenharmony_ci	&call		(&label("key_256a"));
32311cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
32321cb0ef41Sopenharmony_ci	&call		(&label("key_256b"));
32331cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
32341cb0ef41Sopenharmony_ci	&call		(&label("key_256a"));
32351cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
32361cb0ef41Sopenharmony_ci	&call		(&label("key_256b"));
32371cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
32381cb0ef41Sopenharmony_ci	&call		(&label("key_256a"));
32391cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
32401cb0ef41Sopenharmony_ci	&call		(&label("key_256b"));
32411cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
32421cb0ef41Sopenharmony_ci	&call		(&label("key_256a"));
32431cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
32441cb0ef41Sopenharmony_ci	&call		(&label("key_256b"));
32451cb0ef41Sopenharmony_ci	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
32461cb0ef41Sopenharmony_ci	&call		(&label("key_256a"));
# $key is schedule+224 here: last round key at 0($key), round count at
# 16($key), i.e. byte offset 240.  The xor of eax below is repeated at
# good_key, so it has no additional effect.
32471cb0ef41Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
32481cb0ef41Sopenharmony_ci	&mov		(&DWP(16,$key),$rounds);
32491cb0ef41Sopenharmony_ci	&xor		("eax","eax");
32501cb0ef41Sopenharmony_ci
32511cb0ef41Sopenharmony_ci	&jmp	(&label("good_key"));
32521cb0ef41Sopenharmony_ci
# Local subroutine: store xmm2 (odd round key), advance $key, then update
# xmm0 exactly as key_128 does (running XOR plus broadcast dword 3).
32531cb0ef41Sopenharmony_ci&set_label("key_256a",16);
32541cb0ef41Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm2");
32551cb0ef41Sopenharmony_ci	&lea		($key,&DWP(16,$key));
32561cb0ef41Sopenharmony_ci&set_label("key_256a_cold");
32571cb0ef41Sopenharmony_ci	&shufps		("xmm4","xmm0",0b00010000);
32581cb0ef41Sopenharmony_ci	&xorps		("xmm0","xmm4");
32591cb0ef41Sopenharmony_ci	&shufps		("xmm4","xmm0",0b10001100);
32601cb0ef41Sopenharmony_ci	&xorps		("xmm0","xmm4");
32611cb0ef41Sopenharmony_ci	&shufps		("xmm1","xmm1",0b11111111);	# critical path
32621cb0ef41Sopenharmony_ci	&xorps		("xmm0","xmm1");
32631cb0ef41Sopenharmony_ci	&ret();
32641cb0ef41Sopenharmony_ci
# Local subroutine: store xmm0 (even round key), advance $key, then update
# xmm2 the same way but with dword 2 of the aeskeygenassist result.
32651cb0ef41Sopenharmony_ci&set_label("key_256b",16);
32661cb0ef41Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
32671cb0ef41Sopenharmony_ci	&lea		($key,&DWP(16,$key));
32681cb0ef41Sopenharmony_ci
32691cb0ef41Sopenharmony_ci	&shufps		("xmm4","xmm2",0b00010000);
32701cb0ef41Sopenharmony_ci	&xorps		("xmm2","xmm4");
32711cb0ef41Sopenharmony_ci	&shufps		("xmm4","xmm2",0b10001100);
32721cb0ef41Sopenharmony_ci	&xorps		("xmm2","xmm4");
32731cb0ef41Sopenharmony_ci	&shufps		("xmm1","xmm1",0b10101010);	# critical path
32741cb0ef41Sopenharmony_ci	&xorps		("xmm2","xmm1");
32751cb0ef41Sopenharmony_ci	&ret();
32761cb0ef41Sopenharmony_ci
# 256-bit "_alt" path: seven passes; each pass emits an even round key and,
# except on the last pass, the following odd round key.
32771cb0ef41Sopenharmony_ci&set_label("14rounds_alt",16);
32781cb0ef41Sopenharmony_ci	&movdqa		("xmm5",&QWP(0x00,"ebx"));
32791cb0ef41Sopenharmony_ci	&movdqa		("xmm4",&QWP(0x20,"ebx"));
32801cb0ef41Sopenharmony_ci	&mov		($rounds,7);
32811cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-32,$key),"xmm0");
32821cb0ef41Sopenharmony_ci	&movdqa		("xmm1","xmm2");
32831cb0ef41Sopenharmony_ci	&movdqu		(&QWP(-16,$key),"xmm2");
32841cb0ef41Sopenharmony_ci
32851cb0ef41Sopenharmony_ci&set_label("loop_key256");
32861cb0ef41Sopenharmony_ci	&pshufb		("xmm2","xmm5");
32871cb0ef41Sopenharmony_ci	&aesenclast	("xmm2","xmm4");
32881cb0ef41Sopenharmony_ci
32891cb0ef41Sopenharmony_ci	&movdqa		("xmm3","xmm0");
32901cb0ef41Sopenharmony_ci	&pslldq		("xmm0",4);
32911cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm0");
32921cb0ef41Sopenharmony_ci	&pslldq		("xmm0",4);
32931cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm0");
32941cb0ef41Sopenharmony_ci	&pslldq		("xmm0",4);
32951cb0ef41Sopenharmony_ci	&pxor		("xmm0","xmm3");
32961cb0ef41Sopenharmony_ci	&pslld		("xmm4",1);
32971cb0ef41Sopenharmony_ci
32981cb0ef41Sopenharmony_ci	&pxor		("xmm0","xmm2");
32991cb0ef41Sopenharmony_ci	&movdqu		(&QWP(0,$key),"xmm0");
33001cb0ef41Sopenharmony_ci
# The loop exits from the middle once the final even round key is stored.
33011cb0ef41Sopenharmony_ci	&dec		($rounds);
33021cb0ef41Sopenharmony_ci	&jz		(&label("done_key256"));
33031cb0ef41Sopenharmony_ci
# Odd round key: the top dword of xmm0 is broadcast and run through
# aesenclast with an all-zero round key (xmm3), i.e. no constant is added.
33041cb0ef41Sopenharmony_ci	&pshufd		("xmm2","xmm0",0xff);
33051cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm3");
33061cb0ef41Sopenharmony_ci	&aesenclast	("xmm2","xmm3");
33071cb0ef41Sopenharmony_ci
33081cb0ef41Sopenharmony_ci	&movdqa		("xmm3","xmm1");
33091cb0ef41Sopenharmony_ci	&pslldq		("xmm1",4);
33101cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm1");
33111cb0ef41Sopenharmony_ci	&pslldq		("xmm1",4);
33121cb0ef41Sopenharmony_ci	&pxor		("xmm3","xmm1");
33131cb0ef41Sopenharmony_ci	&pslldq		("xmm1",4);
33141cb0ef41Sopenharmony_ci	&pxor		("xmm1","xmm3");
33151cb0ef41Sopenharmony_ci
33161cb0ef41Sopenharmony_ci	&pxor		("xmm2","xmm1");
33171cb0ef41Sopenharmony_ci	&movdqu		(&QWP(16,$key),"xmm2");
33181cb0ef41Sopenharmony_ci	&lea		($key,&DWP(32,$key));
33191cb0ef41Sopenharmony_ci	&movdqa		("xmm1","xmm2");
33201cb0ef41Sopenharmony_ci	&jmp		(&label("loop_key256"));
33211cb0ef41Sopenharmony_ci
# $key is schedule+224 here, so 16($key) is byte offset 240.
33221cb0ef41Sopenharmony_ci&set_label("done_key256");
33231cb0ef41Sopenharmony_ci	&mov		($rounds,13);
33241cb0ef41Sopenharmony_ci	&mov		(&DWP(16,$key),$rounds);
33251cb0ef41Sopenharmony_ci
# Common success exit: wipe the xmm registers that held key material,
# return 0 in eax and restore the saved registers.
33261cb0ef41Sopenharmony_ci&set_label("good_key");
33271cb0ef41Sopenharmony_ci	&pxor	("xmm0","xmm0");
33281cb0ef41Sopenharmony_ci	&pxor	("xmm1","xmm1");
33291cb0ef41Sopenharmony_ci	&pxor	("xmm2","xmm2");
33301cb0ef41Sopenharmony_ci	&pxor	("xmm3","xmm3");
33311cb0ef41Sopenharmony_ci	&pxor	("xmm4","xmm4");
33321cb0ef41Sopenharmony_ci	&pxor	("xmm5","xmm5");
33331cb0ef41Sopenharmony_ci	&xor	("eax","eax");
33341cb0ef41Sopenharmony_ci	&pop	("ebx");
33351cb0ef41Sopenharmony_ci	&pop	("ebp");
33361cb0ef41Sopenharmony_ci	&ret	();
33371cb0ef41Sopenharmony_ci
# Error exit: NULL userKey or NULL key pointer; nothing was loaded yet.
33381cb0ef41Sopenharmony_ci&set_label("bad_pointer",4);
33391cb0ef41Sopenharmony_ci	&mov	("eax",-1);
33401cb0ef41Sopenharmony_ci	&pop	("ebx");
33411cb0ef41Sopenharmony_ci	&pop	("ebp");
33421cb0ef41Sopenharmony_ci	&ret	();
# Error exit: unsupported key size; xmm0 already holds the first 16 bytes
# of the user key, so it is cleared before returning.
33431cb0ef41Sopenharmony_ci&set_label("bad_keybits",4);
33441cb0ef41Sopenharmony_ci	&pxor	("xmm0","xmm0");
33451cb0ef41Sopenharmony_ci	&mov	("eax",-2);
33461cb0ef41Sopenharmony_ci	&pop	("ebx");
33471cb0ef41Sopenharmony_ci	&pop	("ebp");
33481cb0ef41Sopenharmony_ci	&ret	();
33491cb0ef41Sopenharmony_ci&function_end_B("_aesni_set_encrypt_key");
33501cb0ef41Sopenharmony_ci
33511cb0ef41Sopenharmony_ci# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
33521cb0ef41Sopenharmony_ci#                              AES_KEY *key)
# Public entry point: loads the three arguments into the registers that
# _aesni_set_encrypt_key expects ("eax", $rounds, $key) and returns its
# result in "eax" unchanged: 0 on success, -1 for a NULL pointer, -2 for
# an unsupported key size.
33531cb0ef41Sopenharmony_ci&function_begin_B("${PREFIX}_set_encrypt_key");
33541cb0ef41Sopenharmony_ci	&mov	("eax",&wparam(0));	# userKey
33551cb0ef41Sopenharmony_ci	&mov	($rounds,&wparam(1));	# bits
33561cb0ef41Sopenharmony_ci	&mov	($key,&wparam(2));	# key
33571cb0ef41Sopenharmony_ci	&call	("_aesni_set_encrypt_key");
33581cb0ef41Sopenharmony_ci	&ret	();
33591cb0ef41Sopenharmony_ci&function_end_B("${PREFIX}_set_encrypt_key");
33601cb0ef41Sopenharmony_ci
33611cb0ef41Sopenharmony_ci# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
33621cb0ef41Sopenharmony_ci#                              AES_KEY *key)
# Public entry point: builds the encryption key schedule with
# _aesni_set_encrypt_key, then converts it in place for decryption.  The
# round keys are reversed in order, and every round key except the first
# and the last is passed through aesimc.  Returns 0 in "eax" on success;
# on failure the helper's error code (-1 or -2) is returned and the
# conversion is skipped.
33631cb0ef41Sopenharmony_ci&function_begin_B("${PREFIX}_set_decrypt_key");
33641cb0ef41Sopenharmony_ci	&mov	("eax",&wparam(0));
33651cb0ef41Sopenharmony_ci	&mov	($rounds,&wparam(1));
33661cb0ef41Sopenharmony_ci	&mov	($key,&wparam(2));
33671cb0ef41Sopenharmony_ci	&call	("_aesni_set_encrypt_key");
# The helper advanced $key while writing the schedule, so reload it.
33681cb0ef41Sopenharmony_ci	&mov	($key,&wparam(2));
33691cb0ef41Sopenharmony_ci	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
33701cb0ef41Sopenharmony_ci	&test	("eax","eax");
33711cb0ef41Sopenharmony_ci	&jnz	(&label("dec_key_ret"));
# eax = key + 16 + 16*(rounds-1): address of the last round key.
33721cb0ef41Sopenharmony_ci	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
33731cb0ef41Sopenharmony_ci
# First and last round keys are exchanged as they are (no aesimc).
33741cb0ef41Sopenharmony_ci	&$movekey	("xmm0",&QWP(0,$key));	# just swap
33751cb0ef41Sopenharmony_ci	&$movekey	("xmm1",&QWP(0,"eax"));
33761cb0ef41Sopenharmony_ci	&$movekey	(&QWP(0,"eax"),"xmm0");
33771cb0ef41Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm1");
33781cb0ef41Sopenharmony_ci	&lea		($key,&DWP(16,$key));
33791cb0ef41Sopenharmony_ci	&lea		("eax",&DWP(-16,"eax"));
33801cb0ef41Sopenharmony_ci
# Walk inwards from both ends: $key moves up, eax moves down.  Both
# pointers are stepped before the stores, hence the 16/-16 displacements,
# which write each transformed key into the slot the other one came from.
33811cb0ef41Sopenharmony_ci&set_label("dec_key_inverse");
33821cb0ef41Sopenharmony_ci	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
33831cb0ef41Sopenharmony_ci	&$movekey	("xmm1",&QWP(0,"eax"));
33841cb0ef41Sopenharmony_ci	&aesimc		("xmm0","xmm0");
33851cb0ef41Sopenharmony_ci	&aesimc		("xmm1","xmm1");
33861cb0ef41Sopenharmony_ci	&lea		($key,&DWP(16,$key));
33871cb0ef41Sopenharmony_ci	&lea		("eax",&DWP(-16,"eax"));
33881cb0ef41Sopenharmony_ci	&$movekey	(&QWP(16,"eax"),"xmm0");
33891cb0ef41Sopenharmony_ci	&$movekey	(&QWP(-16,$key),"xmm1");
33901cb0ef41Sopenharmony_ci	&cmp		("eax",$key);
33911cb0ef41Sopenharmony_ci	&ja		(&label("dec_key_inverse"));
33921cb0ef41Sopenharmony_ci
# The pointers have met on the middle round key, which stays in place and
# only needs the aesimc transform.
33931cb0ef41Sopenharmony_ci	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
33941cb0ef41Sopenharmony_ci	&aesimc		("xmm0","xmm0");
33951cb0ef41Sopenharmony_ci	&$movekey	(&QWP(0,$key),"xmm0");
33961cb0ef41Sopenharmony_ci
# Wipe the registers that held round keys before returning.
33971cb0ef41Sopenharmony_ci	&pxor		("xmm0","xmm0");
33981cb0ef41Sopenharmony_ci	&pxor		("xmm1","xmm1");
33991cb0ef41Sopenharmony_ci	&xor		("eax","eax");		# return success
34001cb0ef41Sopenharmony_ci&set_label("dec_key_ret");
34011cb0ef41Sopenharmony_ci	&ret	();
34021cb0ef41Sopenharmony_ci&function_end_B("${PREFIX}_set_decrypt_key");
34031cb0ef41Sopenharmony_ci
# Constant table used by the "_alt" key-schedule paths of
# _aesni_set_encrypt_key, which address it relative to "ebx":
#   +0x00  pshufb mask placing source bytes 13,14,15,12 in every dword
#          (128- and 256-bit paths)
#   +0x10  pshufb mask placing source bytes 5,6,7,4 in every dword
#          (192-bit path)
#   +0x20  1 in every dword: initial aesenclast operand, doubled with pslld
#          after each use
#   +0x30  0x1b in every dword: reloaded by the 128-bit path for its last
#          two rounds
# The table is loaded with movdqa, hence the alignment requested below.
34041cb0ef41Sopenharmony_ci&set_label("key_const",64);
34051cb0ef41Sopenharmony_ci&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
34061cb0ef41Sopenharmony_ci&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
34071cb0ef41Sopenharmony_ci&data_word(1,1,1,1);
34081cb0ef41Sopenharmony_ci&data_word(0x1b,0x1b,0x1b,0x1b);
# Identification string embedded in the generated object.
34091cb0ef41Sopenharmony_ci&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
34101cb0ef41Sopenharmony_ci
# asm_finish is a perlasm framework routine not shown in this file;
# presumably it emits the accumulated assembly -- TODO confirm.
34111cb0ef41Sopenharmony_ci&asm_finish();
34121cb0ef41Sopenharmony_ci
# Fail the build if the generated output could not be written completely.
34131cb0ef41Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";
3414